tigerbeetle-node 0.5.2 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +97 -78
- package/dist/benchmark.js +96 -94
- package/dist/benchmark.js.map +1 -1
- package/dist/index.d.ts +82 -82
- package/dist/index.js +74 -93
- package/dist/index.js.map +1 -1
- package/dist/test.js +134 -111
- package/dist/test.js.map +1 -1
- package/package.json +3 -2
- package/scripts/download_node_headers.sh +3 -1
- package/src/benchmark.ts +114 -118
- package/src/index.ts +102 -111
- package/src/node.zig +55 -63
- package/src/test.ts +146 -125
- package/src/tigerbeetle/scripts/benchmark.bat +46 -0
- package/src/tigerbeetle/scripts/benchmark.sh +5 -0
- package/src/tigerbeetle/scripts/install_zig.bat +109 -109
- package/src/tigerbeetle/scripts/install_zig.sh +8 -4
- package/src/tigerbeetle/scripts/vopr.bat +47 -47
- package/src/tigerbeetle/scripts/vopr.sh +2 -2
- package/src/tigerbeetle/src/benchmark.zig +65 -102
- package/src/tigerbeetle/src/cli.zig +39 -18
- package/src/tigerbeetle/src/config.zig +44 -25
- package/src/tigerbeetle/src/demo.zig +2 -15
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +10 -10
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +5 -3
- package/src/tigerbeetle/src/{demo_04_create_transfers_two_phase_commit.zig → demo_04_create_pending_transfers.zig} +18 -12
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +37 -0
- package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +24 -0
- package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +1 -1
- package/src/tigerbeetle/src/io/benchmark.zig +24 -49
- package/src/tigerbeetle/src/io/darwin.zig +175 -44
- package/src/tigerbeetle/src/io/linux.zig +177 -72
- package/src/tigerbeetle/src/io/test.zig +61 -39
- package/src/tigerbeetle/src/io/windows.zig +1161 -0
- package/src/tigerbeetle/src/io.zig +2 -0
- package/src/tigerbeetle/src/main.zig +31 -10
- package/src/tigerbeetle/src/message_bus.zig +49 -61
- package/src/tigerbeetle/src/message_pool.zig +66 -57
- package/src/tigerbeetle/src/ring_buffer.zig +55 -3
- package/src/tigerbeetle/src/simulator.zig +108 -12
- package/src/tigerbeetle/src/state_machine.zig +1813 -816
- package/src/tigerbeetle/src/storage.zig +0 -230
- package/src/tigerbeetle/src/test/cluster.zig +168 -38
- package/src/tigerbeetle/src/test/message_bus.zig +4 -3
- package/src/tigerbeetle/src/test/network.zig +13 -16
- package/src/tigerbeetle/src/test/packet_simulator.zig +14 -1
- package/src/tigerbeetle/src/test/state_checker.zig +6 -3
- package/src/tigerbeetle/src/test/state_machine.zig +8 -7
- package/src/tigerbeetle/src/test/storage.zig +99 -40
- package/src/tigerbeetle/src/tigerbeetle.zig +108 -101
- package/src/tigerbeetle/src/time.zig +58 -11
- package/src/tigerbeetle/src/vsr/client.zig +18 -32
- package/src/tigerbeetle/src/vsr/clock.zig +1 -1
- package/src/tigerbeetle/src/vsr/journal.zig +1388 -464
- package/src/tigerbeetle/src/vsr/replica.zig +1340 -576
- package/src/tigerbeetle/src/vsr.zig +452 -40
- package/src/translate.zig +10 -0
- package/src/tigerbeetle/src/demo_05_accept_transfers.zig +0 -23
- package/src/tigerbeetle/src/demo_06_reject_transfers.zig +0 -17
- package/src/tigerbeetle/src/format_test.zig +0 -69
|
@@ -18,6 +18,24 @@ const log = std.log.scoped(.replica);
|
|
|
18
18
|
pub const Status = enum {
|
|
19
19
|
normal,
|
|
20
20
|
view_change,
|
|
21
|
+
// Recovery (for replica_count > 1):
|
|
22
|
+
//
|
|
23
|
+
// 1. At replica start: `status=recovering` and `journal.recovered=false`
|
|
24
|
+
// 2. Load the WAL. Mark questionable entries as faulty.
|
|
25
|
+
// 3. If the WAL has no entries (besides the initial commit), skip to step 5 with view 0.
|
|
26
|
+
// 4. Run VSR recovery protocol:
|
|
27
|
+
// a. Send a `recovery` message to every replica (except self).
|
|
28
|
+
// b. Wait for f+1 `recovery_response` messages from replicas in `normal` status.
|
|
29
|
+
// Each `recovery_response` includes the current view number.
|
|
30
|
+
// Each `recovery_response` must include a nonce matching the `recovery` message.
|
|
31
|
+
// c. Wait for a `recovery_response` from the leader of the highest known view.
|
|
32
|
+
// 5. Transition to `status=normal` with the discovered view number:
|
|
33
|
+
// * Set `op` to the highest op in the leader's recovery response.
|
|
34
|
+
// * Repair faulty messages.
|
|
35
|
+
// * Commit through to the discovered `commit_max`.
|
|
36
|
+
// * Set `state_machine.prepare_timestamp` to the current op's timestamp.
|
|
37
|
+
//
|
|
38
|
+
// TODO document snapshot recovery in this progression
|
|
21
39
|
recovering,
|
|
22
40
|
};
|
|
23
41
|
|
|
@@ -47,19 +65,24 @@ const ClientTableEntry = struct {
|
|
|
47
65
|
reply: *Message,
|
|
48
66
|
};
|
|
49
67
|
|
|
68
|
+
const Nonce = u128;
|
|
69
|
+
|
|
50
70
|
const Prepare = struct {
|
|
51
71
|
/// The current prepare message (used to cross-check prepare_ok messages, and for resending).
|
|
52
72
|
message: *Message,
|
|
53
73
|
|
|
54
74
|
/// Unique prepare_ok messages for the same view, op number and checksum from ALL replicas.
|
|
55
|
-
ok_from_all_replicas:
|
|
75
|
+
ok_from_all_replicas: QuorumCounter = quorum_counter_null,
|
|
56
76
|
|
|
57
77
|
/// Whether a quorum of prepare_ok messages has been received for this prepare.
|
|
58
78
|
ok_quorum_received: bool = false,
|
|
59
79
|
};
|
|
60
80
|
|
|
61
81
|
const QuorumMessages = [config.replicas_max]?*Message;
|
|
62
|
-
const
|
|
82
|
+
const quorum_messages_null = [_]?*Message{null} ** config.replicas_max;
|
|
83
|
+
|
|
84
|
+
const QuorumCounter = std.StaticBitSet(config.replicas_max);
|
|
85
|
+
const quorum_counter_null = QuorumCounter.initEmpty();
|
|
63
86
|
|
|
64
87
|
pub fn Replica(
|
|
65
88
|
comptime StateMachine: type,
|
|
@@ -111,12 +134,17 @@ pub fn Replica(
|
|
|
111
134
|
view_normal: u32,
|
|
112
135
|
|
|
113
136
|
/// The current status, either normal, view_change, or recovering:
|
|
114
|
-
|
|
115
|
-
status: Status = .normal,
|
|
137
|
+
status: Status = .recovering,
|
|
116
138
|
|
|
117
139
|
/// The op number assigned to the most recently prepared operation:
|
|
118
140
|
op: u64,
|
|
119
141
|
|
|
142
|
+
/// The op of the highest checkpointed message.
|
|
143
|
+
// TODO Update this to use LSM storage.
|
|
144
|
+
// TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
|
|
145
|
+
// TODO Enforce invariant op≥op_checkpoint.
|
|
146
|
+
op_checkpoint: u64 = 0,
|
|
147
|
+
|
|
120
148
|
/// The op number of the latest committed and executed operation (according to the replica):
|
|
121
149
|
/// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
|
|
122
150
|
commit_min: u64,
|
|
@@ -133,7 +161,10 @@ pub fn Replica(
|
|
|
133
161
|
|
|
134
162
|
/// The leader's pipeline of inflight prepares waiting to commit in FIFO order.
|
|
135
163
|
/// This allows us to pipeline without the complexity of out-of-order commits.
|
|
136
|
-
|
|
164
|
+
///
|
|
165
|
+
/// After a view change, the old leader's pipeline is left untouched so that it is able to
|
|
166
|
+
/// help the new leader repair, even in the face of local storage faults.
|
|
167
|
+
pipeline: RingBuffer(Prepare, config.pipeline_max) = .{},
|
|
137
168
|
|
|
138
169
|
/// In some cases, a replica may send a message to itself. We do not submit these messages
|
|
139
170
|
/// to the message bus but rather queue them here for guaranteed immediate delivery, which
|
|
@@ -141,13 +172,16 @@ pub fn Replica(
|
|
|
141
172
|
loopback_queue: ?*Message = null,
|
|
142
173
|
|
|
143
174
|
/// Unique start_view_change messages for the same view from OTHER replicas (excluding ourself).
|
|
144
|
-
start_view_change_from_other_replicas:
|
|
175
|
+
start_view_change_from_other_replicas: QuorumCounter = quorum_counter_null,
|
|
145
176
|
|
|
146
177
|
/// Unique do_view_change messages for the same view from ALL replicas (including ourself).
|
|
147
|
-
do_view_change_from_all_replicas: QuorumMessages =
|
|
178
|
+
do_view_change_from_all_replicas: QuorumMessages = quorum_messages_null,
|
|
148
179
|
|
|
149
180
|
/// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
|
|
150
|
-
nack_prepare_from_other_replicas:
|
|
181
|
+
nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,
|
|
182
|
+
|
|
183
|
+
/// Unique recovery_response messages from OTHER replicas (excluding ourself).
|
|
184
|
+
recovery_response_from_other_replicas: QuorumMessages = quorum_messages_null,
|
|
151
185
|
|
|
152
186
|
/// Whether a replica has received a quorum of start_view_change messages for the view change:
|
|
153
187
|
start_view_change_quorum: bool = false,
|
|
@@ -186,6 +220,12 @@ pub fn Replica(
|
|
|
186
220
|
/// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
|
|
187
221
|
repair_timeout: Timeout,
|
|
188
222
|
|
|
223
|
+
/// The number of ticks before attempting to send another set of `recovery` messages.
|
|
224
|
+
recovery_timeout: Timeout,
|
|
225
|
+
|
|
226
|
+
/// The nonce of the `recovery` messages.
|
|
227
|
+
recovery_nonce: Nonce,
|
|
228
|
+
|
|
189
229
|
/// Used to provide deterministic entropy to `choose_any_other_replica()`.
|
|
190
230
|
/// Incremented whenever `choose_any_other_replica()` is called.
|
|
191
231
|
choose_any_other_replica_ticks: u64 = 0,
|
|
@@ -242,25 +282,27 @@ pub fn Replica(
|
|
|
242
282
|
try client_table.ensureTotalCapacity(allocator, @intCast(u32, config.clients_max));
|
|
243
283
|
assert(client_table.capacity() >= config.clients_max);
|
|
244
284
|
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
285
|
+
const root_prepare = Header.root_prepare(cluster);
|
|
286
|
+
|
|
287
|
+
var clock = try Clock.init(
|
|
288
|
+
allocator,
|
|
289
|
+
replica_count,
|
|
290
|
+
replica,
|
|
291
|
+
time,
|
|
292
|
+
);
|
|
293
|
+
errdefer clock.deinit(allocator);
|
|
294
|
+
|
|
295
|
+
const journal = try Journal.init(allocator, storage, replica);
|
|
296
|
+
errdefer journal.deinit(allocator);
|
|
297
|
+
|
|
298
|
+
const recovery_nonce = blk: {
|
|
299
|
+
var nonce: [@sizeOf(Nonce)]u8 = undefined;
|
|
300
|
+
var hash = std.crypto.hash.Blake3.init(.{});
|
|
301
|
+
hash.update(std.mem.asBytes(&clock.monotonic()));
|
|
302
|
+
hash.update(&[_]u8{replica});
|
|
303
|
+
hash.final(&nonce);
|
|
304
|
+
break :blk @bitCast(Nonce, nonce);
|
|
261
305
|
};
|
|
262
|
-
init_prepare.set_checksum_body(&[0]u8{});
|
|
263
|
-
init_prepare.set_checksum();
|
|
264
306
|
|
|
265
307
|
var self = Self{
|
|
266
308
|
.cluster = cluster,
|
|
@@ -268,28 +310,16 @@ pub fn Replica(
|
|
|
268
310
|
.replica = replica,
|
|
269
311
|
.quorum_replication = quorum_replication,
|
|
270
312
|
.quorum_view_change = quorum_view_change,
|
|
271
|
-
.clock =
|
|
272
|
-
|
|
273
|
-
replica_count,
|
|
274
|
-
replica,
|
|
275
|
-
time,
|
|
276
|
-
),
|
|
277
|
-
.journal = try Journal.init(
|
|
278
|
-
allocator,
|
|
279
|
-
storage,
|
|
280
|
-
replica,
|
|
281
|
-
config.journal_size_max,
|
|
282
|
-
config.journal_headers_max,
|
|
283
|
-
&init_prepare,
|
|
284
|
-
),
|
|
313
|
+
.clock = clock,
|
|
314
|
+
.journal = journal,
|
|
285
315
|
.message_bus = message_bus,
|
|
286
316
|
.state_machine = state_machine,
|
|
287
317
|
.client_table = client_table,
|
|
288
|
-
.view =
|
|
289
|
-
.view_normal =
|
|
290
|
-
.op =
|
|
291
|
-
.commit_min =
|
|
292
|
-
.commit_max =
|
|
318
|
+
.view = root_prepare.view,
|
|
319
|
+
.view_normal = root_prepare.view,
|
|
320
|
+
.op = root_prepare.op,
|
|
321
|
+
.commit_min = root_prepare.commit,
|
|
322
|
+
.commit_max = root_prepare.commit,
|
|
293
323
|
.ping_timeout = Timeout{
|
|
294
324
|
.name = "ping_timeout",
|
|
295
325
|
.id = replica,
|
|
@@ -325,6 +355,12 @@ pub fn Replica(
|
|
|
325
355
|
.id = replica,
|
|
326
356
|
.after = 50,
|
|
327
357
|
},
|
|
358
|
+
.recovery_timeout = Timeout{
|
|
359
|
+
.name = "recovery_timeout",
|
|
360
|
+
.id = replica,
|
|
361
|
+
.after = 200,
|
|
362
|
+
},
|
|
363
|
+
.recovery_nonce = recovery_nonce,
|
|
328
364
|
.prng = std.rand.DefaultPrng.init(replica),
|
|
329
365
|
};
|
|
330
366
|
|
|
@@ -343,20 +379,7 @@ pub fn Replica(
|
|
|
343
379
|
config.clients_max,
|
|
344
380
|
});
|
|
345
381
|
|
|
346
|
-
|
|
347
|
-
// can race with tick()... before timeouts have been initialized:
|
|
348
|
-
assert(self.status == .normal);
|
|
349
|
-
if (self.leader()) {
|
|
350
|
-
log.debug("{}: init: leader", .{self.replica});
|
|
351
|
-
self.ping_timeout.start();
|
|
352
|
-
self.commit_timeout.start();
|
|
353
|
-
self.repair_timeout.start();
|
|
354
|
-
} else {
|
|
355
|
-
log.debug("{}: init: follower", .{self.replica});
|
|
356
|
-
self.ping_timeout.start();
|
|
357
|
-
self.normal_status_timeout.start();
|
|
358
|
-
self.repair_timeout.start();
|
|
359
|
-
}
|
|
382
|
+
assert(self.status == .recovering);
|
|
360
383
|
|
|
361
384
|
return self;
|
|
362
385
|
}
|
|
@@ -375,15 +398,7 @@ pub fn Replica(
|
|
|
375
398
|
self.client_table.deinit(allocator);
|
|
376
399
|
}
|
|
377
400
|
|
|
378
|
-
|
|
379
|
-
var it = self.pipeline.iterator();
|
|
380
|
-
while (it.next()) |prepare| {
|
|
381
|
-
self.message_bus.unref(prepare.message);
|
|
382
|
-
for (prepare.ok_from_all_replicas) |message| {
|
|
383
|
-
if (message) |m| self.message_bus.unref(m);
|
|
384
|
-
}
|
|
385
|
-
}
|
|
386
|
-
}
|
|
401
|
+
while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
|
|
387
402
|
|
|
388
403
|
if (self.loopback_queue) |loopback_message| {
|
|
389
404
|
assert(loopback_message.next == null);
|
|
@@ -391,13 +406,11 @@ pub fn Replica(
|
|
|
391
406
|
self.loopback_queue = null;
|
|
392
407
|
}
|
|
393
408
|
|
|
394
|
-
for (self.start_view_change_from_other_replicas) |message| {
|
|
395
|
-
if (message) |m| self.message_bus.unref(m);
|
|
396
|
-
}
|
|
397
409
|
for (self.do_view_change_from_all_replicas) |message| {
|
|
398
410
|
if (message) |m| self.message_bus.unref(m);
|
|
399
411
|
}
|
|
400
|
-
|
|
412
|
+
|
|
413
|
+
for (self.recovery_response_from_other_replicas) |message| {
|
|
401
414
|
if (message) |m| self.message_bus.unref(m);
|
|
402
415
|
}
|
|
403
416
|
}
|
|
@@ -414,12 +427,40 @@ pub fn Replica(
|
|
|
414
427
|
self.clock.tick();
|
|
415
428
|
|
|
416
429
|
if (!self.journal.recovered) {
|
|
417
|
-
self.journal.recover();
|
|
430
|
+
if (!self.journal.recovering) self.journal.recover();
|
|
418
431
|
return;
|
|
419
432
|
} else {
|
|
420
433
|
assert(!self.journal.recovering);
|
|
421
434
|
}
|
|
422
435
|
|
|
436
|
+
if (self.status == .recovering) {
|
|
437
|
+
if (self.recovery_timeout.ticking) {
|
|
438
|
+
// Continue running the VSR recovery protocol.
|
|
439
|
+
self.recovery_timeout.tick();
|
|
440
|
+
if (self.recovery_timeout.fired()) self.on_recovery_timeout();
|
|
441
|
+
} else if (self.journal.is_empty()) {
|
|
442
|
+
// The data file is brand new — no messages have ever been written.
|
|
443
|
+
// Transition to normal status; no need to run the VSR recovery protocol.
|
|
444
|
+
assert(self.journal.faulty.count == 0);
|
|
445
|
+
self.transition_to_normal_from_recovering_status(0);
|
|
446
|
+
assert(self.status == .normal);
|
|
447
|
+
} else if (self.replica_count == 1) {
|
|
448
|
+
// A cluster-of-one does not run the VSR recovery protocol.
|
|
449
|
+
if (self.journal.faulty.count != 0) @panic("journal is corrupt");
|
|
450
|
+
if (self.committing) return;
|
|
451
|
+
assert(self.op == 0);
|
|
452
|
+
self.op = self.journal.op_maximum();
|
|
453
|
+
self.commit_ops(self.op);
|
|
454
|
+
// The recovering→normal transition is deferred until all ops are committed.
|
|
455
|
+
} else {
|
|
456
|
+
// The journal just finished recovery.
|
|
457
|
+
// Now try to learn the current view via the VSR recovery protocol.
|
|
458
|
+
self.recovery_timeout.start();
|
|
459
|
+
self.recover();
|
|
460
|
+
}
|
|
461
|
+
return;
|
|
462
|
+
}
|
|
463
|
+
|
|
423
464
|
self.ping_timeout.tick();
|
|
424
465
|
self.prepare_timeout.tick();
|
|
425
466
|
self.commit_timeout.tick();
|
|
@@ -443,11 +484,12 @@ pub fn Replica(
|
|
|
443
484
|
/// Called by the MessageBus to deliver a message to the replica.
|
|
444
485
|
pub fn on_message(self: *Self, message: *Message) void {
|
|
445
486
|
assert(self.loopback_queue == null);
|
|
487
|
+
assert(message.references > 0);
|
|
446
488
|
|
|
447
|
-
log.debug("{}: on_message: view={} status={
|
|
489
|
+
log.debug("{}: on_message: view={} status={} {}", .{
|
|
448
490
|
self.replica,
|
|
449
491
|
self.view,
|
|
450
|
-
|
|
492
|
+
self.status,
|
|
451
493
|
message.header,
|
|
452
494
|
});
|
|
453
495
|
|
|
@@ -469,7 +511,6 @@ pub fn Replica(
|
|
|
469
511
|
}
|
|
470
512
|
|
|
471
513
|
if (!self.journal.recovered) {
|
|
472
|
-
self.journal.recover();
|
|
473
514
|
log.debug("{}: on_message: waiting for journal to recover", .{self.replica});
|
|
474
515
|
return;
|
|
475
516
|
} else {
|
|
@@ -488,7 +529,7 @@ pub fn Replica(
|
|
|
488
529
|
.do_view_change => self.on_do_view_change(message),
|
|
489
530
|
.start_view => self.on_start_view(message),
|
|
490
531
|
.recovery => self.on_recovery(message),
|
|
491
|
-
.recovery_response =>
|
|
532
|
+
.recovery_response => self.on_recovery_response(message),
|
|
492
533
|
.request_start_view => self.on_request_start_view(message),
|
|
493
534
|
.request_prepare => self.on_request_prepare(message),
|
|
494
535
|
.request_headers => self.on_request_headers(message),
|
|
@@ -548,7 +589,7 @@ pub fn Replica(
|
|
|
548
589
|
} else {
|
|
549
590
|
// Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
|
|
550
591
|
pong.op = message.header.op;
|
|
551
|
-
pong.
|
|
592
|
+
pong.timestamp = @bitCast(u64, self.clock.realtime());
|
|
552
593
|
self.send_header_to_replica(message.header.replica, pong);
|
|
553
594
|
}
|
|
554
595
|
}
|
|
@@ -558,7 +599,7 @@ pub fn Replica(
|
|
|
558
599
|
if (message.header.replica == self.replica) return;
|
|
559
600
|
|
|
560
601
|
const m0 = message.header.op;
|
|
561
|
-
const t1 = @bitCast(i64, message.header.
|
|
602
|
+
const t1 = @bitCast(i64, message.header.timestamp);
|
|
562
603
|
const m2 = self.clock.monotonic();
|
|
563
604
|
|
|
564
605
|
self.clock.learn(message.header.replica, m0, t1, m2);
|
|
@@ -566,9 +607,9 @@ pub fn Replica(
|
|
|
566
607
|
|
|
567
608
|
/// The primary advances op-number, adds the request to the end of the log, and updates the
|
|
568
609
|
/// information for this client in the client-table to contain the new request number, s.
|
|
569
|
-
/// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
|
|
570
|
-
/// view-number, m is the message it received from the client, n is the op-number
|
|
571
|
-
/// the request, and k is the commit-number.
|
|
610
|
+
/// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
|
|
611
|
+
/// current view-number, m is the message it received from the client, n is the op-number
|
|
612
|
+
/// it assigned to the request, and k is the commit-number.
|
|
572
613
|
fn on_request(self: *Self, message: *Message) void {
|
|
573
614
|
if (self.ignore_request_message(message)) return;
|
|
574
615
|
|
|
@@ -587,19 +628,30 @@ pub fn Replica(
|
|
|
587
628
|
|
|
588
629
|
log.debug("{}: on_request: request {}", .{ self.replica, message.header.checksum });
|
|
589
630
|
|
|
590
|
-
|
|
591
|
-
|
|
631
|
+
// Guard against the wall clock going backwards by taking the max with timestamps issued:
|
|
632
|
+
self.state_machine.prepare_timestamp = std.math.max(
|
|
633
|
+
// The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
|
|
634
|
+
// may be our first prepare as a recently elected leader:
|
|
635
|
+
std.math.max(
|
|
636
|
+
self.state_machine.prepare_timestamp,
|
|
637
|
+
self.state_machine.commit_timestamp,
|
|
638
|
+
) + 1,
|
|
639
|
+
@intCast(u64, realtime),
|
|
640
|
+
);
|
|
641
|
+
assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);
|
|
642
|
+
|
|
643
|
+
const prepare_timestamp = self.state_machine.prepare(
|
|
592
644
|
message.header.operation.cast(StateMachine),
|
|
593
645
|
message.body(),
|
|
594
646
|
);
|
|
595
647
|
|
|
596
|
-
|
|
648
|
+
const latest_entry = self.journal.header_with_op(self.op).?;
|
|
597
649
|
message.header.parent = latest_entry.checksum;
|
|
598
650
|
message.header.context = message.header.checksum;
|
|
599
651
|
message.header.view = self.view;
|
|
600
652
|
message.header.op = self.op + 1;
|
|
601
653
|
message.header.commit = self.commit_max;
|
|
602
|
-
message.header.
|
|
654
|
+
message.header.timestamp = prepare_timestamp;
|
|
603
655
|
message.header.replica = self.replica;
|
|
604
656
|
message.header.command = .prepare;
|
|
605
657
|
|
|
@@ -608,7 +660,7 @@ pub fn Replica(
|
|
|
608
660
|
|
|
609
661
|
log.debug("{}: on_request: prepare {}", .{ self.replica, message.header.checksum });
|
|
610
662
|
|
|
611
|
-
self.pipeline.
|
|
663
|
+
self.pipeline.push_assume_capacity(.{ .message = message.ref() });
|
|
612
664
|
assert(self.pipeline.count >= 1);
|
|
613
665
|
|
|
614
666
|
if (self.pipeline.count == 1) {
|
|
@@ -618,6 +670,8 @@ pub fn Replica(
|
|
|
618
670
|
} else {
|
|
619
671
|
// Do not restart the prepare timeout as it is already ticking for another prepare.
|
|
620
672
|
assert(self.prepare_timeout.ticking);
|
|
673
|
+
const previous = self.pipeline.get_ptr(self.pipeline.count - 2).?;
|
|
674
|
+
assert(previous.message.header.checksum == message.header.parent);
|
|
621
675
|
}
|
|
622
676
|
|
|
623
677
|
self.on_prepare(message);
|
|
@@ -631,22 +685,23 @@ pub fn Replica(
|
|
|
631
685
|
///
|
|
632
686
|
/// The leader starts by sending a prepare message to itself.
|
|
633
687
|
///
|
|
634
|
-
/// Each replica (including the leader) then forwards this prepare message to the next
|
|
635
|
-
/// in the configuration, in parallel to writing to its own journal, closing the
|
|
636
|
-
/// the next replica is back to the leader, in which case the replica does not
|
|
688
|
+
/// Each replica (including the leader) then forwards this prepare message to the next
|
|
689
|
+
/// replica in the configuration, in parallel to writing to its own journal, closing the
|
|
690
|
+
/// circle until the next replica is back to the leader, in which case the replica does not
|
|
691
|
+
/// forward.
|
|
637
692
|
///
|
|
638
693
|
/// This keeps the leader's outgoing bandwidth limited (one-for-one) to incoming bandwidth,
|
|
639
|
-
/// since the leader need only replicate to the next replica. Otherwise, the leader would
|
|
640
|
-
/// to replicate to multiple followers, dividing available bandwidth.
|
|
694
|
+
/// since the leader need only replicate to the next replica. Otherwise, the leader would
|
|
695
|
+
/// need to replicate to multiple followers, dividing available bandwidth.
|
|
641
696
|
///
|
|
642
|
-
/// This does not impact latency, since with Flexible Paxos we need only one remote
|
|
643
|
-
/// It is ideal if this synchronous replication to one remote replica is to the
|
|
644
|
-
/// since that is the replica next in line to be leader, which will need to
|
|
645
|
-
/// it can start the next view.
|
|
697
|
+
/// This does not impact latency, since with Flexible Paxos we need only one remote
|
|
698
|
+
/// prepare_ok. It is ideal if this synchronous replication to one remote replica is to the
|
|
699
|
+
/// next replica, since that is the replica next in line to be leader, which will need to
|
|
700
|
+
/// be up-to-date before it can start the next view.
|
|
646
701
|
///
|
|
647
|
-
/// At the same time, asynchronous replication keeps going, so that if our local disk is
|
|
648
|
-
/// then any latency spike will be masked by more remote prepare_ok messages as they
|
|
649
|
-
/// This gives automatic tail latency tolerance for storage latency spikes.
|
|
702
|
+
/// At the same time, asynchronous replication keeps going, so that if our local disk is
|
|
703
|
+
/// slow, then any latency spike will be masked by more remote prepare_ok messages as they
|
|
704
|
+
/// come in. This gives automatic tail latency tolerance for storage latency spikes.
|
|
650
705
|
///
|
|
651
706
|
/// The remaining problem then is tail latency tolerance for network latency spikes.
|
|
652
707
|
/// If the next replica is down or partitioned, then the leader's prepare timeout will fire,
|
|
@@ -675,12 +730,26 @@ pub fn Replica(
|
|
|
675
730
|
return;
|
|
676
731
|
}
|
|
677
732
|
|
|
733
|
+
// Verify that the new request will fit in the WAL.
|
|
734
|
+
if (message.header.op >= self.op_checkpoint + config.journal_slot_count) {
|
|
735
|
+
log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
|
|
736
|
+
self.replica,
|
|
737
|
+
message.header.op,
|
|
738
|
+
self.op_checkpoint,
|
|
739
|
+
});
|
|
740
|
+
// When we are the leader, `on_request` enforces this invariant.
|
|
741
|
+
assert(self.follower());
|
|
742
|
+
return;
|
|
743
|
+
}
|
|
744
|
+
|
|
678
745
|
assert(self.status == .normal);
|
|
679
746
|
assert(message.header.view == self.view);
|
|
680
747
|
assert(self.leader() or self.follower());
|
|
681
748
|
assert(message.header.replica == self.leader_index(message.header.view));
|
|
749
|
+
assert(message.header.op > self.op_checkpoint);
|
|
682
750
|
assert(message.header.op > self.op);
|
|
683
751
|
assert(message.header.op > self.commit_min);
|
|
752
|
+
assert(message.header.op < self.op_checkpoint + config.journal_slot_count);
|
|
684
753
|
|
|
685
754
|
if (self.follower()) self.normal_status_timeout.reset();
|
|
686
755
|
|
|
@@ -691,7 +760,7 @@ pub fn Replica(
|
|
|
691
760
|
|
|
692
761
|
if (self.journal.previous_entry(message.header)) |previous| {
|
|
693
762
|
// Any previous entry may be a whole journal's worth of ops behind due to wrapping.
|
|
694
|
-
// We therefore do not do any further op
|
|
763
|
+
// We therefore do not do any further op or checksum assertions beyond this:
|
|
695
764
|
self.panic_if_hash_chain_would_break_in_the_same_view(previous, message.header);
|
|
696
765
|
}
|
|
697
766
|
|
|
@@ -706,7 +775,7 @@ pub fn Replica(
|
|
|
706
775
|
});
|
|
707
776
|
assert(message.header.op == self.op + 1);
|
|
708
777
|
self.op = message.header.op;
|
|
709
|
-
self.journal.
|
|
778
|
+
self.journal.set_header_as_dirty(message.header);
|
|
710
779
|
|
|
711
780
|
self.replicate(message);
|
|
712
781
|
self.append(message);
|
|
@@ -735,7 +804,7 @@ pub fn Replica(
|
|
|
735
804
|
// Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
|
|
736
805
|
const threshold = self.quorum_replication;
|
|
737
806
|
|
|
738
|
-
const count = self.
|
|
807
|
+
const count = self.count_message_and_receive_quorum_exactly_once(
|
|
739
808
|
&prepare.ok_from_all_replicas,
|
|
740
809
|
message,
|
|
741
810
|
threshold,
|
|
@@ -786,7 +855,7 @@ pub fn Replica(
|
|
|
786
855
|
assert(message.header.replica == self.leader_index(message.header.view));
|
|
787
856
|
|
|
788
857
|
// We may not always have the latest commit entry but if we do our checksum must match:
|
|
789
|
-
if (self.journal.
|
|
858
|
+
if (self.journal.header_with_op(message.header.commit)) |commit_entry| {
|
|
790
859
|
if (commit_entry.checksum == message.header.context) {
|
|
791
860
|
log.debug("{}: on_commit: checksum verified", .{self.replica});
|
|
792
861
|
} else if (self.valid_hash_chain("on_commit")) {
|
|
@@ -798,7 +867,6 @@ pub fn Replica(
|
|
|
798
867
|
}
|
|
799
868
|
|
|
800
869
|
self.normal_status_timeout.reset();
|
|
801
|
-
|
|
802
870
|
self.commit_ops(message.header.commit);
|
|
803
871
|
}
|
|
804
872
|
|
|
@@ -878,32 +946,18 @@ pub fn Replica(
|
|
|
878
946
|
assert(self.status == .view_change);
|
|
879
947
|
assert(message.header.view == self.view);
|
|
880
948
|
|
|
881
|
-
if (self.leader_index(self.view) == self.replica) {
|
|
882
|
-
// If we are the leader of the new view, then wait until we have a message to send a
|
|
883
|
-
// do_view_change message to ourself. The on_do_view_change() handler will panic if
|
|
884
|
-
// we received a start_view_change quorum without a do_view_change to ourself.
|
|
885
|
-
if (self.message_bus.get_message()) |available| {
|
|
886
|
-
self.message_bus.unref(available);
|
|
887
|
-
} else {
|
|
888
|
-
log.err("{}: on_start_view_change: waiting for message for do_view_change", .{
|
|
889
|
-
self.replica,
|
|
890
|
-
});
|
|
891
|
-
return;
|
|
892
|
-
}
|
|
893
|
-
}
|
|
894
|
-
|
|
895
949
|
// Wait until we have `f` messages (excluding ourself) for quorum:
|
|
896
950
|
assert(self.replica_count > 1);
|
|
897
951
|
const threshold = self.quorum_view_change - 1;
|
|
898
952
|
|
|
899
|
-
const count = self.
|
|
953
|
+
const count = self.count_message_and_receive_quorum_exactly_once(
|
|
900
954
|
&self.start_view_change_from_other_replicas,
|
|
901
955
|
message,
|
|
902
956
|
threshold,
|
|
903
957
|
) orelse return;
|
|
904
958
|
|
|
905
959
|
assert(count == threshold);
|
|
906
|
-
assert(self.start_view_change_from_other_replicas
|
|
960
|
+
assert(!self.start_view_change_from_other_replicas.isSet(self.replica));
|
|
907
961
|
log.debug("{}: on_start_view_change: view={} quorum received", .{
|
|
908
962
|
self.replica,
|
|
909
963
|
self.view,
|
|
@@ -956,7 +1010,7 @@ pub fn Replica(
|
|
|
956
1010
|
assert(self.replica_count > 1);
|
|
957
1011
|
const threshold = self.quorum_view_change;
|
|
958
1012
|
|
|
959
|
-
const count = self.
|
|
1013
|
+
const count = self.reference_message_and_receive_quorum_exactly_once(
|
|
960
1014
|
&self.do_view_change_from_all_replicas,
|
|
961
1015
|
message,
|
|
962
1016
|
threshold,
|
|
@@ -971,7 +1025,7 @@ pub fn Replica(
|
|
|
971
1025
|
|
|
972
1026
|
var v: ?u32 = null;
|
|
973
1027
|
var k: ?u64 = null;
|
|
974
|
-
var latest = Header.reserved();
|
|
1028
|
+
var latest = Header.reserved(self.cluster, 0);
|
|
975
1029
|
|
|
976
1030
|
for (self.do_view_change_from_all_replicas) |received, replica| {
|
|
977
1031
|
if (received) |m| {
|
|
@@ -982,10 +1036,10 @@ pub fn Replica(
|
|
|
982
1036
|
|
|
983
1037
|
// The latest normal view experienced by this replica:
|
|
984
1038
|
// This may be higher than the view in any of the prepare headers.
|
|
985
|
-
var replica_view_normal = @intCast(u32, m.header.
|
|
1039
|
+
var replica_view_normal = @intCast(u32, m.header.timestamp);
|
|
986
1040
|
assert(replica_view_normal < m.header.view);
|
|
987
1041
|
|
|
988
|
-
var replica_latest = Header.reserved();
|
|
1042
|
+
var replica_latest = Header.reserved(self.cluster, 0);
|
|
989
1043
|
set_latest_op(self.message_body_as_headers(m), &replica_latest);
|
|
990
1044
|
assert(replica_latest.op == m.header.op);
|
|
991
1045
|
|
|
@@ -1025,7 +1079,7 @@ pub fn Replica(
|
|
|
1025
1079
|
}
|
|
1026
1080
|
|
|
1027
1081
|
// Verify that the repairs above have not replaced or advanced the latest op:
|
|
1028
|
-
assert(self.journal.
|
|
1082
|
+
assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
|
|
1029
1083
|
|
|
1030
1084
|
assert(self.start_view_change_quorum);
|
|
1031
1085
|
assert(!self.do_view_change_quorum);
|
|
@@ -1033,7 +1087,11 @@ pub fn Replica(
|
|
|
1033
1087
|
|
|
1034
1088
|
self.discard_uncommitted_headers();
|
|
1035
1089
|
assert(self.op >= self.commit_max);
|
|
1036
|
-
|
|
1090
|
+
|
|
1091
|
+
const prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
|
|
1092
|
+
if (self.state_machine.prepare_timestamp < prepare_timestamp) {
|
|
1093
|
+
self.state_machine.prepare_timestamp = prepare_timestamp;
|
|
1094
|
+
}
|
|
1037
1095
|
|
|
1038
1096
|
// Start repairs according to the CTRL protocol:
|
|
1039
1097
|
assert(!self.repair_timeout.ticking);
|
|
@@ -1061,7 +1119,7 @@ pub fn Replica(
|
|
|
1061
1119
|
assert(self.status == .view_change);
|
|
1062
1120
|
assert(message.header.view == self.view);
|
|
1063
1121
|
|
|
1064
|
-
var latest = Header.reserved();
|
|
1122
|
+
var latest = Header.reserved(self.cluster, 0);
|
|
1065
1123
|
set_latest_op(self.message_body_as_headers(message), &latest);
|
|
1066
1124
|
assert(latest.op == message.header.op);
|
|
1067
1125
|
|
|
@@ -1073,10 +1131,10 @@ pub fn Replica(
|
|
|
1073
1131
|
}
|
|
1074
1132
|
|
|
1075
1133
|
// Verify that the repairs above have not replaced or advanced the latest op:
|
|
1076
|
-
assert(self.journal.
|
|
1134
|
+
assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
|
|
1077
1135
|
|
|
1078
1136
|
if (self.status == .view_change) {
|
|
1079
|
-
self.
|
|
1137
|
+
self.transition_to_normal_from_view_change_status(message.header.view);
|
|
1080
1138
|
self.send_prepare_oks_after_view_change();
|
|
1081
1139
|
}
|
|
1082
1140
|
|
|
@@ -1097,12 +1155,7 @@ pub fn Replica(
|
|
|
1097
1155
|
assert(message.header.replica != self.replica);
|
|
1098
1156
|
assert(self.leader());
|
|
1099
1157
|
|
|
1100
|
-
const start_view = self.create_view_change_message(.start_view)
|
|
1101
|
-
log.err("{}: on_request_start_view: dropping start_view, no message available", .{
|
|
1102
|
-
self.replica,
|
|
1103
|
-
});
|
|
1104
|
-
return;
|
|
1105
|
-
};
|
|
1158
|
+
const start_view = self.create_view_change_message(.start_view);
|
|
1106
1159
|
defer self.message_bus.unref(start_view);
|
|
1107
1160
|
|
|
1108
1161
|
assert(start_view.references == 1);
|
|
@@ -1114,8 +1167,9 @@ pub fn Replica(
|
|
|
1114
1167
|
self.send_message_to_replica(message.header.replica, start_view);
|
|
1115
1168
|
}
|
|
1116
1169
|
|
|
1117
|
-
/// TODO This is a work in progress (out of scope for the bounty)
|
|
1118
1170
|
fn on_recovery(self: *Self, message: *const Message) void {
|
|
1171
|
+
assert(self.replica_count > 1);
|
|
1172
|
+
|
|
1119
1173
|
if (self.status != .normal) {
|
|
1120
1174
|
log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
|
|
1121
1175
|
return;
|
|
@@ -1126,40 +1180,31 @@ pub fn Replica(
|
|
|
1126
1180
|
return;
|
|
1127
1181
|
}
|
|
1128
1182
|
|
|
1129
|
-
const response = self.message_bus.get_message()
|
|
1130
|
-
log.err("{}: on_recovery: ignoring (waiting for message)", .{self.replica});
|
|
1131
|
-
return;
|
|
1132
|
-
};
|
|
1183
|
+
const response = self.message_bus.get_message();
|
|
1133
1184
|
defer self.message_bus.unref(response);
|
|
1134
1185
|
|
|
1186
|
+
log.debug("{}: on_recovery: view={} op={} commit={} nonce={}", .{
|
|
1187
|
+
self.replica,
|
|
1188
|
+
self.view,
|
|
1189
|
+
self.op,
|
|
1190
|
+
self.commit_max,
|
|
1191
|
+
message.header.context,
|
|
1192
|
+
});
|
|
1193
|
+
|
|
1135
1194
|
response.header.* = .{
|
|
1136
1195
|
.command = .recovery_response,
|
|
1137
1196
|
.cluster = self.cluster,
|
|
1138
|
-
.context = message.header.context,
|
|
1197
|
+
.context = message.header.context, // Echo the request's nonce.
|
|
1139
1198
|
.replica = self.replica,
|
|
1140
1199
|
.view = self.view,
|
|
1141
1200
|
.op = self.op,
|
|
1142
1201
|
.commit = self.commit_max,
|
|
1143
1202
|
};
|
|
1144
1203
|
|
|
1145
|
-
const count_max = 8; // The number of prepare headers to include in the body.
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
1 + count_max,
|
|
1150
|
-
);
|
|
1151
|
-
assert(size_max > @sizeOf(Header));
|
|
1152
|
-
|
|
1153
|
-
const count = self.journal.copy_latest_headers_between(
|
|
1154
|
-
0,
|
|
1155
|
-
self.op,
|
|
1156
|
-
std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
|
|
1157
|
-
);
|
|
1158
|
-
|
|
1159
|
-
// We expect that self.op always exists.
|
|
1160
|
-
assert(count > 0);
|
|
1161
|
-
|
|
1162
|
-
response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
|
|
1204
|
+
const count_max = 8; // The maximum number of prepare headers to include in the body.
|
|
1205
|
+
const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, response);
|
|
1206
|
+
assert(count > 0); // We expect that self.op always exists.
|
|
1207
|
+
assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
|
|
1163
1208
|
|
|
1164
1209
|
response.header.set_checksum_body(response.body());
|
|
1165
1210
|
response.header.set_checksum();
|
|
@@ -1171,68 +1216,339 @@ pub fn Replica(
|
|
|
1171
1216
|
self.send_message_to_replica(message.header.replica, response);
|
|
1172
1217
|
}
|
|
1173
1218
|
|
|
1174
|
-
/// TODO This is a work in progress (out of scope for the bounty)
|
|
1175
1219
|
fn on_recovery_response(self: *Self, message: *Message) void {
|
|
1176
|
-
|
|
1177
|
-
|
|
1220
|
+
assert(self.replica_count > 1);
|
|
1221
|
+
|
|
1222
|
+
if (self.status != .recovering) {
|
|
1223
|
+
log.debug("{}: on_recovery_response: ignoring ({})", .{
|
|
1224
|
+
self.replica,
|
|
1225
|
+
self.status,
|
|
1226
|
+
});
|
|
1227
|
+
return;
|
|
1228
|
+
}
|
|
1229
|
+
|
|
1230
|
+
if (message.header.replica == self.replica) {
|
|
1231
|
+
log.warn("{}: on_recovery_response: ignoring (self)", .{self.replica});
|
|
1232
|
+
return;
|
|
1233
|
+
}
|
|
1234
|
+
|
|
1235
|
+
if (message.header.context != self.recovery_nonce) {
|
|
1236
|
+
log.warn("{}: on_recovery_response: ignoring (different nonce)", .{self.replica});
|
|
1237
|
+
return;
|
|
1238
|
+
}
|
|
1239
|
+
|
|
1240
|
+
// Recovery messages with our nonce are not sent until after the journal is recovered.
|
|
1241
|
+
assert(self.journal.recovered);
|
|
1242
|
+
|
|
1243
|
+
var responses: *QuorumMessages = &self.recovery_response_from_other_replicas;
|
|
1244
|
+
if (responses[message.header.replica]) |existing| {
|
|
1245
|
+
assert(message.header.replica == existing.header.replica);
|
|
1246
|
+
|
|
1247
|
+
if (message.header.checksum == existing.header.checksum) {
|
|
1248
|
+
// The response was replayed by the network; ignore it.
|
|
1249
|
+
log.debug("{}: on_recovery_response: ignoring (duplicate message)", .{
|
|
1250
|
+
self.replica,
|
|
1251
|
+
});
|
|
1252
|
+
return;
|
|
1253
|
+
}
|
|
1254
|
+
|
|
1255
|
+
// We received a second (distinct) response from a replica. Possible causes:
|
|
1256
|
+
// * We retried the `recovery` message, because we had not yet received a quorum.
|
|
1257
|
+
// * The `recovery` message was duplicated/misdirected by the network, and the
|
|
1258
|
+
// receiver's state changed in the mean time.
|
|
1259
|
+
|
|
1260
|
+
log.debug(
|
|
1261
|
+
"{}: on_recovery_response: replica={} view={}..{} op={}..{} commit={}..{}",
|
|
1262
|
+
.{
|
|
1263
|
+
self.replica,
|
|
1264
|
+
existing.header.replica,
|
|
1265
|
+
existing.header.view,
|
|
1266
|
+
message.header.view,
|
|
1267
|
+
existing.header.op,
|
|
1268
|
+
message.header.op,
|
|
1269
|
+
existing.header.commit,
|
|
1270
|
+
message.header.commit,
|
|
1271
|
+
},
|
|
1272
|
+
);
|
|
1273
|
+
|
|
1274
|
+
if (message.header.view < existing.header.view or
|
|
1275
|
+
(message.header.view == existing.header.view and
|
|
1276
|
+
message.header.op < existing.header.op) or
|
|
1277
|
+
(message.header.view == existing.header.view and
|
|
1278
|
+
message.header.op == existing.header.op and
|
|
1279
|
+
message.header.commit < existing.header.commit))
|
|
1280
|
+
{
|
|
1281
|
+
// The second message is older than the first one (reordered packets).
|
|
1282
|
+
log.debug("{}: on_recovery_response: ignoring (older)", .{self.replica});
|
|
1283
|
+
return;
|
|
1284
|
+
}
|
|
1285
|
+
|
|
1286
|
+
// The second message is newer than the first one.
|
|
1287
|
+
assert(message.header.view >= existing.header.view);
|
|
1288
|
+
// The op number may regress if an uncommitted op was discarded in a higher view.
|
|
1289
|
+
assert(message.header.op >= existing.header.op or
|
|
1290
|
+
message.header.view > existing.header.view);
|
|
1291
|
+
assert(message.header.commit >= existing.header.commit);
|
|
1292
|
+
|
|
1293
|
+
self.message_bus.unref(existing);
|
|
1294
|
+
responses[message.header.replica] = null;
|
|
1295
|
+
} else {
|
|
1296
|
+
log.debug(
|
|
1297
|
+
"{}: on_recovery_response: replica={} view={} op={} commit={}",
|
|
1298
|
+
.{
|
|
1299
|
+
self.replica,
|
|
1300
|
+
message.header.replica,
|
|
1301
|
+
message.header.view,
|
|
1302
|
+
message.header.op,
|
|
1303
|
+
message.header.commit,
|
|
1304
|
+
},
|
|
1305
|
+
);
|
|
1306
|
+
}
|
|
1307
|
+
|
|
1308
|
+
assert(responses[message.header.replica] == null);
|
|
1309
|
+
responses[message.header.replica] = message.ref();
|
|
1310
|
+
|
|
1311
|
+
// Wait until we have:
|
|
1312
|
+
// * at least `f + 1` messages for quorum (not including ourself), and
|
|
1313
|
+
// * a response from the leader of the highest discovered view.
|
|
1314
|
+
const count = self.count_quorum(responses, .recovery_response, self.recovery_nonce);
|
|
1315
|
+
assert(count <= self.replica_count - 1);
|
|
1316
|
+
|
|
1317
|
+
const threshold = self.quorum_view_change;
|
|
1318
|
+
if (count < threshold) {
|
|
1319
|
+
log.debug("{}: on_recovery_response: waiting for quorum ({}/{})", .{
|
|
1320
|
+
self.replica,
|
|
1321
|
+
count,
|
|
1322
|
+
threshold,
|
|
1323
|
+
});
|
|
1324
|
+
return;
|
|
1325
|
+
}
|
|
1326
|
+
|
|
1327
|
+
const view = blk: { // The latest known view.
|
|
1328
|
+
var view: u32 = 0;
|
|
1329
|
+
for (self.recovery_response_from_other_replicas) |received, replica| {
|
|
1330
|
+
if (received) |response| {
|
|
1331
|
+
assert(replica != self.replica);
|
|
1332
|
+
assert(response.header.replica == replica);
|
|
1333
|
+
assert(response.header.context == self.recovery_nonce);
|
|
1334
|
+
|
|
1335
|
+
view = std.math.max(view, response.header.view);
|
|
1336
|
+
}
|
|
1337
|
+
}
|
|
1338
|
+
break :blk view;
|
|
1339
|
+
};
|
|
1340
|
+
|
|
1341
|
+
const leader_response = responses[self.leader_index(view)];
|
|
1342
|
+
if (leader_response == null) {
|
|
1343
|
+
log.debug(
|
|
1344
|
+
"{}: on_recovery_response: ignoring (awaiting response from leader of view={})",
|
|
1345
|
+
.{
|
|
1346
|
+
self.replica,
|
|
1347
|
+
view,
|
|
1348
|
+
},
|
|
1349
|
+
);
|
|
1350
|
+
return;
|
|
1351
|
+
}
|
|
1352
|
+
|
|
1353
|
+
if (leader_response.?.header.view != view) {
|
|
1354
|
+
// The leader (according to the view quorum) isn't the leader (according to itself).
|
|
1355
|
+
// The `recovery_timeout` will retry shortly with another round.
|
|
1356
|
+
log.debug(
|
|
1357
|
+
"{}: on_recovery_response: ignoring (leader view={} != quorum view={})",
|
|
1358
|
+
.{
|
|
1359
|
+
self.replica,
|
|
1360
|
+
leader_response.?.header.view,
|
|
1361
|
+
view,
|
|
1362
|
+
},
|
|
1363
|
+
);
|
|
1364
|
+
return;
|
|
1365
|
+
}
|
|
1366
|
+
|
|
1367
|
+
// This recovering→normal status transition occurs exactly once.
|
|
1368
|
+
// All further `recovery_response` messages are ignored.
|
|
1369
|
+
|
|
1370
|
+
// TODO When the view is recovered from the superblock (instead of via the VSR recovery
|
|
1371
|
+
// protocol), if the view number indicates that this replica is a leader, it must
|
|
1372
|
+
// transition to status=view_change instead of status=normal.
|
|
1373
|
+
|
|
1374
|
+
const leader_headers = self.message_body_as_headers(leader_response.?);
|
|
1375
|
+
assert(leader_headers.len > 0);
|
|
1376
|
+
|
|
1377
|
+
const commit = leader_response.?.header.commit;
|
|
1378
|
+
{
|
|
1379
|
+
var latest = Header.reserved(self.cluster, 0);
|
|
1380
|
+
set_latest_op(leader_headers, &latest);
|
|
1381
|
+
assert(latest.op == leader_response.?.header.op);
|
|
1382
|
+
|
|
1383
|
+
self.set_latest_op_and_k(&latest, commit, "on_recovery_response");
|
|
1384
|
+
assert(self.op == latest.op);
|
|
1385
|
+
assert(self.journal.header_with_op(self.op) != null);
|
|
1386
|
+
}
|
|
1387
|
+
|
|
1388
|
+
assert(self.status == .recovering);
|
|
1389
|
+
self.transition_to_normal_from_recovering_status(view);
|
|
1390
|
+
assert(self.status == .normal);
|
|
1391
|
+
assert(self.follower());
|
|
1392
|
+
|
|
1393
|
+
// TODO If the view's primary is >1 WAL ahead of us, these headers could cause
|
|
1394
|
+
// problems. We don't want to jump this far ahead to repair, but we still need to use
|
|
1395
|
+
// the hash chain to figure out which headers to request. Maybe include our
|
|
1396
|
+
// `op_checkpoint` in the recovery (request) message so that the response can give more
|
|
1397
|
+
// useful (i.e. older) headers.
|
|
1398
|
+
for (leader_headers) |*header| {
|
|
1399
|
+
_ = self.repair_header(header);
|
|
1400
|
+
}
|
|
1401
|
+
|
|
1402
|
+
if (self.op < config.journal_slot_count) {
|
|
1403
|
+
if (self.journal.header_with_op(0)) |header| {
|
|
1404
|
+
assert(header.command == .prepare);
|
|
1405
|
+
assert(header.operation == .root);
|
|
1406
|
+
} else {
|
|
1407
|
+
// This is the first wrap of the log, and the root prepare is corrupt.
|
|
1408
|
+
// Repair the root prepare. This is necessary to maintain the invariant that the
|
|
1409
|
+
// op=commit_min exists in-memory.
|
|
1410
|
+
const header = Header.root_prepare(self.cluster);
|
|
1411
|
+
self.journal.set_header_as_dirty(&header);
|
|
1412
|
+
log.debug("{}: on_recovery_response: repair root op", .{self.replica});
|
|
1413
|
+
}
|
|
1414
|
+
}
|
|
1415
|
+
|
|
1416
|
+
log.debug("{}: on_recovery_response: responses={} view={} headers={}..{}" ++
|
|
1417
|
+
" commit={} dirty={} faulty={}", .{
|
|
1418
|
+
self.replica,
|
|
1419
|
+
count,
|
|
1420
|
+
view,
|
|
1421
|
+
leader_headers[leader_headers.len - 1].op,
|
|
1422
|
+
leader_headers[0].op,
|
|
1423
|
+
commit,
|
|
1424
|
+
self.journal.dirty.count,
|
|
1425
|
+
self.journal.faulty.count,
|
|
1426
|
+
});
|
|
1427
|
+
|
|
1428
|
+
self.state_machine.prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
|
|
1429
|
+
// `state_machine.commit_timestamp` is updated as messages are committed.
|
|
1430
|
+
|
|
1431
|
+
self.reset_quorum_recovery_response();
|
|
1432
|
+
self.commit_ops(commit);
|
|
1433
|
+
self.repair();
|
|
1178
1434
|
}
|
|
1179
1435
|
|
|
1436
|
+
/// If the requested prepare has been guaranteed by this replica:
|
|
1437
|
+
/// * Read the prepare from storage, and forward it to the replica that requested it.
|
|
1438
|
+
/// * Otherwise send no reply — it isn't safe to nack.
|
|
1439
|
+
/// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
|
|
1440
|
+
///
|
|
1441
|
+
/// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
|
|
1442
|
+
/// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
|
|
1443
|
+
/// prepare. If a guaranteed prepare is found to be faulty, the replica must repair it
|
|
1444
|
+
/// to restore durability.
|
|
1180
1445
|
fn on_request_prepare(self: *Self, message: *const Message) void {
|
|
1181
1446
|
if (self.ignore_repair_message(message)) return;
|
|
1182
1447
|
|
|
1448
|
+
assert(self.replica_count > 1);
|
|
1183
1449
|
assert(self.status == .normal or self.status == .view_change);
|
|
1184
1450
|
assert(message.header.view == self.view);
|
|
1185
1451
|
assert(message.header.replica != self.replica);
|
|
1186
1452
|
|
|
1187
1453
|
const op = message.header.op;
|
|
1188
|
-
|
|
1189
|
-
|
|
1454
|
+
const slot = self.journal.slot_for_op(op);
|
|
1455
|
+
const checksum: ?u128 = switch (message.header.timestamp) {
|
|
1456
|
+
0 => null,
|
|
1457
|
+
1 => message.header.context,
|
|
1458
|
+
else => unreachable,
|
|
1459
|
+
};
|
|
1190
1460
|
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
assert(checksum == null or entry.checksum == checksum.?);
|
|
1461
|
+
// Only the leader may respond to `request_prepare` messages without a checksum.
|
|
1462
|
+
assert(checksum != null or self.leader_index(self.view) == self.replica);
|
|
1194
1463
|
|
|
1195
|
-
|
|
1196
|
-
|
|
1464
|
+
// Try to serve the message directly from the pipeline.
|
|
1465
|
+
// This saves us from going to disk. And we don't need to worry that the WAL's copy
|
|
1466
|
+
// of an uncommitted prepare is lost/corrupted.
|
|
1467
|
+
if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
|
|
1468
|
+
log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
|
|
1469
|
+
self.replica,
|
|
1470
|
+
op,
|
|
1471
|
+
checksum,
|
|
1472
|
+
});
|
|
1473
|
+
self.send_message_to_replica(message.header.replica, prepare.message);
|
|
1474
|
+
return;
|
|
1475
|
+
}
|
|
1197
1476
|
|
|
1477
|
+
if (self.journal.prepare_inhabited[slot.index]) {
|
|
1478
|
+
const prepare_checksum = self.journal.prepare_checksums[slot.index];
|
|
1479
|
+
// Consult `journal.prepare_checksums` (rather than `journal.headers`):
|
|
1480
|
+
// the former may have the prepare we want — even if journal recovery marked the
|
|
1481
|
+
// slot as faulty and left the in-memory header as reserved.
|
|
1482
|
+
if (checksum == null or checksum.? == prepare_checksum) {
|
|
1198
1483
|
log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
|
|
1199
1484
|
self.replica,
|
|
1200
1485
|
op,
|
|
1201
1486
|
checksum,
|
|
1202
1487
|
});
|
|
1203
1488
|
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1489
|
+
if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
|
|
1490
|
+
// The header for the target prepare is already in-memory.
|
|
1491
|
+
// This is preferable to the `else` case since we have the prepare's
|
|
1492
|
+
// `header.size` in-memory, so the read can be (potentially) shorter.
|
|
1493
|
+
// TODO Do not reissue the read if we are already reading in order to send
|
|
1494
|
+
// to this particular destination replica.
|
|
1495
|
+
self.journal.read_prepare(
|
|
1496
|
+
on_request_prepare_read,
|
|
1497
|
+
op,
|
|
1498
|
+
prepare_checksum,
|
|
1499
|
+
message.header.replica,
|
|
1500
|
+
);
|
|
1501
|
+
} else {
|
|
1502
|
+
// TODO Do not reissue the read if we are already reading in order to send to
|
|
1503
|
+
// this particular destination replica.
|
|
1504
|
+
self.journal.read_prepare_with_op_and_checksum(
|
|
1505
|
+
on_request_prepare_read,
|
|
1506
|
+
op,
|
|
1507
|
+
prepare_checksum,
|
|
1508
|
+
message.header.replica,
|
|
1509
|
+
);
|
|
1510
|
+
}
|
|
1222
1511
|
|
|
1223
|
-
// We have
|
|
1512
|
+
// We have guaranteed the prepare (not safe to nack).
|
|
1513
|
+
// Our copy may or may not be valid, but we will try to read & forward it.
|
|
1224
1514
|
return;
|
|
1225
1515
|
}
|
|
1516
|
+
}
|
|
1226
1517
|
|
|
1227
|
-
|
|
1228
|
-
//
|
|
1518
|
+
{
|
|
1519
|
+
// We may have guaranteed the prepare but our copy is faulty (not safe to nack).
|
|
1520
|
+
if (self.journal.faulty.bit(slot)) return;
|
|
1521
|
+
if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
|
|
1522
|
+
if (self.journal.dirty.bit(slot)) {
|
|
1523
|
+
// We know of the prepare but have yet to write it (safe to nack).
|
|
1524
|
+
// Continue through below...
|
|
1525
|
+
} else {
|
|
1526
|
+
// We have guaranteed the prepare and our copy is clean (not safe to nack).
|
|
1527
|
+
return;
|
|
1528
|
+
}
|
|
1529
|
+
}
|
|
1229
1530
|
}
|
|
1230
1531
|
|
|
1532
|
+
// Protocol-Aware Recovery's CTRL protocol only runs during the view change, when the
|
|
1533
|
+
// new primary needs to repair its own WAL before starting the new view.
|
|
1534
|
+
//
|
|
1535
|
+
// This branch is only where the backup doesn't have the prepare and could possibly
|
|
1536
|
+
// send a nack as part of the CTRL protocol. Nacks only get sent during a view change
|
|
1537
|
+
// to help the new primary trim uncommitted ops that couldn't otherwise be repaired.
|
|
1538
|
+
// Without doing this, the cluster would become permanently unavailable. So backups
|
|
1539
|
+
// shouldn't respond to the `request_prepare` if the new view has already started,
|
|
1540
|
+
// they should also be in view change status, waiting for the new primary to start
|
|
1541
|
+
// the view.
|
|
1231
1542
|
if (self.status == .view_change) {
|
|
1232
1543
|
assert(message.header.replica == self.leader_index(self.view));
|
|
1233
1544
|
assert(checksum != null);
|
|
1234
|
-
|
|
1235
|
-
|
|
1545
|
+
|
|
1546
|
+
if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
|
|
1547
|
+
assert(self.journal.dirty.bit(slot) and !self.journal.faulty.bit(slot));
|
|
1548
|
+
}
|
|
1549
|
+
|
|
1550
|
+
if (self.journal.prepare_inhabited[slot.index]) {
|
|
1551
|
+
assert(self.journal.prepare_checksums[slot.index] != checksum.?);
|
|
1236
1552
|
}
|
|
1237
1553
|
|
|
1238
1554
|
log.debug("{}: on_request_prepare: op={} checksum={} nacking", .{
|
|
@@ -1276,14 +1592,7 @@ pub fn Replica(
|
|
|
1276
1592
|
assert(message.header.view == self.view);
|
|
1277
1593
|
assert(message.header.replica != self.replica);
|
|
1278
1594
|
|
|
1279
|
-
const response = self.message_bus.get_message()
|
|
1280
|
-
log.err("{}: on_request_headers: ignoring (op={}..{}, no message available)", .{
|
|
1281
|
-
self.replica,
|
|
1282
|
-
message.header.commit,
|
|
1283
|
-
message.header.op,
|
|
1284
|
-
});
|
|
1285
|
-
return;
|
|
1286
|
-
};
|
|
1595
|
+
const response = self.message_bus.get_message();
|
|
1287
1596
|
defer self.message_bus.unref(response);
|
|
1288
1597
|
|
|
1289
1598
|
response.header.* = .{
|
|
@@ -1299,21 +1608,9 @@ pub fn Replica(
|
|
|
1299
1608
|
const op_max = message.header.op;
|
|
1300
1609
|
assert(op_max >= op_min);
|
|
1301
1610
|
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
assert(
|
|
1305
|
-
|
|
1306
|
-
const size_max = @sizeOf(Header) * std.math.min(
|
|
1307
|
-
std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
|
|
1308
|
-
1 + count_max,
|
|
1309
|
-
);
|
|
1310
|
-
assert(size_max > @sizeOf(Header));
|
|
1311
|
-
|
|
1312
|
-
const count = self.journal.copy_latest_headers_between(
|
|
1313
|
-
op_min,
|
|
1314
|
-
op_max,
|
|
1315
|
-
std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
|
|
1316
|
-
);
|
|
1611
|
+
const count = self.copy_latest_headers_and_set_size(op_min, op_max, null, response);
|
|
1612
|
+
assert(count >= 0);
|
|
1613
|
+
assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
|
|
1317
1614
|
|
|
1318
1615
|
if (count == 0) {
|
|
1319
1616
|
log.debug("{}: on_request_headers: ignoring (op={}..{}, no headers)", .{
|
|
@@ -1324,8 +1621,6 @@ pub fn Replica(
|
|
|
1324
1621
|
return;
|
|
1325
1622
|
}
|
|
1326
1623
|
|
|
1327
|
-
response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
|
|
1328
|
-
|
|
1329
1624
|
response.header.set_checksum_body(response.body());
|
|
1330
1625
|
response.header.set_checksum();
|
|
1331
1626
|
|
|
@@ -1348,7 +1643,8 @@ pub fn Replica(
|
|
|
1348
1643
|
}
|
|
1349
1644
|
|
|
1350
1645
|
const op = self.nack_prepare_op.?;
|
|
1351
|
-
const checksum = self.journal.
|
|
1646
|
+
const checksum = self.journal.header_with_op(op).?.checksum;
|
|
1647
|
+
const slot = self.journal.slot_with_op(op).?;
|
|
1352
1648
|
|
|
1353
1649
|
if (message.header.op != op) {
|
|
1354
1650
|
log.debug("{}: on_nack_prepare: ignoring (repairing another op)", .{self.replica});
|
|
@@ -1383,14 +1679,14 @@ pub fn Replica(
|
|
|
1383
1679
|
// Otherwise, if we know we do not have the op, then we can exclude ourselves.
|
|
1384
1680
|
assert(self.replica_count > 1);
|
|
1385
1681
|
|
|
1386
|
-
const threshold = if (self.journal.faulty.bit(
|
|
1682
|
+
const threshold = if (self.journal.faulty.bit(slot))
|
|
1387
1683
|
self.replica_count - self.quorum_replication + 1
|
|
1388
1684
|
else
|
|
1389
1685
|
self.replica_count - self.quorum_replication;
|
|
1390
1686
|
|
|
1391
1687
|
if (threshold == 0) {
|
|
1392
1688
|
assert(self.replica_count == 2);
|
|
1393
|
-
assert(!self.journal.faulty.bit(
|
|
1689
|
+
assert(!self.journal.faulty.bit(slot));
|
|
1394
1690
|
|
|
1395
1691
|
// This is a special case for a cluster-of-two, handled in `repair_prepare()`.
|
|
1396
1692
|
log.debug("{}: on_nack_prepare: ignoring (cluster-of-two, not faulty)", .{
|
|
@@ -1399,10 +1695,11 @@ pub fn Replica(
|
|
|
1399
1695
|
return;
|
|
1400
1696
|
}
|
|
1401
1697
|
|
|
1402
|
-
log.debug("{}: on_nack_prepare: quorum_replication={} threshold={}", .{
|
|
1698
|
+
log.debug("{}: on_nack_prepare: quorum_replication={} threshold={} op={}", .{
|
|
1403
1699
|
self.replica,
|
|
1404
1700
|
self.quorum_replication,
|
|
1405
1701
|
threshold,
|
|
1702
|
+
op,
|
|
1406
1703
|
});
|
|
1407
1704
|
|
|
1408
1705
|
// We should never expect to receive a nack from ourselves:
|
|
@@ -1410,15 +1707,15 @@ pub fn Replica(
|
|
|
1410
1707
|
assert(threshold < self.replica_count);
|
|
1411
1708
|
|
|
1412
1709
|
// Wait until we have `threshold` messages for quorum:
|
|
1413
|
-
const count = self.
|
|
1710
|
+
const count = self.count_message_and_receive_quorum_exactly_once(
|
|
1414
1711
|
&self.nack_prepare_from_other_replicas,
|
|
1415
1712
|
message,
|
|
1416
1713
|
threshold,
|
|
1417
1714
|
) orelse return;
|
|
1418
1715
|
|
|
1419
1716
|
assert(count == threshold);
|
|
1420
|
-
assert(self.nack_prepare_from_other_replicas
|
|
1421
|
-
log.debug("{}: on_nack_prepare: quorum received", .{self.replica});
|
|
1717
|
+
assert(!self.nack_prepare_from_other_replicas.isSet(self.replica));
|
|
1718
|
+
log.debug("{}: on_nack_prepare: quorum received op={}", .{ self.replica, op });
|
|
1422
1719
|
|
|
1423
1720
|
self.discard_uncommitted_ops_from(op, checksum);
|
|
1424
1721
|
self.reset_quorum_nack_prepare();
|
|
@@ -1487,23 +1784,37 @@ pub fn Replica(
|
|
|
1487
1784
|
// The list of remote replicas yet to send a prepare_ok:
|
|
1488
1785
|
var waiting: [config.replicas_max]u8 = undefined;
|
|
1489
1786
|
var waiting_len: usize = 0;
|
|
1490
|
-
|
|
1491
|
-
|
|
1787
|
+
var ok_from_all_replicas_iterator = prepare.ok_from_all_replicas.iterator(.{
|
|
1788
|
+
.kind = .unset,
|
|
1789
|
+
});
|
|
1790
|
+
while (ok_from_all_replicas_iterator.next()) |replica| {
|
|
1791
|
+
// Ensure we don't wait for replicas that don't exist.
|
|
1792
|
+
// The bits between `replica_count` and `replicas_max` are always unset,
|
|
1793
|
+
// since they don't actually represent replicas.
|
|
1794
|
+
if (replica == self.replica_count) {
|
|
1795
|
+
assert(self.replica_count < config.replicas_max);
|
|
1796
|
+
break;
|
|
1797
|
+
}
|
|
1798
|
+
assert(replica < self.replica_count);
|
|
1799
|
+
|
|
1800
|
+
if (replica != self.replica) {
|
|
1492
1801
|
waiting[waiting_len] = @intCast(u8, replica);
|
|
1493
1802
|
waiting_len += 1;
|
|
1494
1803
|
}
|
|
1804
|
+
} else {
|
|
1805
|
+
assert(self.replica_count == config.replicas_max);
|
|
1495
1806
|
}
|
|
1496
1807
|
|
|
1497
1808
|
if (waiting_len == 0) {
|
|
1498
1809
|
self.prepare_timeout.reset();
|
|
1499
1810
|
|
|
1500
1811
|
log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
|
|
1501
|
-
assert(prepare.ok_from_all_replicas
|
|
1812
|
+
assert(!prepare.ok_from_all_replicas.isSet(self.replica));
|
|
1502
1813
|
|
|
1503
1814
|
// We may be slow and waiting for the write to complete.
|
|
1504
1815
|
//
|
|
1505
1816
|
// We may even have maxed out our IO depth and been unable to initiate the write,
|
|
1506
|
-
// which can happen if `config.
|
|
1817
|
+
// which can happen if `config.pipeline_max` exceeds `config.io_depth_write`.
|
|
1507
1818
|
// This can lead to deadlock for a cluster of one or two (if we do not retry here),
|
|
1508
1819
|
// since there is no other way for the leader to repair the dirty op because no
|
|
1509
1820
|
// other replica has it.
|
|
@@ -1533,7 +1844,10 @@ pub fn Replica(
|
|
|
1533
1844
|
const replica = waiting[self.prepare_timeout.attempts % waiting_len];
|
|
1534
1845
|
assert(replica != self.replica);
|
|
1535
1846
|
|
|
1536
|
-
log.debug("{}: on_prepare_timeout: replicating to replica {}", .{
|
|
1847
|
+
log.debug("{}: on_prepare_timeout: replicating to replica {}", .{
|
|
1848
|
+
self.replica,
|
|
1849
|
+
replica,
|
|
1850
|
+
});
|
|
1537
1851
|
self.send_message_to_replica(replica, prepare.message);
|
|
1538
1852
|
}
|
|
1539
1853
|
|
|
@@ -1545,7 +1859,7 @@ pub fn Replica(
|
|
|
1545
1859
|
assert(self.commit_min == self.commit_max);
|
|
1546
1860
|
|
|
1547
1861
|
// TODO Snapshots: Use snapshot checksum if commit is no longer in journal.
|
|
1548
|
-
const latest_committed_entry = self.journal.
|
|
1862
|
+
const latest_committed_entry = self.journal.header_with_op(self.commit_max).?;
|
|
1549
1863
|
|
|
1550
1864
|
self.send_header_to_other_replicas(.{
|
|
1551
1865
|
.command = .commit,
|
|
@@ -1590,7 +1904,14 @@ pub fn Replica(
|
|
|
1590
1904
|
self.repair();
|
|
1591
1905
|
}
|
|
1592
1906
|
|
|
1593
|
-
fn
|
|
1907
|
+
fn on_recovery_timeout(self: *Self) void {
|
|
1908
|
+
assert(self.status == .recovering);
|
|
1909
|
+
assert(self.replica_count > 1);
|
|
1910
|
+
self.recovery_timeout.reset();
|
|
1911
|
+
self.recover();
|
|
1912
|
+
}
|
|
1913
|
+
|
|
1914
|
+
fn reference_message_and_receive_quorum_exactly_once(
|
|
1594
1915
|
self: *Self,
|
|
1595
1916
|
messages: *QuorumMessages,
|
|
1596
1917
|
message: *Message,
|
|
@@ -1604,18 +1925,6 @@ pub fn Replica(
|
|
|
1604
1925
|
assert(message.header.replica < self.replica_count);
|
|
1605
1926
|
assert(message.header.view == self.view);
|
|
1606
1927
|
switch (message.header.command) {
|
|
1607
|
-
.prepare_ok => {
|
|
1608
|
-
if (self.replica_count <= 2) assert(threshold == self.replica_count);
|
|
1609
|
-
|
|
1610
|
-
assert(self.status == .normal);
|
|
1611
|
-
assert(self.leader());
|
|
1612
|
-
},
|
|
1613
|
-
.start_view_change => {
|
|
1614
|
-
assert(self.replica_count > 1);
|
|
1615
|
-
if (self.replica_count == 2) assert(threshold == 1);
|
|
1616
|
-
|
|
1617
|
-
assert(self.status == .view_change);
|
|
1618
|
-
},
|
|
1619
1928
|
.do_view_change => {
|
|
1620
1929
|
assert(self.replica_count > 1);
|
|
1621
1930
|
if (self.replica_count == 2) assert(threshold == 2);
|
|
@@ -1623,13 +1932,6 @@ pub fn Replica(
|
|
|
1623
1932
|
assert(self.status == .view_change);
|
|
1624
1933
|
assert(self.leader_index(self.view) == self.replica);
|
|
1625
1934
|
},
|
|
1626
|
-
.nack_prepare => {
|
|
1627
|
-
assert(self.replica_count > 1);
|
|
1628
|
-
if (self.replica_count == 2) assert(threshold >= 1);
|
|
1629
|
-
|
|
1630
|
-
assert(self.status == .view_change);
|
|
1631
|
-
assert(self.leader_index(self.view) == self.replica);
|
|
1632
|
-
},
|
|
1633
1935
|
else => unreachable,
|
|
1634
1936
|
}
|
|
1635
1937
|
|
|
@@ -1665,8 +1967,88 @@ pub fn Replica(
|
|
|
1665
1967
|
|
|
1666
1968
|
// This is not the first time we have had quorum, the state transition has already happened:
|
|
1667
1969
|
if (count > threshold) {
|
|
1668
|
-
log.debug("{}: on_{s}: ignoring (quorum received already)", .{
|
|
1669
|
-
|
|
1970
|
+
log.debug("{}: on_{s}: ignoring (quorum received already)", .{
|
|
1971
|
+
self.replica,
|
|
1972
|
+
command,
|
|
1973
|
+
});
|
|
1974
|
+
return null;
|
|
1975
|
+
}
|
|
1976
|
+
|
|
1977
|
+
assert(count == threshold);
|
|
1978
|
+
return count;
|
|
1979
|
+
}
|
|
1980
|
+
|
|
1981
|
+
fn count_message_and_receive_quorum_exactly_once(
|
|
1982
|
+
self: *Self,
|
|
1983
|
+
counter: *QuorumCounter,
|
|
1984
|
+
message: *Message,
|
|
1985
|
+
threshold: u32,
|
|
1986
|
+
) ?usize {
|
|
1987
|
+
assert(threshold >= 1);
|
|
1988
|
+
assert(threshold <= self.replica_count);
|
|
1989
|
+
|
|
1990
|
+
assert(QuorumCounter.bit_length == config.replicas_max);
|
|
1991
|
+
assert(message.header.cluster == self.cluster);
|
|
1992
|
+
assert(message.header.replica < self.replica_count);
|
|
1993
|
+
assert(message.header.view == self.view);
|
|
1994
|
+
|
|
1995
|
+
switch (message.header.command) {
|
|
1996
|
+
.prepare_ok => {
|
|
1997
|
+
if (self.replica_count <= 2) assert(threshold == self.replica_count);
|
|
1998
|
+
|
|
1999
|
+
assert(self.status == .normal);
|
|
2000
|
+
assert(self.leader());
|
|
2001
|
+
},
|
|
2002
|
+
.start_view_change => {
|
|
2003
|
+
assert(self.replica_count > 1);
|
|
2004
|
+
if (self.replica_count == 2) assert(threshold == 1);
|
|
2005
|
+
|
|
2006
|
+
assert(self.status == .view_change);
|
|
2007
|
+
},
|
|
2008
|
+
.nack_prepare => {
|
|
2009
|
+
assert(self.replica_count > 1);
|
|
2010
|
+
if (self.replica_count == 2) assert(threshold >= 1);
|
|
2011
|
+
|
|
2012
|
+
assert(self.status == .view_change);
|
|
2013
|
+
assert(self.leader_index(self.view) == self.replica);
|
|
2014
|
+
},
|
|
2015
|
+
else => unreachable,
|
|
2016
|
+
}
|
|
2017
|
+
|
|
2018
|
+
const command: []const u8 = @tagName(message.header.command);
|
|
2019
|
+
|
|
2020
|
+
// Do not allow duplicate messages to trigger multiple passes through a state transition:
|
|
2021
|
+
if (counter.isSet(message.header.replica)) {
|
|
2022
|
+
log.debug("{}: on_{s}: ignoring (duplicate message replica={})", .{
|
|
2023
|
+
self.replica,
|
|
2024
|
+
command,
|
|
2025
|
+
message.header.replica,
|
|
2026
|
+
});
|
|
2027
|
+
return null;
|
|
2028
|
+
}
|
|
2029
|
+
|
|
2030
|
+
// Record the first receipt of this message:
|
|
2031
|
+
counter.set(message.header.replica);
|
|
2032
|
+
assert(counter.isSet(message.header.replica));
|
|
2033
|
+
|
|
2034
|
+
// Count the number of unique messages now received:
|
|
2035
|
+
const count = counter.count();
|
|
2036
|
+
log.debug("{}: on_{s}: {} message(s)", .{ self.replica, command, count });
|
|
2037
|
+
assert(count <= self.replica_count);
|
|
2038
|
+
|
|
2039
|
+
// Wait until we have exactly `threshold` messages for quorum:
|
|
2040
|
+
if (count < threshold) {
|
|
2041
|
+
log.debug("{}: on_{s}: waiting for quorum", .{ self.replica, command });
|
|
2042
|
+
return null;
|
|
2043
|
+
}
|
|
2044
|
+
|
|
2045
|
+
// This is not the first time we have had quorum, the state transition has already happened:
|
|
2046
|
+
if (count > threshold) {
|
|
2047
|
+
log.debug("{}: on_{s}: ignoring (quorum received already)", .{
|
|
2048
|
+
self.replica,
|
|
2049
|
+
command,
|
|
2050
|
+
});
|
|
2051
|
+
return null;
|
|
1670
2052
|
}
|
|
1671
2053
|
|
|
1672
2054
|
assert(count == threshold);
|
|
@@ -1679,8 +2061,15 @@ pub fn Replica(
|
|
|
1679
2061
|
assert(message.header.view == self.view);
|
|
1680
2062
|
assert(message.header.op == self.op);
|
|
1681
2063
|
|
|
1682
|
-
|
|
1683
|
-
|
|
2064
|
+
if (self.replica_count == 1 and self.pipeline.count > 1) {
|
|
2065
|
+
// In a cluster-of-one, the prepares must always be written to the WAL sequentially
|
|
2066
|
+
// (never concurrently). This ensures that there will be no gaps in the WAL during
|
|
2067
|
+
// crash recovery.
|
|
2068
|
+
log.debug("{}: append: serializing append op={}", .{ self.replica, message.header.op });
|
|
2069
|
+
} else {
|
|
2070
|
+
log.debug("{}: append: appending to journal", .{self.replica});
|
|
2071
|
+
self.write_prepare(message, .append);
|
|
2072
|
+
}
|
|
1684
2073
|
}
|
|
1685
2074
|
|
|
1686
2075
|
/// Returns whether `b` succeeds `a` by having a newer view or same view and newer op.
|
|
@@ -1731,7 +2120,8 @@ pub fn Replica(
|
|
|
1731
2120
|
fn commit_ops(self: *Self, commit: u64) void {
|
|
1732
2121
|
// TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
|
|
1733
2122
|
// Be careful of concurrency when doing this, as successive view changes can happen quickly.
|
|
1734
|
-
assert(self.status == .normal or self.status == .view_change
|
|
2123
|
+
assert(self.status == .normal or self.status == .view_change or
|
|
2124
|
+
(self.status == .recovering and self.replica_count == 1));
|
|
1735
2125
|
assert(self.commit_min <= self.commit_max);
|
|
1736
2126
|
assert(self.commit_min <= self.op);
|
|
1737
2127
|
assert(self.commit_max <= self.op or self.commit_max > self.op);
|
|
@@ -1775,12 +2165,14 @@ pub fn Replica(
|
|
|
1775
2165
|
|
|
1776
2166
|
fn commit_ops_read(self: *Self) void {
|
|
1777
2167
|
assert(self.committing);
|
|
1778
|
-
assert(self.status == .normal or self.status == .view_change
|
|
2168
|
+
assert(self.status == .normal or self.status == .view_change or
|
|
2169
|
+
(self.status == .recovering and self.replica_count == 1));
|
|
1779
2170
|
assert(self.commit_min <= self.commit_max);
|
|
1780
2171
|
assert(self.commit_min <= self.op);
|
|
1781
2172
|
|
|
1782
2173
|
if (!self.valid_hash_chain("commit_ops_read")) {
|
|
1783
2174
|
self.committing = false;
|
|
2175
|
+
assert(self.replica_count > 1);
|
|
1784
2176
|
return;
|
|
1785
2177
|
}
|
|
1786
2178
|
assert(self.op >= self.commit_max);
|
|
@@ -1789,12 +2181,22 @@ pub fn Replica(
|
|
|
1789
2181
|
// Even a naive state transfer may fail to correct for this.
|
|
1790
2182
|
if (self.commit_min < self.commit_max and self.commit_min < self.op) {
|
|
1791
2183
|
const op = self.commit_min + 1;
|
|
1792
|
-
const checksum = self.journal.
|
|
2184
|
+
const checksum = self.journal.header_with_op(op).?.checksum;
|
|
1793
2185
|
self.journal.read_prepare(commit_ops_commit, op, checksum, null);
|
|
1794
2186
|
} else {
|
|
1795
2187
|
self.committing = false;
|
|
1796
2188
|
// This is an optimization to expedite the view change before the `repair_timeout`:
|
|
1797
2189
|
if (self.status == .view_change and self.repairs_allowed()) self.repair();
|
|
2190
|
+
|
|
2191
|
+
if (self.status == .recovering) {
|
|
2192
|
+
assert(self.replica_count == 1);
|
|
2193
|
+
assert(self.commit_min == self.commit_max);
|
|
2194
|
+
assert(self.commit_min == self.op);
|
|
2195
|
+
self.transition_to_normal_from_recovering_status(0);
|
|
2196
|
+
} else {
|
|
2197
|
+
// We expect that a cluster-of-one only calls commit_ops() in recovering status.
|
|
2198
|
+
assert(self.replica_count > 1);
|
|
2199
|
+
}
|
|
1798
2200
|
}
|
|
1799
2201
|
}
|
|
1800
2202
|
|
|
@@ -1806,42 +2208,43 @@ pub fn Replica(
|
|
|
1806
2208
|
|
|
1807
2209
|
if (prepare == null) {
|
|
1808
2210
|
log.debug("{}: commit_ops_commit: prepare == null", .{self.replica});
|
|
2211
|
+
if (self.replica_count == 1) @panic("cannot recover corrupt prepare");
|
|
1809
2212
|
return;
|
|
1810
2213
|
}
|
|
1811
2214
|
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
2215
|
+
switch (self.status) {
|
|
2216
|
+
.normal => {},
|
|
2217
|
+
.view_change => {
|
|
2218
|
+
if (self.leader_index(self.view) != self.replica) {
|
|
2219
|
+
log.debug("{}: commit_ops_commit: no longer leader", .{self.replica});
|
|
2220
|
+
assert(self.replica_count > 1);
|
|
2221
|
+
return;
|
|
2222
|
+
}
|
|
1817
2223
|
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
2224
|
+
// Only the leader may commit during a view change before starting the new view.
|
|
2225
|
+
// Fall through if this is indeed the case.
|
|
2226
|
+
},
|
|
2227
|
+
.recovering => {
|
|
2228
|
+
assert(self.replica_count == 1);
|
|
2229
|
+
assert(self.leader_index(self.view) == self.replica);
|
|
2230
|
+
},
|
|
1823
2231
|
}
|
|
1824
2232
|
|
|
1825
2233
|
const op = self.commit_min + 1;
|
|
1826
2234
|
|
|
1827
2235
|
if (prepare.?.header.op != op) {
|
|
1828
2236
|
log.debug("{}: commit_ops_commit: op changed", .{self.replica});
|
|
2237
|
+
assert(self.replica_count > 1);
|
|
1829
2238
|
return;
|
|
1830
2239
|
}
|
|
1831
2240
|
|
|
1832
|
-
if (prepare.?.header.checksum != self.journal.
|
|
2241
|
+
if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
|
|
1833
2242
|
log.debug("{}: commit_ops_commit: checksum changed", .{self.replica});
|
|
2243
|
+
assert(self.replica_count > 1);
|
|
1834
2244
|
return;
|
|
1835
2245
|
}
|
|
1836
2246
|
|
|
1837
|
-
|
|
1838
|
-
const reply = self.message_bus.get_message() orelse {
|
|
1839
|
-
log.err("{}: commit_ops_commit: waiting for message", .{self.replica});
|
|
1840
|
-
return;
|
|
1841
|
-
};
|
|
1842
|
-
defer self.message_bus.unref(reply);
|
|
1843
|
-
|
|
1844
|
-
self.commit_op(prepare.?, reply);
|
|
2247
|
+
self.commit_op(prepare.?);
|
|
1845
2248
|
|
|
1846
2249
|
assert(self.commit_min == op);
|
|
1847
2250
|
assert(self.commit_min <= self.commit_max);
|
|
@@ -1851,11 +2254,12 @@ pub fn Replica(
|
|
|
1851
2254
|
self.commit_ops_read();
|
|
1852
2255
|
}
|
|
1853
2256
|
|
|
1854
|
-
fn commit_op(self: *Self, prepare: *const Message
|
|
2257
|
+
fn commit_op(self: *Self, prepare: *const Message) void {
|
|
1855
2258
|
// TODO Can we add more checks around allowing commit_op() during a view change?
|
|
1856
|
-
assert(self.status == .normal or self.status == .view_change
|
|
2259
|
+
assert(self.status == .normal or self.status == .view_change or
|
|
2260
|
+
(self.status == .recovering and self.replica_count == 1));
|
|
1857
2261
|
assert(prepare.header.command == .prepare);
|
|
1858
|
-
assert(prepare.header.operation != .
|
|
2262
|
+
assert(prepare.header.operation != .root);
|
|
1859
2263
|
assert(prepare.header.op == self.commit_min + 1);
|
|
1860
2264
|
assert(prepare.header.op <= self.op);
|
|
1861
2265
|
|
|
@@ -1863,7 +2267,7 @@ pub fn Replica(
|
|
|
1863
2267
|
// happened since we last checked in `commit_ops_read()`. However, this would relate to
|
|
1864
2268
|
// subsequent ops, since by now we have already verified the hash chain for this commit.
|
|
1865
2269
|
|
|
1866
|
-
assert(self.journal.
|
|
2270
|
+
assert(self.journal.header_with_op(self.commit_min).?.checksum ==
|
|
1867
2271
|
prepare.header.parent);
|
|
1868
2272
|
|
|
1869
2273
|
log.debug("{}: commit_op: executing view={} {} op={} checksum={} ({s})", .{
|
|
@@ -1875,6 +2279,11 @@ pub fn Replica(
|
|
|
1875
2279
|
@tagName(prepare.header.operation.cast(StateMachine)),
|
|
1876
2280
|
});
|
|
1877
2281
|
|
|
2282
|
+
const reply = self.message_bus.get_message();
|
|
2283
|
+
defer self.message_bus.unref(reply);
|
|
2284
|
+
|
|
2285
|
+
assert(self.state_machine.commit_timestamp < prepare.header.timestamp);
|
|
2286
|
+
|
|
1878
2287
|
const reply_body_size = @intCast(u32, self.state_machine.commit(
|
|
1879
2288
|
prepare.header.client,
|
|
1880
2289
|
prepare.header.operation.cast(StateMachine),
|
|
@@ -1882,6 +2291,9 @@ pub fn Replica(
|
|
|
1882
2291
|
reply.buffer[@sizeOf(Header)..],
|
|
1883
2292
|
));
|
|
1884
2293
|
|
|
2294
|
+
assert(self.state_machine.commit_timestamp <= prepare.header.timestamp);
|
|
2295
|
+
self.state_machine.commit_timestamp = prepare.header.timestamp;
|
|
2296
|
+
|
|
1885
2297
|
self.commit_min += 1;
|
|
1886
2298
|
assert(self.commit_min == prepare.header.op);
|
|
1887
2299
|
if (self.commit_min > self.commit_max) self.commit_max = self.commit_min;
|
|
@@ -1901,10 +2313,10 @@ pub fn Replica(
|
|
|
1901
2313
|
.commit = prepare.header.op,
|
|
1902
2314
|
.size = @sizeOf(Header) + reply_body_size,
|
|
1903
2315
|
};
|
|
1904
|
-
assert(reply.header.
|
|
2316
|
+
assert(reply.header.timestamp == 0);
|
|
1905
2317
|
assert(reply.header.epoch == 0);
|
|
1906
2318
|
|
|
1907
|
-
reply.header.set_checksum_body(reply.
|
|
2319
|
+
reply.header.set_checksum_body(reply.body());
|
|
1908
2320
|
reply.header.set_checksum();
|
|
1909
2321
|
|
|
1910
2322
|
if (reply.header.operation == .register) {
|
|
@@ -1939,28 +2351,25 @@ pub fn Replica(
|
|
|
1939
2351
|
return;
|
|
1940
2352
|
}
|
|
1941
2353
|
|
|
1942
|
-
const count =
|
|
1943
|
-
&prepare.ok_from_all_replicas,
|
|
1944
|
-
.prepare_ok,
|
|
1945
|
-
prepare.message.header.checksum,
|
|
1946
|
-
);
|
|
2354
|
+
const count = prepare.ok_from_all_replicas.count();
|
|
1947
2355
|
assert(count >= self.quorum_replication);
|
|
2356
|
+
assert(count <= self.replica_count);
|
|
1948
2357
|
|
|
1949
|
-
|
|
1950
|
-
const reply = self.message_bus.get_message() orelse {
|
|
1951
|
-
// Eventually handled by on_prepare_timeout().
|
|
1952
|
-
log.err("{}: commit_pipeline: waiting for message", .{self.replica});
|
|
1953
|
-
return;
|
|
1954
|
-
};
|
|
1955
|
-
defer self.message_bus.unref(reply);
|
|
1956
|
-
|
|
1957
|
-
self.commit_op(prepare.message, reply);
|
|
2358
|
+
self.commit_op(prepare.message);
|
|
1958
2359
|
|
|
1959
2360
|
assert(self.commit_min == self.commit_max);
|
|
1960
2361
|
assert(self.commit_max == prepare.message.header.op);
|
|
1961
2362
|
|
|
1962
|
-
self.
|
|
1963
|
-
|
|
2363
|
+
self.message_bus.unref(self.pipeline.pop().?.message);
|
|
2364
|
+
|
|
2365
|
+
if (self.replica_count == 1) {
|
|
2366
|
+
if (self.pipeline.head_ptr()) |head| {
|
|
2367
|
+
// Write the next message in the queue.
|
|
2368
|
+
// A cluster-of-one writes prepares sequentially to avoid gaps in the WAL.
|
|
2369
|
+
self.write_prepare(head.message, .append);
|
|
2370
|
+
// The loop will wrap around and exit when `!ok_quorum_received`.
|
|
2371
|
+
}
|
|
2372
|
+
}
|
|
1964
2373
|
}
|
|
1965
2374
|
|
|
1966
2375
|
assert(self.prepare_timeout.ticking);
|
|
@@ -1968,6 +2377,39 @@ pub fn Replica(
|
|
|
1968
2377
|
if (self.pipeline.count == 0) self.prepare_timeout.stop();
|
|
1969
2378
|
}
|
|
1970
2379
|
|
|
2380
|
+
fn copy_latest_headers_and_set_size(
|
|
2381
|
+
self: *const Self,
|
|
2382
|
+
op_min: u64,
|
|
2383
|
+
op_max: u64,
|
|
2384
|
+
count_max: ?usize,
|
|
2385
|
+
message: *Message,
|
|
2386
|
+
) usize {
|
|
2387
|
+
assert(op_max >= op_min);
|
|
2388
|
+
assert(count_max == null or count_max.? > 0);
|
|
2389
|
+
assert(message.header.command == .do_view_change or
|
|
2390
|
+
message.header.command == .start_view or
|
|
2391
|
+
message.header.command == .headers or
|
|
2392
|
+
message.header.command == .recovery_response);
|
|
2393
|
+
|
|
2394
|
+
const body_size_max = @sizeOf(Header) * std.math.min(
|
|
2395
|
+
@divExact(message.buffer.len - @sizeOf(Header), @sizeOf(Header)),
|
|
2396
|
+
// We must add 1 because op_max and op_min are both inclusive:
|
|
2397
|
+
count_max orelse std.math.min(64, op_max - op_min + 1),
|
|
2398
|
+
);
|
|
2399
|
+
assert(body_size_max >= @sizeOf(Header));
|
|
2400
|
+
assert(count_max == null or body_size_max == count_max.? * @sizeOf(Header));
|
|
2401
|
+
|
|
2402
|
+
const count = self.journal.copy_latest_headers_between(
|
|
2403
|
+
op_min,
|
|
2404
|
+
op_max,
|
|
2405
|
+
std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..][0..body_size_max]),
|
|
2406
|
+
);
|
|
2407
|
+
|
|
2408
|
+
message.header.size = @intCast(u32, @sizeOf(Header) * (1 + count));
|
|
2409
|
+
|
|
2410
|
+
return count;
|
|
2411
|
+
}
|
|
2412
|
+
|
|
1971
2413
|
fn count_quorum(
|
|
1972
2414
|
self: *Self,
|
|
1973
2415
|
messages: *QuorumMessages,
|
|
@@ -1984,20 +2426,12 @@ pub fn Replica(
|
|
|
1984
2426
|
assert(m.header.context == context);
|
|
1985
2427
|
assert(m.header.replica == replica);
|
|
1986
2428
|
switch (command) {
|
|
1987
|
-
.prepare_ok => {
|
|
1988
|
-
if (self.status == .normal) {
|
|
1989
|
-
assert(self.leader());
|
|
1990
|
-
assert(m.header.view == self.view);
|
|
1991
|
-
} else {
|
|
1992
|
-
assert(self.status == .view_change);
|
|
1993
|
-
assert(m.header.view < self.view);
|
|
1994
|
-
}
|
|
1995
|
-
},
|
|
1996
2429
|
.start_view_change => {
|
|
1997
2430
|
assert(m.header.replica != self.replica);
|
|
1998
2431
|
assert(m.header.view == self.view);
|
|
1999
2432
|
},
|
|
2000
2433
|
.do_view_change => assert(m.header.view == self.view),
|
|
2434
|
+
.recovery_response => assert(m.header.replica != self.replica),
|
|
2001
2435
|
.nack_prepare => {
|
|
2002
2436
|
// TODO See if we can restrict this branch further.
|
|
2003
2437
|
assert(m.header.replica != self.replica);
|
|
@@ -2026,7 +2460,8 @@ pub fn Replica(
|
|
|
2026
2460
|
const session = reply.header.commit; // The commit number becomes the session number.
|
|
2027
2461
|
const request = reply.header.request;
|
|
2028
2462
|
|
|
2029
|
-
|
|
2463
|
+
// We reserved the `0` commit number for the cluster `.root` operation.
|
|
2464
|
+
assert(session > 0);
|
|
2030
2465
|
assert(request == 0);
|
|
2031
2466
|
|
|
2032
2467
|
// For correctness, it's critical that all replicas evict deterministically:
|
|
@@ -2090,13 +2525,13 @@ pub fn Replica(
|
|
|
2090
2525
|
}
|
|
2091
2526
|
|
|
2092
2527
|
/// The caller owns the returned message, if any, which has exactly 1 reference.
|
|
2093
|
-
fn create_view_change_message(self: *Self, command: Command)
|
|
2528
|
+
fn create_view_change_message(self: *Self, command: Command) *Message {
|
|
2094
2529
|
assert(command == .do_view_change or command == .start_view);
|
|
2095
2530
|
|
|
2096
2531
|
// We may send a start_view message in normal status to resolve a follower's view jump:
|
|
2097
2532
|
assert(self.status == .normal or self.status == .view_change);
|
|
2098
2533
|
|
|
2099
|
-
const message = self.message_bus.get_message()
|
|
2534
|
+
const message = self.message_bus.get_message();
|
|
2100
2535
|
defer self.message_bus.unref(message);
|
|
2101
2536
|
|
|
2102
2537
|
message.header.* = .{
|
|
@@ -2107,8 +2542,8 @@ pub fn Replica(
|
|
|
2107
2542
|
// The latest normal view (as specified in the 2012 paper) is different to the view
|
|
2108
2543
|
// number contained in the prepare headers we include in the body. The former shows
|
|
2109
2544
|
// how recent a view change the replica participated in, which may be much higher.
|
|
2110
|
-
// We use the `
|
|
2111
|
-
.
|
|
2545
|
+
// We use the `timestamp` field to send this in addition to the current view number:
|
|
2546
|
+
.timestamp = if (command == .do_view_change) self.view_normal else 0,
|
|
2112
2547
|
.op = self.op,
|
|
2113
2548
|
.commit = self.commit_max,
|
|
2114
2549
|
};
|
|
@@ -2119,25 +2554,12 @@ pub fn Replica(
|
|
|
2119
2554
|
// that cannot be repaired because they are gaps, and this must be relative to the
|
|
2120
2555
|
// cluster as a whole (not relative to the difference between our op and commit number)
|
|
2121
2556
|
// as otherwise we would break correctness.
|
|
2122
|
-
const count_max = config.
|
|
2557
|
+
const count_max = config.pipeline_max;
|
|
2123
2558
|
assert(count_max > 0);
|
|
2124
2559
|
|
|
2125
|
-
const
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
);
|
|
2129
|
-
assert(size_max > @sizeOf(Header));
|
|
2130
|
-
|
|
2131
|
-
const count = self.journal.copy_latest_headers_between(
|
|
2132
|
-
0,
|
|
2133
|
-
self.op,
|
|
2134
|
-
std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..size_max]),
|
|
2135
|
-
);
|
|
2136
|
-
|
|
2137
|
-
// We expect that self.op always exists.
|
|
2138
|
-
assert(count > 0);
|
|
2139
|
-
|
|
2140
|
-
message.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
|
|
2560
|
+
const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, message);
|
|
2561
|
+
assert(count > 0); // We expect that self.op always exists.
|
|
2562
|
+
assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);
|
|
2141
2563
|
|
|
2142
2564
|
message.header.set_checksum_body(message.body());
|
|
2143
2565
|
message.header.set_checksum();
|
|
@@ -2146,12 +2568,14 @@ pub fn Replica(
|
|
|
2146
2568
|
}
|
|
2147
2569
|
|
|
2148
2570
|
/// The caller owns the returned message, if any, which has exactly 1 reference.
|
|
2149
|
-
fn create_message_from_header(self: *Self, header: Header)
|
|
2571
|
+
fn create_message_from_header(self: *Self, header: Header) *Message {
|
|
2150
2572
|
assert(header.replica == self.replica);
|
|
2151
|
-
assert(header.view == self.view or
|
|
2573
|
+
assert(header.view == self.view or
|
|
2574
|
+
header.command == .request_start_view or
|
|
2575
|
+
header.command == .recovery);
|
|
2152
2576
|
assert(header.size == @sizeOf(Header));
|
|
2153
2577
|
|
|
2154
|
-
const message = self.message_bus.pool.
|
|
2578
|
+
const message = self.message_bus.pool.get_message();
|
|
2155
2579
|
defer self.message_bus.unref(message);
|
|
2156
2580
|
|
|
2157
2581
|
message.header.* = header;
|
|
@@ -2175,6 +2599,12 @@ pub fn Replica(
|
|
|
2175
2599
|
/// uncommitted header gaps and compare them with the quorum of do_view_change messages
|
|
2176
2600
|
/// received from other replicas, before starting the new view, to discard any that may be
|
|
2177
2601
|
/// impossible to repair.
|
|
2602
|
+
///
|
|
2603
|
+
/// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
|
|
2604
|
+
/// prepared on another replica before the old primary crashes, then this function finds a
|
|
2605
|
+
/// gap for ops=7,8 and will attempt to discard ops 7,8,9.
|
|
2606
|
+
// TODO To improve availability, potentially call this before the local headers are
|
|
2607
|
+
// repaired during the view change, so that we can participate in nacking headers.
|
|
2178
2608
|
fn discard_uncommitted_headers(self: *Self) void {
|
|
2179
2609
|
assert(self.status == .view_change);
|
|
2180
2610
|
assert(self.leader_index(self.view) == self.replica);
|
|
@@ -2182,6 +2612,7 @@ pub fn Replica(
|
|
|
2182
2612
|
assert(!self.repair_timeout.ticking);
|
|
2183
2613
|
assert(self.op >= self.commit_max);
|
|
2184
2614
|
assert(self.replica_count > 1);
|
|
2615
|
+
assert(self.op - self.commit_max <= config.journal_slot_count);
|
|
2185
2616
|
|
|
2186
2617
|
const threshold = self.replica_count - self.quorum_replication;
|
|
2187
2618
|
if (threshold == 0) {
|
|
@@ -2189,9 +2620,13 @@ pub fn Replica(
|
|
|
2189
2620
|
return;
|
|
2190
2621
|
}
|
|
2191
2622
|
|
|
2623
|
+
// Iterating > commit_max does not in itself guarantee that the header is uncommitted.
|
|
2624
|
+
// We must also count nacks from the quorum, since the old primary may have committed
|
|
2625
|
+
// another op just before crashing, if there was sufficient quorum. Counting nacks
|
|
2626
|
+
// ensures that the old primary could not possibly have committed the header.
|
|
2192
2627
|
var op = self.op;
|
|
2193
2628
|
while (op > self.commit_max) : (op -= 1) {
|
|
2194
|
-
if (self.journal.
|
|
2629
|
+
if (self.journal.header_with_op(op) != null) continue;
|
|
2195
2630
|
|
|
2196
2631
|
log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });
|
|
2197
2632
|
|
|
@@ -2202,14 +2637,30 @@ pub fn Replica(
|
|
|
2202
2637
|
assert(m.header.cluster == self.cluster);
|
|
2203
2638
|
assert(m.header.replica == replica);
|
|
2204
2639
|
assert(m.header.view == self.view);
|
|
2640
|
+
assert(m.header.commit <= self.commit_max);
|
|
2205
2641
|
|
|
2206
2642
|
if (replica != self.replica) {
|
|
2207
|
-
|
|
2208
|
-
|
|
2209
|
-
|
|
2643
|
+
// Check for a gap in the uncommitted headers from this replica.
|
|
2644
|
+
const received_headers = self.message_body_as_headers(m);
|
|
2645
|
+
assert(received_headers.len >= 1);
|
|
2646
|
+
|
|
2647
|
+
const received_op_min = received_headers[received_headers.len - 1].op;
|
|
2648
|
+
const received_op_max = received_headers[0].op;
|
|
2649
|
+
assert(received_op_max >= received_op_min);
|
|
2650
|
+
|
|
2651
|
+
const nack = for (received_headers) |*h| {
|
|
2652
|
+
if (h.op == op) break false;
|
|
2653
|
+
} else nack: {
|
|
2654
|
+
// Don't nack ops that didn't fit in the message's attached headers.
|
|
2655
|
+
break :nack op >= received_op_min;
|
|
2656
|
+
};
|
|
2657
|
+
|
|
2658
|
+
if (nack) nacks += 1;
|
|
2659
|
+
log.debug("{}: discard_uncommitted_headers: replica={} op={} nack={}", .{
|
|
2210
2660
|
self.replica,
|
|
2211
2661
|
m.header.replica,
|
|
2212
|
-
|
|
2662
|
+
op,
|
|
2663
|
+
nack,
|
|
2213
2664
|
});
|
|
2214
2665
|
}
|
|
2215
2666
|
}
|
|
@@ -2223,12 +2674,15 @@ pub fn Replica(
|
|
|
2223
2674
|
});
|
|
2224
2675
|
|
|
2225
2676
|
if (nacks >= threshold) {
|
|
2677
|
+
assert(op > self.commit_max);
|
|
2678
|
+
|
|
2226
2679
|
self.journal.remove_entries_from(op);
|
|
2227
2680
|
self.op = op - 1;
|
|
2228
2681
|
|
|
2229
|
-
|
|
2230
|
-
assert(
|
|
2231
|
-
assert(!self.journal.
|
|
2682
|
+
const slot = self.journal.slot_for_op(op);
|
|
2683
|
+
assert(self.journal.header_for_op(op) == null);
|
|
2684
|
+
assert(!self.journal.dirty.bit(slot));
|
|
2685
|
+
assert(!self.journal.faulty.bit(slot));
|
|
2232
2686
|
}
|
|
2233
2687
|
}
|
|
2234
2688
|
}
|
|
@@ -2243,10 +2697,11 @@ pub fn Replica(
|
|
|
2243
2697
|
|
|
2244
2698
|
assert(self.valid_hash_chain("discard_uncommitted_ops_from"));
|
|
2245
2699
|
|
|
2700
|
+
const slot = self.journal.slot_with_op(op).?;
|
|
2246
2701
|
assert(op > self.commit_max);
|
|
2247
2702
|
assert(op <= self.op);
|
|
2248
|
-
assert(self.journal.
|
|
2249
|
-
assert(self.journal.dirty.bit(
|
|
2703
|
+
assert(self.journal.header_with_op_and_checksum(op, checksum) != null);
|
|
2704
|
+
assert(self.journal.dirty.bit(slot));
|
|
2250
2705
|
|
|
2251
2706
|
log.debug("{}: discard_uncommitted_ops_from: ops={}..{} view={}", .{
|
|
2252
2707
|
self.replica,
|
|
@@ -2258,13 +2713,13 @@ pub fn Replica(
|
|
|
2258
2713
|
self.journal.remove_entries_from(op);
|
|
2259
2714
|
self.op = op - 1;
|
|
2260
2715
|
|
|
2261
|
-
assert(self.journal.
|
|
2262
|
-
assert(!self.journal.dirty.bit(
|
|
2263
|
-
assert(!self.journal.faulty.bit(
|
|
2716
|
+
assert(self.journal.header_for_op(op) == null);
|
|
2717
|
+
assert(!self.journal.dirty.bit(slot));
|
|
2718
|
+
assert(!self.journal.faulty.bit(slot));
|
|
2264
2719
|
|
|
2265
2720
|
// We require that `self.op` always exists. Rewinding `self.op` could change that.
|
|
2266
2721
|
// However, we do this only as the leader within a view change, with all headers intact.
|
|
2267
|
-
assert(self.journal.
|
|
2722
|
+
assert(self.journal.header_with_op(self.op) != null);
|
|
2268
2723
|
}
|
|
2269
2724
|
|
|
2270
2725
|
/// Returns whether the replica is a follower for the current view.
|
|
@@ -2364,7 +2819,7 @@ pub fn Replica(
|
|
|
2364
2819
|
return true;
|
|
2365
2820
|
},
|
|
2366
2821
|
// Only the leader may answer a request for a prepare without a context:
|
|
2367
|
-
.request_prepare => if (message.header.
|
|
2822
|
+
.request_prepare => if (message.header.timestamp == 0) {
|
|
2368
2823
|
log.warn("{}: on_{s}: ignoring (no context)", .{ self.replica, command });
|
|
2369
2824
|
return true;
|
|
2370
2825
|
},
|
|
@@ -2433,6 +2888,18 @@ pub fn Replica(
|
|
|
2433
2888
|
if (self.ignore_request_message_follower(message)) return true;
|
|
2434
2889
|
if (self.ignore_request_message_duplicate(message)) return true;
|
|
2435
2890
|
if (self.ignore_request_message_preparing(message)) return true;
|
|
2891
|
+
|
|
2892
|
+
// Verify that the new request will fit in the WAL.
|
|
2893
|
+
// The message's op hasn't been assigned yet, but it will be `self.op + 1`.
|
|
2894
|
+
if (self.op + 1 >= self.op_checkpoint + config.journal_slot_count) {
|
|
2895
|
+
log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint={})", .{
|
|
2896
|
+
self.replica,
|
|
2897
|
+
message.header.op,
|
|
2898
|
+
self.op_checkpoint,
|
|
2899
|
+
});
|
|
2900
|
+
return true;
|
|
2901
|
+
}
|
|
2902
|
+
|
|
2436
2903
|
return false;
|
|
2437
2904
|
}
|
|
2438
2905
|
|
|
@@ -2485,7 +2952,9 @@ pub fn Replica(
|
|
|
2485
2952
|
return false;
|
|
2486
2953
|
} else {
|
|
2487
2954
|
// The client may have only one request inflight at a time.
|
|
2488
|
-
log.err("{}: on_request: ignoring new request (client bug)", .{
|
|
2955
|
+
log.err("{}: on_request: ignoring new request (client bug)", .{
|
|
2956
|
+
self.replica,
|
|
2957
|
+
});
|
|
2489
2958
|
return true;
|
|
2490
2959
|
}
|
|
2491
2960
|
} else {
|
|
@@ -2636,7 +3105,71 @@ pub fn Replica(
|
|
|
2636
3105
|
return false;
|
|
2637
3106
|
}
|
|
2638
3107
|
|
|
2639
|
-
|
|
3108
|
+
/// Returns whether the highest known op is certain.
|
|
3109
|
+
///
|
|
3110
|
+
/// After recovering the WAL, there are 2 possible outcomes:
|
|
3111
|
+
/// * All entries valid. The highest op is certain, and safe to set as `replica.op`.
|
|
3112
|
+
/// * One or more entries are faulty. The highest op isn't certain — it may be one of the
|
|
3113
|
+
/// broken entries.
|
|
3114
|
+
///
|
|
3115
|
+
/// The replica must refrain from repairing any faulty slots until the highest op is known.
|
|
3116
|
+
/// Otherwise, if we were to repair a slot while uncertain of `replica.op`:
|
|
3117
|
+
///
|
|
3118
|
+
/// * we may nack an op that we shouldn't, or
|
|
3119
|
+
/// * we may replace a prepared op that we were guaranteeing for the primary, potentially
|
|
3120
|
+
/// forking the log.
|
|
3121
|
+
///
|
|
3122
|
+
///
|
|
3123
|
+
/// Test for a fault the right of the current op. The fault might be our true op, and
|
|
3124
|
+
/// sharing our current `replica.op` might cause the cluster's op to likewise regress.
|
|
3125
|
+
///
|
|
3126
|
+
/// Note that for our purposes here, we only care about entries that were faulty during
|
|
3127
|
+
/// WAL recovery, not ones that were found to be faulty after the fact (e.g. due to
|
|
3128
|
+
/// `request_prepare`).
|
|
3129
|
+
///
|
|
3130
|
+
/// Cases (`✓`: `replica.op_checkpoint`, `✗`: faulty, `o`: `replica.op`):
|
|
3131
|
+
/// * ` ✓ o ✗ `: View change is unsafe.
|
|
3132
|
+
/// * ` ✗ ✓ o `: View change is unsafe.
|
|
3133
|
+
/// * ` ✓ ✗ o `: View change is safe.
|
|
3134
|
+
/// * ` ✓ = o `: View change is unsafe if any slots are faulty.
|
|
3135
|
+
/// (`replica.op_checkpoint` == `replica.op`).
|
|
3136
|
+
// TODO Use this function once we switch from recovery protocol to the superblock.
|
|
3137
|
+
// If there is an "unsafe" fault, we will need to request a start_view from the leader to
|
|
3138
|
+
// learn the op.
|
|
3139
|
+
fn op_certain(self: *const Self) bool {
|
|
3140
|
+
assert(self.status == .recovering);
|
|
3141
|
+
assert(self.journal.recovered);
|
|
3142
|
+
assert(self.op_checkpoint <= self.op);
|
|
3143
|
+
|
|
3144
|
+
const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint).index;
|
|
3145
|
+
const slot_op = self.journal.slot_with_op(self.op).?.index;
|
|
3146
|
+
const slot_known_range = vsr.SlotRange{
|
|
3147
|
+
.head = slot_op_checkpoint,
|
|
3148
|
+
.tail = slot_op,
|
|
3149
|
+
};
|
|
3150
|
+
|
|
3151
|
+
var iterator = self.journal.faulty.bits.iterator(.{ .kind = .set });
|
|
3152
|
+
while (iterator.next()) |slot| {
|
|
3153
|
+
// The command is `reserved` when the entry was found faulty during WAL recovery.
|
|
3154
|
+
// Faults found after WAL recovery are not relevant, because we know their op.
|
|
3155
|
+
if (self.journal.headers[slot.index].command == .reserved) {
|
|
3156
|
+
if (slot_op_checkpoint == slot_op or
|
|
3157
|
+
!slot_known_range.contains(slot))
|
|
3158
|
+
{
|
|
3159
|
+
log.warn("{}: op_certain: op not known (faulty_slot={}, op={}, op_checkpoint={})", .{
|
|
3160
|
+
self.replica,
|
|
3161
|
+
slot.index,
|
|
3162
|
+
self.op,
|
|
3163
|
+
self.op_checkpoint,
|
|
3164
|
+
});
|
|
3165
|
+
return false;
|
|
3166
|
+
}
|
|
3167
|
+
}
|
|
3168
|
+
}
|
|
3169
|
+
return true;
|
|
3170
|
+
}
|
|
3171
|
+
|
|
3172
|
+
fn is_repair(self: *const Self, message: *const Message) bool {
|
|
2640
3173
|
assert(message.header.command == .prepare);
|
|
2641
3174
|
|
|
2642
3175
|
if (self.status == .normal) {
|
|
@@ -2668,15 +3201,17 @@ pub fn Replica(
|
|
|
2668
3201
|
assert(self.follower());
|
|
2669
3202
|
assert(header.view == self.view);
|
|
2670
3203
|
assert(header.op > self.op + 1);
|
|
2671
|
-
// We may have learned of a higher `commit_max` through a commit message before jumping
|
|
2672
|
-
// newer op that is less than `commit_max` but greater than `commit_min`:
|
|
3204
|
+
// We may have learned of a higher `commit_max` through a commit message before jumping
|
|
3205
|
+
// to a newer op that is less than `commit_max` but greater than `commit_min`:
|
|
2673
3206
|
assert(header.op > self.commit_min);
|
|
3207
|
+
// Never overwrite an op that still needs to be checkpointed.
|
|
3208
|
+
assert(header.op - self.op_checkpoint < config.journal_slot_count);
|
|
2674
3209
|
|
|
2675
3210
|
log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
|
|
2676
3211
|
self.replica,
|
|
2677
3212
|
self.op,
|
|
2678
3213
|
header.op - 1,
|
|
2679
|
-
self.journal.
|
|
3214
|
+
self.journal.header_with_op(self.op).?.checksum,
|
|
2680
3215
|
header.parent,
|
|
2681
3216
|
});
|
|
2682
3217
|
|
|
@@ -2688,7 +3223,10 @@ pub fn Replica(
|
|
|
2688
3223
|
fn message_body_as_headers(_: *Self, message: *const Message) []Header {
|
|
2689
3224
|
// TODO Assert message commands that we expect this to be called for.
|
|
2690
3225
|
assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
|
|
2691
|
-
return std.mem.bytesAsSlice(
|
|
3226
|
+
return std.mem.bytesAsSlice(
|
|
3227
|
+
Header,
|
|
3228
|
+
message.buffer[@sizeOf(Header)..message.header.size],
|
|
3229
|
+
);
|
|
2692
3230
|
}
|
|
2693
3231
|
|
|
2694
3232
|
/// Panics if immediate neighbors in the same view would have a broken hash chain.
|
|
@@ -2710,6 +3248,29 @@ pub fn Replica(
|
|
|
2710
3248
|
}
|
|
2711
3249
|
}
|
|
2712
3250
|
|
|
3251
|
+
/// Searches the pipeline for a prepare for a given op and checksum.
|
|
3252
|
+
/// When `checksum` is `null`, match any checksum.
|
|
3253
|
+
fn pipeline_prepare_for_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Prepare {
|
|
3254
|
+
assert(self.status == .normal or self.status == .view_change);
|
|
3255
|
+
|
|
3256
|
+
// To optimize the search, we can leverage the fact that the pipeline is ordered and
|
|
3257
|
+
// continuous.
|
|
3258
|
+
if (self.pipeline.count == 0) return null;
|
|
3259
|
+
const head_op = self.pipeline.head_ptr().?.message.header.op;
|
|
3260
|
+
const tail_op = self.pipeline.tail_ptr().?.message.header.op;
|
|
3261
|
+
if (op < head_op) return null;
|
|
3262
|
+
if (op > tail_op) return null;
|
|
3263
|
+
|
|
3264
|
+
const pipeline_prepare = self.pipeline.get_ptr(op - head_op).?;
|
|
3265
|
+
assert(pipeline_prepare.message.header.op == op);
|
|
3266
|
+
|
|
3267
|
+
if (checksum == null or pipeline_prepare.message.header.checksum == checksum.?) {
|
|
3268
|
+
return pipeline_prepare;
|
|
3269
|
+
} else {
|
|
3270
|
+
return null;
|
|
3271
|
+
}
|
|
3272
|
+
}
|
|
3273
|
+
|
|
2713
3274
|
/// Searches the pipeline for a prepare for a given client.
|
|
2714
3275
|
fn pipeline_prepare_for_client(self: *Self, client: u128) ?*Prepare {
|
|
2715
3276
|
assert(self.status == .normal);
|
|
@@ -2717,7 +3278,7 @@ pub fn Replica(
|
|
|
2717
3278
|
assert(self.commit_min == self.commit_max);
|
|
2718
3279
|
|
|
2719
3280
|
var op = self.commit_max + 1;
|
|
2720
|
-
var parent = self.journal.
|
|
3281
|
+
var parent = self.journal.header_with_op(self.commit_max).?.checksum;
|
|
2721
3282
|
var iterator = self.pipeline.iterator();
|
|
2722
3283
|
while (iterator.next_ptr()) |prepare| {
|
|
2723
3284
|
assert(prepare.message.header.command == .prepare);
|
|
@@ -2732,7 +3293,7 @@ pub fn Replica(
|
|
|
2732
3293
|
op += 1;
|
|
2733
3294
|
}
|
|
2734
3295
|
|
|
2735
|
-
assert(self.pipeline.count <= config.
|
|
3296
|
+
assert(self.pipeline.count <= config.pipeline_max);
|
|
2736
3297
|
assert(self.commit_max + self.pipeline.count == op - 1);
|
|
2737
3298
|
assert(self.commit_max + self.pipeline.count == self.op);
|
|
2738
3299
|
|
|
@@ -2770,15 +3331,33 @@ pub fn Replica(
|
|
|
2770
3331
|
assert(prepare.message.header.view <= ok.header.view);
|
|
2771
3332
|
assert(prepare.message.header.op == ok.header.op);
|
|
2772
3333
|
assert(prepare.message.header.commit == ok.header.commit);
|
|
2773
|
-
assert(prepare.message.header.
|
|
3334
|
+
assert(prepare.message.header.timestamp == ok.header.timestamp);
|
|
2774
3335
|
assert(prepare.message.header.operation == ok.header.operation);
|
|
2775
3336
|
|
|
2776
3337
|
return prepare;
|
|
2777
3338
|
}
|
|
2778
3339
|
|
|
3340
|
+
fn recover(self: *Self) void {
|
|
3341
|
+
assert(self.status == .recovering);
|
|
3342
|
+
assert(self.replica_count > 1);
|
|
3343
|
+
assert(self.journal.recovered);
|
|
3344
|
+
|
|
3345
|
+
log.debug("{}: recover: sending recovery messages nonce={}", .{
|
|
3346
|
+
self.replica,
|
|
3347
|
+
self.recovery_nonce,
|
|
3348
|
+
});
|
|
3349
|
+
|
|
3350
|
+
self.send_header_to_other_replicas(.{
|
|
3351
|
+
.command = .recovery,
|
|
3352
|
+
.cluster = self.cluster,
|
|
3353
|
+
.context = self.recovery_nonce,
|
|
3354
|
+
.replica = self.replica,
|
|
3355
|
+
});
|
|
3356
|
+
}
|
|
3357
|
+
|
|
2779
3358
|
/// Starting from the latest journal entry, backfill any missing or disconnected headers.
|
|
2780
|
-
/// A header is disconnected if it breaks the
|
|
2781
|
-
/// Since we work
|
|
3359
|
+
/// A header is disconnected if it breaks the chain with its newer neighbor to the right.
|
|
3360
|
+
/// Since we work back from the latest entry, we should always be able to fix the chain.
|
|
2782
3361
|
/// Once headers are connected, backfill any dirty or faulty prepares.
|
|
2783
3362
|
fn repair(self: *Self) void {
|
|
2784
3363
|
if (!self.repair_timeout.ticking) {
|
|
@@ -2790,38 +3369,50 @@ pub fn Replica(
|
|
|
2790
3369
|
|
|
2791
3370
|
assert(self.status == .normal or self.status == .view_change);
|
|
2792
3371
|
assert(self.repairs_allowed());
|
|
3372
|
+
|
|
3373
|
+
assert(self.op_checkpoint <= self.op);
|
|
3374
|
+
assert(self.op_checkpoint <= self.commit_min);
|
|
2793
3375
|
assert(self.commit_min <= self.op);
|
|
2794
3376
|
assert(self.commit_min <= self.commit_max);
|
|
2795
3377
|
|
|
2796
|
-
|
|
2797
|
-
assert(self.journal.
|
|
2798
|
-
|
|
3378
|
+
assert(self.journal.header_with_op(self.commit_min) != null);
|
|
3379
|
+
assert(self.journal.header_with_op(self.op) != null);
|
|
3380
|
+
|
|
3381
|
+
// The replica repairs backwards from `commit_max`. But if `commit_max` is too high
|
|
3382
|
+
// (>1 WAL ahead), then bound it such that uncommitted WAL entries are not overwritten.
|
|
3383
|
+
const commit_max_limit = std.math.min(
|
|
3384
|
+
self.commit_max,
|
|
3385
|
+
self.op_checkpoint + config.journal_slot_count,
|
|
3386
|
+
);
|
|
2799
3387
|
|
|
2800
3388
|
// Request outstanding committed prepares to advance our op number:
|
|
2801
3389
|
// This handles the case of an idle cluster, where a follower will not otherwise advance.
|
|
2802
3390
|
// This is not required for correctness, but for durability.
|
|
2803
|
-
if (self.op <
|
|
3391
|
+
if (self.op < commit_max_limit) {
|
|
2804
3392
|
// If the leader repairs during a view change, it will have already advanced
|
|
2805
3393
|
// `self.op` to the latest op according to the quorum of `do_view_change` messages
|
|
2806
3394
|
// received, so we must therefore be a follower in normal status:
|
|
2807
3395
|
assert(self.status == .normal);
|
|
2808
3396
|
assert(self.follower());
|
|
2809
|
-
log.debug("{}: repair: op={} < commit_max={}", .{
|
|
3397
|
+
log.debug("{}: repair: op={} < commit_max_limit={}, commit_max={}", .{
|
|
2810
3398
|
self.replica,
|
|
2811
3399
|
self.op,
|
|
3400
|
+
commit_max_limit,
|
|
2812
3401
|
self.commit_max,
|
|
2813
3402
|
});
|
|
2814
3403
|
// We need to advance our op number and therefore have to `request_prepare`,
|
|
2815
3404
|
// since only `on_prepare()` can do this, not `repair_header()` in `on_headers()`.
|
|
2816
3405
|
self.send_header_to_replica(self.leader_index(self.view), .{
|
|
2817
3406
|
.command = .request_prepare,
|
|
2818
|
-
// We cannot yet know the checksum of the prepare so we set the context
|
|
2819
|
-
// Context is optional when requesting from the leader but
|
|
3407
|
+
// We cannot yet know the checksum of the prepare so we set the context and
|
|
3408
|
+
// timestamp to 0: Context is optional when requesting from the leader but
|
|
3409
|
+
// required otherwise.
|
|
2820
3410
|
.context = 0,
|
|
3411
|
+
.timestamp = 0,
|
|
2821
3412
|
.cluster = self.cluster,
|
|
2822
3413
|
.replica = self.replica,
|
|
2823
3414
|
.view = self.view,
|
|
2824
|
-
.op =
|
|
3415
|
+
.op = commit_max_limit,
|
|
2825
3416
|
});
|
|
2826
3417
|
return;
|
|
2827
3418
|
}
|
|
@@ -2842,9 +3433,10 @@ pub fn Replica(
|
|
|
2842
3433
|
assert(range.op_min > self.commit_min);
|
|
2843
3434
|
assert(range.op_max < self.op);
|
|
2844
3435
|
// A range of `op_min=0` or `op_max=0` should be impossible as a header break:
|
|
2845
|
-
// This is the
|
|
3436
|
+
// This is the root op that is prepared when the cluster is initialized.
|
|
2846
3437
|
assert(range.op_min > 0);
|
|
2847
3438
|
assert(range.op_max > 0);
|
|
3439
|
+
|
|
2848
3440
|
if (self.choose_any_other_replica()) |replica| {
|
|
2849
3441
|
self.send_header_to_replica(replica, .{
|
|
2850
3442
|
.command = .request_headers,
|
|
@@ -2863,10 +3455,14 @@ pub fn Replica(
|
|
|
2863
3455
|
assert(self.valid_hash_chain_between(self.commit_min, self.op));
|
|
2864
3456
|
|
|
2865
3457
|
// Request and repair any dirty or faulty prepares:
|
|
2866
|
-
if (self.journal.dirty.
|
|
3458
|
+
if (self.journal.dirty.count > 0) return self.repair_prepares();
|
|
2867
3459
|
|
|
2868
3460
|
// Commit ops, which may in turn discover faulty prepares and drive more repairs:
|
|
2869
|
-
if (self.commit_min < self.commit_max)
|
|
3461
|
+
if (self.commit_min < self.commit_max) {
|
|
3462
|
+
assert(self.replica_count > 1);
|
|
3463
|
+
self.commit_ops(self.commit_max);
|
|
3464
|
+
return;
|
|
3465
|
+
}
|
|
2870
3466
|
|
|
2871
3467
|
if (self.status == .view_change and self.leader_index(self.view) == self.replica) {
|
|
2872
3468
|
if (self.repair_pipeline_op() != null) return self.repair_pipeline();
|
|
@@ -2921,10 +3517,13 @@ pub fn Replica(
|
|
|
2921
3517
|
}
|
|
2922
3518
|
|
|
2923
3519
|
if (header.op > self.op) {
|
|
2924
|
-
log.debug("{}: repair_header: false (advances self.op)", .{
|
|
3520
|
+
log.debug("{}: repair_header: false (advances self.op={})", .{
|
|
3521
|
+
self.replica,
|
|
3522
|
+
self.op,
|
|
3523
|
+
});
|
|
2925
3524
|
return false;
|
|
2926
3525
|
} else if (header.op == self.op) {
|
|
2927
|
-
if (self.journal.
|
|
3526
|
+
if (self.journal.header_with_op_and_checksum(self.op, header.checksum)) |_| {
|
|
2928
3527
|
// Fall through below to check if self.op is uncommitted AND reordered,
|
|
2929
3528
|
// which we would see by the presence of an earlier op with higher view number,
|
|
2930
3529
|
// that breaks the chain with self.op. In this case, we must skip the repair to
|
|
@@ -2938,27 +3537,42 @@ pub fn Replica(
|
|
|
2938
3537
|
}
|
|
2939
3538
|
}
|
|
2940
3539
|
|
|
2941
|
-
if (self.journal.
|
|
3540
|
+
if (self.journal.header_for_entry(header)) |existing| {
|
|
3541
|
+
assert(existing.op == header.op);
|
|
3542
|
+
|
|
2942
3543
|
// Do not replace any existing op lightly as doing so may impair durability and even
|
|
2943
3544
|
// violate correctness by undoing a prepare already acknowledged to the leader:
|
|
2944
3545
|
if (existing.checksum == header.checksum) {
|
|
2945
|
-
|
|
2946
|
-
|
|
3546
|
+
const slot = self.journal.slot_with_header(header).?;
|
|
3547
|
+
if (!self.journal.dirty.bit(slot)) {
|
|
3548
|
+
log.debug("{}: repair_header: op={} false (checksum clean)", .{
|
|
3549
|
+
self.replica,
|
|
3550
|
+
header.op,
|
|
3551
|
+
});
|
|
2947
3552
|
return false;
|
|
2948
3553
|
}
|
|
2949
3554
|
|
|
2950
|
-
log.debug("{}: repair_header: exists, checksum dirty", .{
|
|
3555
|
+
log.debug("{}: repair_header: op={} exists, checksum dirty", .{
|
|
3556
|
+
self.replica,
|
|
3557
|
+
header.op,
|
|
3558
|
+
});
|
|
2951
3559
|
} else if (existing.view == header.view) {
|
|
2952
3560
|
// The journal must have wrapped:
|
|
2953
3561
|
// We expect that the same view and op will have the same checksum.
|
|
2954
3562
|
assert(existing.op != header.op);
|
|
2955
3563
|
|
|
2956
3564
|
if (existing.op > header.op) {
|
|
2957
|
-
log.debug("{}: repair_header: false (view has newer op)", .{
|
|
3565
|
+
log.debug("{}: repair_header: op={} false (view has newer op)", .{
|
|
3566
|
+
self.replica,
|
|
3567
|
+
header.op,
|
|
3568
|
+
});
|
|
2958
3569
|
return false;
|
|
2959
3570
|
}
|
|
2960
3571
|
|
|
2961
|
-
log.debug("{}: repair_header: exists, view has older op", .{
|
|
3572
|
+
log.debug("{}: repair_header: op={} exists, view has older op", .{
|
|
3573
|
+
self.replica,
|
|
3574
|
+
header.op,
|
|
3575
|
+
});
|
|
2962
3576
|
} else {
|
|
2963
3577
|
assert(existing.view != header.view);
|
|
2964
3578
|
assert(existing.op == header.op or existing.op != header.op);
|
|
@@ -2966,38 +3580,37 @@ pub fn Replica(
|
|
|
2966
3580
|
if (!self.repair_header_would_connect_hash_chain(header)) {
|
|
2967
3581
|
// We cannot replace this op until we are sure that doing so would not
|
|
2968
3582
|
// violate any prior commitments made to the leader.
|
|
2969
|
-
log.debug("{}: repair_header: false (exists)", .{
|
|
3583
|
+
log.debug("{}: repair_header: op={} false (exists)", .{
|
|
3584
|
+
self.replica,
|
|
3585
|
+
header.op,
|
|
3586
|
+
});
|
|
2970
3587
|
return false;
|
|
2971
3588
|
}
|
|
2972
3589
|
|
|
2973
|
-
log.debug("{}: repair_header: exists, connects hash chain", .{
|
|
3590
|
+
log.debug("{}: repair_header: op={} exists, connects hash chain", .{
|
|
3591
|
+
self.replica,
|
|
3592
|
+
header.op,
|
|
3593
|
+
});
|
|
2974
3594
|
}
|
|
2975
3595
|
} else {
|
|
2976
|
-
log.debug("{}: repair_header: gap", .{self.replica});
|
|
3596
|
+
log.debug("{}: repair_header: op={} gap", .{ self.replica, header.op });
|
|
2977
3597
|
}
|
|
2978
3598
|
|
|
2979
3599
|
// Caveat: Do not repair an existing op or gap if doing so would break the hash chain:
|
|
2980
3600
|
if (self.repair_header_would_break_hash_chain_with_next_entry(header)) {
|
|
2981
|
-
log.debug("{}: repair_header: false (breaks hash chain)", .{
|
|
3601
|
+
log.debug("{}: repair_header: op={} false (breaks hash chain)", .{
|
|
3602
|
+
self.replica,
|
|
3603
|
+
header.op,
|
|
3604
|
+
});
|
|
2982
3605
|
return false;
|
|
2983
3606
|
}
|
|
2984
3607
|
|
|
2985
|
-
// Caveat: Do not repair an existing op or gap if doing so would overlap another:
|
|
2986
|
-
if (self.repair_header_would_overlap_another(header)) {
|
|
2987
|
-
if (!self.repair_header_would_connect_hash_chain(header)) {
|
|
2988
|
-
log.debug("{}: repair_header: false (overlap)", .{self.replica});
|
|
2989
|
-
return false;
|
|
2990
|
-
}
|
|
2991
|
-
// We may have to overlap previous entries in order to connect the hash chain:
|
|
2992
|
-
log.debug("{}: repair_header: overlap, connects hash chain", .{self.replica});
|
|
2993
|
-
}
|
|
2994
|
-
|
|
2995
3608
|
// TODO Snapshots: Skip if this header is already snapshotted.
|
|
2996
3609
|
|
|
2997
3610
|
assert(header.op < self.op or
|
|
2998
|
-
self.journal.
|
|
3611
|
+
self.journal.header_with_op(self.op).?.checksum == header.checksum);
|
|
2999
3612
|
|
|
3000
|
-
self.journal.
|
|
3613
|
+
self.journal.set_header_as_dirty(header);
|
|
3001
3614
|
return true;
|
|
3002
3615
|
}
|
|
3003
3616
|
|
|
@@ -3018,10 +3631,12 @@ pub fn Replica(
|
|
|
3018
3631
|
if (header.checksum == next.parent) {
|
|
3019
3632
|
assert(header.view <= next.view);
|
|
3020
3633
|
assert(header.op + 1 == next.op);
|
|
3021
|
-
// We don't break with `next` but this is no guarantee that `next` does not
|
|
3634
|
+
// We don't break with `next` but this is no guarantee that `next` does not
|
|
3635
|
+
// break.
|
|
3022
3636
|
return false;
|
|
3023
3637
|
} else {
|
|
3024
|
-
// If the journal has wrapped, then err in favor of a break regardless of op
|
|
3638
|
+
// If the journal has wrapped, then err in favor of a break regardless of op
|
|
3639
|
+
// order:
|
|
3025
3640
|
return true;
|
|
3026
3641
|
}
|
|
3027
3642
|
}
|
|
@@ -3030,14 +3645,17 @@ pub fn Replica(
|
|
|
3030
3645
|
return false;
|
|
3031
3646
|
}
|
|
3032
3647
|
|
|
3033
|
-
/// If we repair this header, then would this connect the hash chain through to the latest
|
|
3034
|
-
/// This offers a strong guarantee that may be used to replace or overlap an existing
|
|
3648
|
+
/// If we repair this header, then would this connect the hash chain through to the latest
|
|
3649
|
+
/// op? This offers a strong guarantee that may be used to replace or overlap an existing
|
|
3650
|
+
/// op.
|
|
3035
3651
|
///
|
|
3036
3652
|
/// Here is an example of what could go wrong if we did not check for complete connection:
|
|
3037
3653
|
///
|
|
3038
3654
|
/// 1. We do a prepare that's going to be committed.
|
|
3039
|
-
/// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the
|
|
3040
|
-
///
|
|
3655
|
+
/// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the
|
|
3656
|
+
/// left.
|
|
3657
|
+
/// 3. We do another stale prepare that replaces the first op because it connects to the
|
|
3658
|
+
/// second.
|
|
3041
3659
|
///
|
|
3042
3660
|
/// This would violate our quorum replication commitment to the leader.
|
|
3043
3661
|
/// The mistake in this example was not that we ignored the break to the left, which we must
|
|
@@ -3060,43 +3678,16 @@ pub fn Replica(
|
|
|
3060
3678
|
}
|
|
3061
3679
|
|
|
3062
3680
|
assert(entry.op == self.op);
|
|
3063
|
-
assert(entry.checksum == self.journal.
|
|
3681
|
+
assert(entry.checksum == self.journal.header_with_op(self.op).?.checksum);
|
|
3064
3682
|
return true;
|
|
3065
3683
|
}
|
|
3066
3684
|
|
|
3067
|
-
/// If we repair this header, then would this overlap and overwrite part of another batch?
|
|
3068
|
-
/// Journal entries have variable-sized batches that may overlap if entries are disconnected.
|
|
3069
|
-
fn repair_header_would_overlap_another(self: *Self, header: *const Header) bool {
|
|
3070
|
-
// TODO Snapshots: Handle journal wrap around.
|
|
3071
|
-
{
|
|
3072
|
-
// Look behind this entry for any preceeding entry that this would overlap:
|
|
3073
|
-
var op: u64 = header.op;
|
|
3074
|
-
while (op > 0) {
|
|
3075
|
-
op -= 1;
|
|
3076
|
-
if (self.journal.entry_for_op(op)) |neighbor| {
|
|
3077
|
-
if (Journal.next_offset(neighbor) > header.offset) return true;
|
|
3078
|
-
break;
|
|
3079
|
-
}
|
|
3080
|
-
}
|
|
3081
|
-
}
|
|
3082
|
-
{
|
|
3083
|
-
// Look beyond this entry for any succeeding entry that this would overlap:
|
|
3084
|
-
var op: u64 = header.op + 1;
|
|
3085
|
-
while (op <= self.op) : (op += 1) {
|
|
3086
|
-
if (self.journal.entry_for_op(op)) |neighbor| {
|
|
3087
|
-
if (Journal.next_offset(header) > neighbor.offset) return true;
|
|
3088
|
-
break;
|
|
3089
|
-
}
|
|
3090
|
-
}
|
|
3091
|
-
}
|
|
3092
|
-
return false;
|
|
3093
|
-
}
|
|
3094
|
-
|
|
3095
3685
|
/// Reads prepares into the pipeline (before we start the view as the new leader).
|
|
3096
3686
|
fn repair_pipeline(self: *Self) void {
|
|
3097
3687
|
assert(self.status == .view_change);
|
|
3098
3688
|
assert(self.leader_index(self.view) == self.replica);
|
|
3099
3689
|
assert(self.commit_max < self.op);
|
|
3690
|
+
assert(self.journal.dirty.count == 0);
|
|
3100
3691
|
|
|
3101
3692
|
if (self.repairing_pipeline) {
|
|
3102
3693
|
log.debug("{}: repair_pipeline: already repairing...", .{self.replica});
|
|
@@ -3111,11 +3702,57 @@ pub fn Replica(
|
|
|
3111
3702
|
self.repair_pipeline_read();
|
|
3112
3703
|
}
|
|
3113
3704
|
|
|
3705
|
+
/// Discard messages from the prepare pipeline.
|
|
3706
|
+
/// Retain uncommitted messages that belong in the current view to maximize durability.
|
|
3707
|
+
fn repair_pipeline_diff(self: *Self) void {
|
|
3708
|
+
assert(self.status == .view_change);
|
|
3709
|
+
assert(self.leader_index(self.view) == self.replica);
|
|
3710
|
+
|
|
3711
|
+
// Discard messages from the front of the pipeline that committed since we were leader.
|
|
3712
|
+
while (self.pipeline.head_ptr()) |prepare| {
|
|
3713
|
+
if (prepare.message.header.op > self.commit_max) break;
|
|
3714
|
+
|
|
3715
|
+
self.message_bus.unref(self.pipeline.pop().?.message);
|
|
3716
|
+
}
|
|
3717
|
+
|
|
3718
|
+
// Discard the whole pipeline if it is now disconnected from the WAL's hash chain.
|
|
3719
|
+
if (self.pipeline.head_ptr()) |pipeline_head| {
|
|
3720
|
+
const parent = self.journal.header_with_op_and_checksum(
|
|
3721
|
+
pipeline_head.message.header.op - 1,
|
|
3722
|
+
pipeline_head.message.header.parent,
|
|
3723
|
+
);
|
|
3724
|
+
if (parent == null) {
|
|
3725
|
+
while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
|
|
3726
|
+
assert(self.pipeline.count == 0);
|
|
3727
|
+
}
|
|
3728
|
+
}
|
|
3729
|
+
|
|
3730
|
+
// Discard messages from the back of the pipeline that are not part of this view.
|
|
3731
|
+
while (self.pipeline.tail_ptr()) |prepare| {
|
|
3732
|
+
if (self.journal.has(prepare.message.header)) break;
|
|
3733
|
+
|
|
3734
|
+
self.message_bus.unref(self.pipeline.pop_tail().?.message);
|
|
3735
|
+
}
|
|
3736
|
+
|
|
3737
|
+
log.debug("{}: repair_pipeline_diff: {} prepare(s)", .{
|
|
3738
|
+
self.replica,
|
|
3739
|
+
self.pipeline.count,
|
|
3740
|
+
});
|
|
3741
|
+
|
|
3742
|
+
self.verify_pipeline();
|
|
3743
|
+
|
|
3744
|
+
// Do not reset `repairing_pipeline` here as this must be reset by the read callback.
|
|
3745
|
+
// Otherwise, we would be making `repair_pipeline()` reentrant.
|
|
3746
|
+
}
|
|
3747
|
+
|
|
3114
3748
|
/// Returns the next `op` number that needs to be read into the pipeline.
|
|
3115
3749
|
fn repair_pipeline_op(self: *Self) ?u64 {
|
|
3116
3750
|
assert(self.status == .view_change);
|
|
3117
3751
|
assert(self.leader_index(self.view) == self.replica);
|
|
3118
3752
|
|
|
3753
|
+
// We cannot rely on `pipeline.count` below unless the pipeline has first been diffed.
|
|
3754
|
+
self.repair_pipeline_diff();
|
|
3755
|
+
|
|
3119
3756
|
const op = self.commit_max + self.pipeline.count + 1;
|
|
3120
3757
|
if (op <= self.op) return op;
|
|
3121
3758
|
|
|
@@ -3133,7 +3770,7 @@ pub fn Replica(
|
|
|
3133
3770
|
assert(op <= self.op);
|
|
3134
3771
|
assert(self.commit_max + self.pipeline.count + 1 == op);
|
|
3135
3772
|
|
|
3136
|
-
const checksum = self.journal.
|
|
3773
|
+
const checksum = self.journal.header_with_op(op).?.checksum;
|
|
3137
3774
|
|
|
3138
3775
|
log.debug("{}: repair_pipeline_read: op={} checksum={}", .{
|
|
3139
3776
|
self.replica,
|
|
@@ -3192,7 +3829,7 @@ pub fn Replica(
|
|
|
3192
3829
|
return;
|
|
3193
3830
|
}
|
|
3194
3831
|
|
|
3195
|
-
if (prepare.?.header.checksum != self.journal.
|
|
3832
|
+
if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
|
|
3196
3833
|
log.debug("{}: repair_pipeline_push: checksum changed", .{self.replica});
|
|
3197
3834
|
return;
|
|
3198
3835
|
}
|
|
@@ -3206,7 +3843,11 @@ pub fn Replica(
|
|
|
3206
3843
|
prepare.?.header.checksum,
|
|
3207
3844
|
});
|
|
3208
3845
|
|
|
3209
|
-
self.pipeline.
|
|
3846
|
+
if (self.pipeline.tail_ptr()) |parent| {
|
|
3847
|
+
assert(prepare.?.header.parent == parent.message.header.checksum);
|
|
3848
|
+
}
|
|
3849
|
+
|
|
3850
|
+
self.pipeline.push_assume_capacity(.{ .message = prepare.?.ref() });
|
|
3210
3851
|
assert(self.pipeline.count >= 1);
|
|
3211
3852
|
|
|
3212
3853
|
self.repairing_pipeline = true;
|
|
@@ -3216,7 +3857,7 @@ pub fn Replica(
|
|
|
3216
3857
|
fn repair_prepares(self: *Self) void {
|
|
3217
3858
|
assert(self.status == .normal or self.status == .view_change);
|
|
3218
3859
|
assert(self.repairs_allowed());
|
|
3219
|
-
assert(self.journal.dirty.
|
|
3860
|
+
assert(self.journal.dirty.count > 0);
|
|
3220
3861
|
|
|
3221
3862
|
// Request enough prepares to utilize our max IO depth:
|
|
3222
3863
|
var budget = self.journal.writes.available();
|
|
@@ -3225,11 +3866,34 @@ pub fn Replica(
|
|
|
3225
3866
|
return;
|
|
3226
3867
|
}
|
|
3227
3868
|
|
|
3869
|
+
if (self.op < config.journal_slot_count) {
|
|
3870
|
+
// The op is known, and this is the first WAL cycle.
|
|
3871
|
+
// Therefore, any faulty ops to the right of `replica.op` are corrupt reserved
|
|
3872
|
+
// entries from the initial format.
|
|
3873
|
+
var op: usize = self.op + 1;
|
|
3874
|
+
while (op < config.journal_slot_count) : (op += 1) {
|
|
3875
|
+
const slot = self.journal.slot_for_op(op);
|
|
3876
|
+
assert(slot.index == op);
|
|
3877
|
+
|
|
3878
|
+
if (self.journal.faulty.bit(slot)) {
|
|
3879
|
+
assert(self.journal.headers[op].command == .reserved);
|
|
3880
|
+
self.journal.dirty.clear(slot);
|
|
3881
|
+
self.journal.faulty.clear(slot);
|
|
3882
|
+
log.debug("{}: repair_prepares: op={} (op known, first cycle)", .{
|
|
3883
|
+
self.replica,
|
|
3884
|
+
op,
|
|
3885
|
+
});
|
|
3886
|
+
}
|
|
3887
|
+
}
|
|
3888
|
+
}
|
|
3889
|
+
|
|
3228
3890
|
var op = self.op + 1;
|
|
3229
|
-
|
|
3891
|
+
const op_min = op -| config.journal_slot_count;
|
|
3892
|
+
while (op > op_min) {
|
|
3230
3893
|
op -= 1;
|
|
3231
3894
|
|
|
3232
|
-
|
|
3895
|
+
const slot = self.journal.slot_for_op(op);
|
|
3896
|
+
if (self.journal.dirty.bit(slot)) {
|
|
3233
3897
|
// If this is an uncommitted op, and we are the leader in `view_change` status,
|
|
3234
3898
|
// then we will `request_prepare` from the cluster, set `nack_prepare_op`,
|
|
3235
3899
|
// and stop repairing any further prepares:
|
|
@@ -3251,7 +3915,7 @@ pub fn Replica(
|
|
|
3251
3915
|
}
|
|
3252
3916
|
}
|
|
3253
3917
|
} else {
|
|
3254
|
-
assert(!self.journal.faulty.bit(
|
|
3918
|
+
assert(!self.journal.faulty.bit(slot));
|
|
3255
3919
|
}
|
|
3256
3920
|
}
|
|
3257
3921
|
}
|
|
@@ -3273,16 +3937,17 @@ pub fn Replica(
|
|
|
3273
3937
|
/// This is effectively "many-to-one" repair, where a single replica recovers using the
|
|
3274
3938
|
/// resources of many replicas, for faster recovery.
|
|
3275
3939
|
fn repair_prepare(self: *Self, op: u64) bool {
|
|
3940
|
+
const slot = self.journal.slot_with_op(op).?;
|
|
3941
|
+
const checksum = self.journal.header_with_op(op).?.checksum;
|
|
3942
|
+
|
|
3276
3943
|
assert(self.status == .normal or self.status == .view_change);
|
|
3277
3944
|
assert(self.repairs_allowed());
|
|
3278
|
-
assert(self.journal.dirty.bit(
|
|
3279
|
-
|
|
3280
|
-
const checksum = self.journal.entry_for_op_exact(op).?.checksum;
|
|
3945
|
+
assert(self.journal.dirty.bit(slot));
|
|
3281
3946
|
|
|
3282
3947
|
// We may be appending to or repairing the journal concurrently.
|
|
3283
3948
|
// We do not want to re-request any of these prepares unnecessarily.
|
|
3284
3949
|
if (self.journal.writing(op, checksum)) {
|
|
3285
|
-
log.debug("{}: repair_prepare:
|
|
3950
|
+
log.debug("{}: repair_prepare: op={} checksum={} (already writing)", .{
|
|
3286
3951
|
self.replica,
|
|
3287
3952
|
op,
|
|
3288
3953
|
checksum,
|
|
@@ -3290,11 +3955,46 @@ pub fn Replica(
|
|
|
3290
3955
|
return false;
|
|
3291
3956
|
}
|
|
3292
3957
|
|
|
3958
|
+
// The message may be available in the local pipeline.
|
|
3959
|
+
// For example (replica_count=3):
|
|
3960
|
+
// 1. View=1: Replica 1 is leader, and prepares op 5. The local write fails.
|
|
3961
|
+
// 2. Time passes. The view changes (e.g. due to a timeout)…
|
|
3962
|
+
// 3. View=4: Replica 1 is leader again, and is repairing op 5
|
|
3963
|
+
// (which is still in the pipeline).
|
|
3964
|
+
//
|
|
3965
|
+
// Using the pipeline to repair is faster than a `request_prepare`.
|
|
3966
|
+
// Also, messages in the pipeline are never corrupt.
|
|
3967
|
+
if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
|
|
3968
|
+
assert(prepare.message.header.op == op);
|
|
3969
|
+
assert(prepare.message.header.checksum == checksum);
|
|
3970
|
+
|
|
3971
|
+
if (self.replica_count == 1) {
|
|
3972
|
+
// This op won't start writing until all ops in the pipeline preceding it have
|
|
3973
|
+
// been written.
|
|
3974
|
+
log.debug("{}: repair_prepare: op={} checksum={} (serializing append)", .{
|
|
3975
|
+
self.replica,
|
|
3976
|
+
op,
|
|
3977
|
+
checksum,
|
|
3978
|
+
});
|
|
3979
|
+
assert(op > self.pipeline.head_ptr().?.message.header.op);
|
|
3980
|
+
return false;
|
|
3981
|
+
}
|
|
3982
|
+
|
|
3983
|
+
log.debug("{}: repair_prepare: op={} checksum={} (from pipeline)", .{
|
|
3984
|
+
self.replica,
|
|
3985
|
+
op,
|
|
3986
|
+
checksum,
|
|
3987
|
+
});
|
|
3988
|
+
self.write_prepare(prepare.message, .pipeline);
|
|
3989
|
+
return true;
|
|
3990
|
+
}
|
|
3991
|
+
|
|
3293
3992
|
const request_prepare = Header{
|
|
3294
3993
|
.command = .request_prepare,
|
|
3295
|
-
// If we request a prepare from a follower, as below, it is critical to pass a
|
|
3296
|
-
// Otherwise we could receive different prepares for the same op number.
|
|
3994
|
+
// If we request a prepare from a follower, as below, it is critical to pass a
|
|
3995
|
+
// checksum: Otherwise we could receive different prepares for the same op number.
|
|
3297
3996
|
.context = checksum,
|
|
3997
|
+
.timestamp = 1, // The checksum is included in context.
|
|
3298
3998
|
.cluster = self.cluster,
|
|
3299
3999
|
.replica = self.replica,
|
|
3300
4000
|
.view = self.view,
|
|
@@ -3305,7 +4005,7 @@ pub fn Replica(
|
|
|
3305
4005
|
// Only the leader is allowed to do repairs in a view change:
|
|
3306
4006
|
assert(self.leader_index(self.view) == self.replica);
|
|
3307
4007
|
|
|
3308
|
-
const reason = if (self.journal.faulty.bit(
|
|
4008
|
+
const reason = if (self.journal.faulty.bit(slot)) "faulty" else "dirty";
|
|
3309
4009
|
log.debug(
|
|
3310
4010
|
"{}: repair_prepare: op={} checksum={} (uncommitted, {s}, view_change)",
|
|
3311
4011
|
.{
|
|
@@ -3316,7 +4016,7 @@ pub fn Replica(
|
|
|
3316
4016
|
},
|
|
3317
4017
|
);
|
|
3318
4018
|
|
|
3319
|
-
if (self.replica_count == 2 and !self.journal.faulty.bit(
|
|
4019
|
+
if (self.replica_count == 2 and !self.journal.faulty.bit(slot)) {
|
|
3320
4020
|
// This is required to avoid a liveness issue for a cluster-of-two where a new
|
|
3321
4021
|
// leader learns of an op during a view change but where the op is faulty on
|
|
3322
4022
|
// the old leader. We must immediately roll back the op since it could not have
|
|
@@ -3336,17 +4036,11 @@ pub fn Replica(
|
|
|
3336
4036
|
assert(nack_prepare_op <= op);
|
|
3337
4037
|
if (nack_prepare_op != op) {
|
|
3338
4038
|
self.nack_prepare_op = op;
|
|
3339
|
-
self.
|
|
3340
|
-
&self.nack_prepare_from_other_replicas,
|
|
3341
|
-
.nack_prepare,
|
|
3342
|
-
);
|
|
4039
|
+
self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
|
|
3343
4040
|
}
|
|
3344
4041
|
} else {
|
|
3345
4042
|
self.nack_prepare_op = op;
|
|
3346
|
-
self.
|
|
3347
|
-
&self.nack_prepare_from_other_replicas,
|
|
3348
|
-
.nack_prepare,
|
|
3349
|
-
);
|
|
4043
|
+
self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
|
|
3350
4044
|
}
|
|
3351
4045
|
|
|
3352
4046
|
assert(self.nack_prepare_op.? == op);
|
|
@@ -3354,7 +4048,7 @@ pub fn Replica(
|
|
|
3354
4048
|
self.send_header_to_other_replicas(request_prepare);
|
|
3355
4049
|
} else {
|
|
3356
4050
|
const nature = if (op > self.commit_max) "uncommitted" else "committed";
|
|
3357
|
-
const reason = if (self.journal.faulty.bit(
|
|
4051
|
+
const reason = if (self.journal.faulty.bit(slot)) "faulty" else "dirty";
|
|
3358
4052
|
log.debug("{}: repair_prepare: op={} checksum={} ({s}, {s})", .{
|
|
3359
4053
|
self.replica,
|
|
3360
4054
|
op,
|
|
@@ -3417,22 +4111,6 @@ pub fn Replica(
|
|
|
3417
4111
|
self.send_message_to_replica(next, message);
|
|
3418
4112
|
}
|
|
3419
4113
|
|
|
3420
|
-
/// Empties the prepare pipeline, unreffing all prepare and prepare_ok messages.
|
|
3421
|
-
/// Stops the prepare timeout and resets the timeouts counter.
|
|
3422
|
-
fn reset_pipeline(self: *Self) void {
|
|
3423
|
-
while (self.pipeline.pop()) |prepare| {
|
|
3424
|
-
self.unref_prepare_message_and_quorum_messages(&prepare);
|
|
3425
|
-
}
|
|
3426
|
-
|
|
3427
|
-
self.prepare_timeout.stop();
|
|
3428
|
-
|
|
3429
|
-
assert(self.pipeline.count == 0);
|
|
3430
|
-
assert(self.prepare_timeout.ticking == false);
|
|
3431
|
-
|
|
3432
|
-
// Do not reset `repairing_pipeline` here as this must be reset by the read callback.
|
|
3433
|
-
// Otherwise, we would be making `repair_pipeline()` reentrant.
|
|
3434
|
-
}
|
|
3435
|
-
|
|
3436
4114
|
fn reset_quorum_messages(self: *Self, messages: *QuorumMessages, command: Command) void {
|
|
3437
4115
|
assert(messages.len == config.replicas_max);
|
|
3438
4116
|
var view: ?u32 = null;
|
|
@@ -3457,7 +4135,27 @@ pub fn Replica(
|
|
|
3457
4135
|
received.* = null;
|
|
3458
4136
|
}
|
|
3459
4137
|
assert(count <= self.replica_count);
|
|
3460
|
-
log.debug("{}: reset {} {s} message(s)", .{
|
|
4138
|
+
log.debug("{}: reset {} {s} message(s) from view={}", .{
|
|
4139
|
+
self.replica,
|
|
4140
|
+
count,
|
|
4141
|
+
@tagName(command),
|
|
4142
|
+
view,
|
|
4143
|
+
});
|
|
4144
|
+
}
|
|
4145
|
+
|
|
4146
|
+
fn reset_quorum_counter(self: *Self, counter: *QuorumCounter) void {
|
|
4147
|
+
var counter_iterator = counter.iterator(.{});
|
|
4148
|
+
while (counter_iterator.next()) |replica| {
|
|
4149
|
+
assert(replica < self.replica_count);
|
|
4150
|
+
}
|
|
4151
|
+
|
|
4152
|
+
counter.setIntersection(quorum_counter_null);
|
|
4153
|
+
assert(counter.count() == 0);
|
|
4154
|
+
|
|
4155
|
+
var replica: usize = 0;
|
|
4156
|
+
while (replica < self.replica_count) : (replica += 1) {
|
|
4157
|
+
assert(!counter.isSet(replica));
|
|
4158
|
+
}
|
|
3461
4159
|
}
|
|
3462
4160
|
|
|
3463
4161
|
fn reset_quorum_do_view_change(self: *Self) void {
|
|
@@ -3466,15 +4164,25 @@ pub fn Replica(
|
|
|
3466
4164
|
}
|
|
3467
4165
|
|
|
3468
4166
|
fn reset_quorum_nack_prepare(self: *Self) void {
|
|
3469
|
-
self.
|
|
4167
|
+
self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
|
|
3470
4168
|
self.nack_prepare_op = null;
|
|
3471
4169
|
}
|
|
3472
4170
|
|
|
3473
4171
|
fn reset_quorum_start_view_change(self: *Self) void {
|
|
3474
|
-
self.
|
|
4172
|
+
self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
|
|
3475
4173
|
self.start_view_change_quorum = false;
|
|
3476
4174
|
}
|
|
3477
4175
|
|
|
4176
|
+
fn reset_quorum_recovery_response(self: *Self) void {
|
|
4177
|
+
for (self.recovery_response_from_other_replicas) |*received, replica| {
|
|
4178
|
+
if (received.*) |message| {
|
|
4179
|
+
assert(replica != self.replica);
|
|
4180
|
+
self.message_bus.unref(message);
|
|
4181
|
+
received.* = null;
|
|
4182
|
+
}
|
|
4183
|
+
}
|
|
4184
|
+
}
|
|
4185
|
+
|
|
3478
4186
|
fn send_prepare_ok(self: *Self, header: *const Header) void {
|
|
3479
4187
|
assert(header.command == .prepare);
|
|
3480
4188
|
assert(header.cluster == self.cluster);
|
|
@@ -3534,7 +4242,7 @@ pub fn Replica(
|
|
|
3534
4242
|
.view = self.view,
|
|
3535
4243
|
.op = header.op,
|
|
3536
4244
|
.commit = header.commit,
|
|
3537
|
-
.
|
|
4245
|
+
.timestamp = header.timestamp,
|
|
3538
4246
|
.operation = header.operation,
|
|
3539
4247
|
});
|
|
3540
4248
|
} else {
|
|
@@ -3552,7 +4260,7 @@ pub fn Replica(
|
|
|
3552
4260
|
// * being able to send what we have will allow the pipeline to commit earlier, and
|
|
3553
4261
|
// * the leader will drop any prepare_ok for a prepare not in the pipeline.
|
|
3554
4262
|
// This is safe only because the leader can verify against the prepare checksum.
|
|
3555
|
-
if (self.journal.
|
|
4263
|
+
if (self.journal.header_with_op(op)) |header| {
|
|
3556
4264
|
self.send_prepare_ok(header);
|
|
3557
4265
|
defer self.flush_loopback_queue();
|
|
3558
4266
|
}
|
|
@@ -3576,25 +4284,20 @@ pub fn Replica(
|
|
|
3576
4284
|
assert(self.status == .view_change);
|
|
3577
4285
|
assert(self.start_view_change_quorum);
|
|
3578
4286
|
assert(!self.do_view_change_quorum);
|
|
3579
|
-
|
|
3580
|
-
|
|
3581
|
-
.start_view_change,
|
|
3582
|
-
0,
|
|
3583
|
-
);
|
|
4287
|
+
|
|
4288
|
+
const count_start_view_change = self.start_view_change_from_other_replicas.count();
|
|
3584
4289
|
assert(count_start_view_change >= self.quorum_view_change - 1);
|
|
4290
|
+
assert(count_start_view_change <= self.replica_count - 1);
|
|
3585
4291
|
|
|
3586
|
-
const message = self.create_view_change_message(.do_view_change)
|
|
3587
|
-
log.err("{}: send_do_view_change: waiting for message", .{self.replica});
|
|
3588
|
-
return;
|
|
3589
|
-
};
|
|
4292
|
+
const message = self.create_view_change_message(.do_view_change);
|
|
3590
4293
|
defer self.message_bus.unref(message);
|
|
3591
4294
|
|
|
3592
4295
|
assert(message.references == 1);
|
|
3593
4296
|
assert(message.header.command == .do_view_change);
|
|
3594
4297
|
assert(message.header.view == self.view);
|
|
3595
4298
|
assert(message.header.op == self.op);
|
|
4299
|
+
assert(message.header.op == self.message_body_as_headers(message)[0].op);
|
|
3596
4300
|
assert(message.header.commit == self.commit_max);
|
|
3597
|
-
// TODO Assert that latest header in message body matches self.op.
|
|
3598
4301
|
|
|
3599
4302
|
self.send_message_to_replica(self.leader_index(self.view), message);
|
|
3600
4303
|
}
|
|
@@ -3618,25 +4321,14 @@ pub fn Replica(
|
|
|
3618
4321
|
}
|
|
3619
4322
|
|
|
3620
4323
|
fn send_header_to_client(self: *Self, client: u128, header: Header) void {
|
|
3621
|
-
const message = self.create_message_from_header(header)
|
|
3622
|
-
log.err("{}: no header-only message available, dropping message to client {}", .{
|
|
3623
|
-
self.replica,
|
|
3624
|
-
client,
|
|
3625
|
-
});
|
|
3626
|
-
return;
|
|
3627
|
-
};
|
|
4324
|
+
const message = self.create_message_from_header(header);
|
|
3628
4325
|
defer self.message_bus.unref(message);
|
|
3629
4326
|
|
|
3630
4327
|
self.message_bus.send_message_to_client(client, message);
|
|
3631
4328
|
}
|
|
3632
4329
|
|
|
3633
4330
|
fn send_header_to_other_replicas(self: *Self, header: Header) void {
|
|
3634
|
-
const message = self.create_message_from_header(header)
|
|
3635
|
-
log.err("{}: no header-only message available, dropping message to replicas", .{
|
|
3636
|
-
self.replica,
|
|
3637
|
-
});
|
|
3638
|
-
return;
|
|
3639
|
-
};
|
|
4331
|
+
const message = self.create_message_from_header(header);
|
|
3640
4332
|
defer self.message_bus.unref(message);
|
|
3641
4333
|
|
|
3642
4334
|
var replica: u8 = 0;
|
|
@@ -3648,13 +4340,7 @@ pub fn Replica(
|
|
|
3648
4340
|
}
|
|
3649
4341
|
|
|
3650
4342
|
fn send_header_to_replica(self: *Self, replica: u8, header: Header) void {
|
|
3651
|
-
const message = self.create_message_from_header(header)
|
|
3652
|
-
log.err("{}: no header-only message available, dropping message to replica {}", .{
|
|
3653
|
-
self.replica,
|
|
3654
|
-
replica,
|
|
3655
|
-
});
|
|
3656
|
-
return;
|
|
3657
|
-
};
|
|
4343
|
+
const message = self.create_message_from_header(header);
|
|
3658
4344
|
defer self.message_bus.unref(message);
|
|
3659
4345
|
|
|
3660
4346
|
self.send_message_to_replica(replica, message);
|
|
@@ -3686,6 +4372,7 @@ pub fn Replica(
|
|
|
3686
4372
|
|
|
3687
4373
|
// TODO According to message.header.command, assert on the destination replica.
|
|
3688
4374
|
switch (message.header.command) {
|
|
4375
|
+
.reserved => unreachable,
|
|
3689
4376
|
.request => {
|
|
3690
4377
|
// Do not assert message.header.replica because we forward .request messages.
|
|
3691
4378
|
assert(self.status == .normal);
|
|
@@ -3738,6 +4425,16 @@ pub fn Replica(
|
|
|
3738
4425
|
},
|
|
3739
4426
|
else => unreachable,
|
|
3740
4427
|
},
|
|
4428
|
+
.recovery => {
|
|
4429
|
+
assert(self.status == .recovering);
|
|
4430
|
+
assert(message.header.replica == self.replica);
|
|
4431
|
+
assert(message.header.context == self.recovery_nonce);
|
|
4432
|
+
},
|
|
4433
|
+
.recovery_response => {
|
|
4434
|
+
assert(self.status == .normal);
|
|
4435
|
+
assert(message.header.view == self.view);
|
|
4436
|
+
assert(message.header.replica == self.replica);
|
|
4437
|
+
},
|
|
3741
4438
|
.headers => {
|
|
3742
4439
|
assert(self.status == .normal or self.status == .view_change);
|
|
3743
4440
|
assert(message.header.view == self.view);
|
|
@@ -3764,7 +4461,7 @@ pub fn Replica(
|
|
|
3764
4461
|
.nack_prepare => {
|
|
3765
4462
|
assert(message.header.view == self.view);
|
|
3766
4463
|
assert(message.header.replica == self.replica);
|
|
3767
|
-
assert(
|
|
4464
|
+
assert(self.leader_index(self.view) == replica);
|
|
3768
4465
|
},
|
|
3769
4466
|
else => {
|
|
3770
4467
|
log.info("{}: send_message_to_replica: TODO {s}", .{
|
|
@@ -3783,8 +4480,8 @@ pub fn Replica(
|
|
|
3783
4480
|
}
|
|
3784
4481
|
|
|
3785
4482
|
/// Finds the header with the highest op number in a slice of headers from a replica.
|
|
3786
|
-
/// Searches only by op number to find the highest `self.op for the replica.
|
|
3787
|
-
fn set_latest_op(headers: []Header, latest: *Header) void {
|
|
4483
|
+
/// Searches only by op number to find the highest `self.op` for the replica.
|
|
4484
|
+
fn set_latest_op(headers: []const Header, latest: *Header) void {
|
|
3788
4485
|
switch (latest.command) {
|
|
3789
4486
|
.reserved, .prepare => assert(latest.valid_checksum()),
|
|
3790
4487
|
else => unreachable,
|
|
@@ -3809,17 +4506,27 @@ pub fn Replica(
|
|
|
3809
4506
|
k: u64,
|
|
3810
4507
|
method: []const u8,
|
|
3811
4508
|
) void {
|
|
3812
|
-
assert(self.status == .view_change);
|
|
3813
|
-
|
|
4509
|
+
assert(self.status == .view_change or self.status == .recovering);
|
|
4510
|
+
assert(self.journal.recovered);
|
|
3814
4511
|
assert(latest.valid_checksum());
|
|
3815
4512
|
assert(latest.invalid() == null);
|
|
3816
4513
|
assert(latest.command == .prepare);
|
|
3817
4514
|
assert(latest.cluster == self.cluster);
|
|
3818
4515
|
|
|
3819
|
-
|
|
3820
|
-
|
|
4516
|
+
switch (self.status) {
|
|
4517
|
+
.normal => unreachable,
|
|
4518
|
+
.view_change => {
|
|
4519
|
+
// The view may have started already, so we can have a prepare in the same view:
|
|
4520
|
+
assert(latest.view <= self.view);
|
|
4521
|
+
},
|
|
4522
|
+
.recovering => {
|
|
4523
|
+
// The replica's view hasn't been set yet.
|
|
4524
|
+
// It will be set shortly, when we transition to normal status.
|
|
4525
|
+
assert(self.view == 0);
|
|
4526
|
+
},
|
|
4527
|
+
}
|
|
3821
4528
|
|
|
3822
|
-
log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={}
|
|
4529
|
+
log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={}", .{
|
|
3823
4530
|
self.replica,
|
|
3824
4531
|
method,
|
|
3825
4532
|
self.view,
|
|
@@ -3828,7 +4535,6 @@ pub fn Replica(
|
|
|
3828
4535
|
self.commit_max,
|
|
3829
4536
|
k,
|
|
3830
4537
|
latest.checksum,
|
|
3831
|
-
latest.offset,
|
|
3832
4538
|
});
|
|
3833
4539
|
|
|
3834
4540
|
// Uncommitted ops may not survive a view change so we must assert `latest.op` against
|
|
@@ -3854,7 +4560,7 @@ pub fn Replica(
|
|
|
3854
4560
|
});
|
|
3855
4561
|
}
|
|
3856
4562
|
assert(k >= latest.commit);
|
|
3857
|
-
assert(k >= self.commit_max - std.math.min(config.
|
|
4563
|
+
assert(k >= self.commit_max - std.math.min(config.pipeline_max, self.commit_max));
|
|
3858
4564
|
|
|
3859
4565
|
assert(self.commit_min <= self.commit_max);
|
|
3860
4566
|
assert(self.op >= self.commit_max or self.op < self.commit_max);
|
|
@@ -3870,15 +4576,15 @@ pub fn Replica(
|
|
|
3870
4576
|
// Do not set the latest op as dirty if we already have it exactly:
|
|
3871
4577
|
// Otherwise, this would trigger a repair and delay the view change, or worse, it would
|
|
3872
4578
|
// prevent us from assisting another replica to recover when we do in fact have the op.
|
|
3873
|
-
if (self.journal.
|
|
4579
|
+
if (self.journal.has(latest)) {
|
|
3874
4580
|
log.debug("{}: {s}: latest op exists exactly", .{ self.replica, method });
|
|
3875
4581
|
} else {
|
|
3876
|
-
self.journal.
|
|
4582
|
+
self.journal.set_header_as_dirty(latest);
|
|
3877
4583
|
}
|
|
3878
4584
|
|
|
3879
4585
|
assert(self.op == latest.op);
|
|
3880
4586
|
self.journal.remove_entries_from(self.op + 1);
|
|
3881
|
-
assert(self.journal.
|
|
4587
|
+
assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
|
|
3882
4588
|
}
|
|
3883
4589
|
|
|
3884
4590
|
fn start_view_as_the_new_leader(self: *Self) void {
|
|
@@ -3891,34 +4597,18 @@ pub fn Replica(
|
|
|
3891
4597
|
|
|
3892
4598
|
assert(self.commit_min == self.commit_max);
|
|
3893
4599
|
assert(self.repair_pipeline_op() == null);
|
|
4600
|
+
self.verify_pipeline();
|
|
3894
4601
|
assert(self.commit_max + self.pipeline.count == self.op);
|
|
3895
4602
|
assert(self.valid_hash_chain_between(self.commit_min, self.op));
|
|
3896
4603
|
|
|
3897
|
-
|
|
3898
|
-
|
|
3899
|
-
var iterator = self.pipeline.iterator();
|
|
3900
|
-
while (iterator.next_ptr()) |prepare| {
|
|
3901
|
-
assert(prepare.message.header.command == .prepare);
|
|
3902
|
-
assert(prepare.message.header.op == pipeline_op);
|
|
3903
|
-
assert(prepare.message.header.parent == pipeline_parent);
|
|
3904
|
-
|
|
3905
|
-
pipeline_parent = prepare.message.header.checksum;
|
|
3906
|
-
pipeline_op += 1;
|
|
3907
|
-
}
|
|
3908
|
-
assert(self.pipeline.count <= config.pipelining_max);
|
|
3909
|
-
assert(self.commit_max + self.pipeline.count == pipeline_op - 1);
|
|
3910
|
-
|
|
3911
|
-
assert(self.journal.dirty.len == 0);
|
|
3912
|
-
assert(self.journal.faulty.len == 0);
|
|
4604
|
+
assert(self.journal.dirty.count == 0);
|
|
4605
|
+
assert(self.journal.faulty.count == 0);
|
|
3913
4606
|
assert(self.nack_prepare_op == null);
|
|
3914
4607
|
|
|
3915
|
-
const start_view = self.create_view_change_message(.start_view)
|
|
3916
|
-
log.err("{}: start_view_as_the_new_leader: waiting for message", .{self.replica});
|
|
3917
|
-
return;
|
|
3918
|
-
};
|
|
4608
|
+
const start_view = self.create_view_change_message(.start_view);
|
|
3919
4609
|
defer self.message_bus.unref(start_view);
|
|
3920
4610
|
|
|
3921
|
-
self.
|
|
4611
|
+
self.transition_to_normal_from_view_change_status(self.view);
|
|
3922
4612
|
// Detect if the transition to normal status above accidentally resets the pipeline:
|
|
3923
4613
|
assert(self.commit_max + self.pipeline.count == self.op);
|
|
3924
4614
|
|
|
@@ -3937,17 +4627,73 @@ pub fn Replica(
|
|
|
3937
4627
|
self.send_message_to_other_replicas(start_view);
|
|
3938
4628
|
}
|
|
3939
4629
|
|
|
3940
|
-
fn
|
|
3941
|
-
|
|
4630
|
+
fn transition_to_normal_from_recovering_status(self: *Self, new_view: u32) void {
|
|
4631
|
+
assert(self.status == .recovering);
|
|
4632
|
+
assert(self.view == 0);
|
|
4633
|
+
self.view = new_view;
|
|
4634
|
+
self.view_normal = new_view;
|
|
4635
|
+
self.status = .normal;
|
|
4636
|
+
|
|
4637
|
+
if (self.leader()) {
|
|
4638
|
+
log.debug(
|
|
4639
|
+
"{}: transition_to_normal_from_recovering_status: view={} leader",
|
|
4640
|
+
.{
|
|
4641
|
+
self.replica,
|
|
4642
|
+
self.view,
|
|
4643
|
+
},
|
|
4644
|
+
);
|
|
4645
|
+
|
|
4646
|
+
assert(self.journal.is_empty() or self.replica_count == 1);
|
|
4647
|
+
assert(!self.prepare_timeout.ticking);
|
|
4648
|
+
assert(!self.normal_status_timeout.ticking);
|
|
4649
|
+
assert(!self.view_change_status_timeout.ticking);
|
|
4650
|
+
assert(!self.view_change_message_timeout.ticking);
|
|
4651
|
+
|
|
4652
|
+
self.ping_timeout.start();
|
|
4653
|
+
self.commit_timeout.start();
|
|
4654
|
+
self.repair_timeout.start();
|
|
4655
|
+
self.recovery_timeout.stop();
|
|
4656
|
+
} else {
|
|
4657
|
+
log.debug(
|
|
4658
|
+
"{}: transition_to_normal_from_recovering_status: view={} follower",
|
|
4659
|
+
.{
|
|
4660
|
+
self.replica,
|
|
4661
|
+
self.view,
|
|
4662
|
+
},
|
|
4663
|
+
);
|
|
4664
|
+
|
|
4665
|
+
assert(!self.prepare_timeout.ticking);
|
|
4666
|
+
assert(!self.commit_timeout.ticking);
|
|
4667
|
+
assert(!self.view_change_status_timeout.ticking);
|
|
4668
|
+
assert(!self.view_change_message_timeout.ticking);
|
|
4669
|
+
|
|
4670
|
+
self.ping_timeout.start();
|
|
4671
|
+
self.normal_status_timeout.start();
|
|
4672
|
+
self.repair_timeout.start();
|
|
4673
|
+
self.recovery_timeout.stop();
|
|
4674
|
+
}
|
|
4675
|
+
}
|
|
4676
|
+
|
|
4677
|
+
fn transition_to_normal_from_view_change_status(self: *Self, new_view: u32) void {
|
|
3942
4678
|
// In the VRR paper it's possible to transition from normal to normal for the same view.
|
|
3943
4679
|
// For example, this could happen after a state transfer triggered by an op jump.
|
|
4680
|
+
assert(self.status == .view_change);
|
|
3944
4681
|
assert(new_view >= self.view);
|
|
3945
4682
|
self.view = new_view;
|
|
3946
4683
|
self.view_normal = new_view;
|
|
3947
4684
|
self.status = .normal;
|
|
3948
4685
|
|
|
3949
4686
|
if (self.leader()) {
|
|
3950
|
-
log.debug(
|
|
4687
|
+
log.debug(
|
|
4688
|
+
"{}: transition_to_normal_from_view_change_status: view={} leader",
|
|
4689
|
+
.{
|
|
4690
|
+
self.replica,
|
|
4691
|
+
self.view,
|
|
4692
|
+
},
|
|
4693
|
+
);
|
|
4694
|
+
|
|
4695
|
+
assert(!self.prepare_timeout.ticking);
|
|
4696
|
+
assert(!self.recovery_timeout.ticking);
|
|
3951
4697
|
|
|
3952
4698
|
self.ping_timeout.start();
|
|
3953
4699
|
self.commit_timeout.start();
|
|
@@ -3957,12 +4703,15 @@ pub fn Replica(
|
|
|
3957
4703
|
self.repair_timeout.start();
|
|
3958
4704
|
|
|
3959
4705
|
// Do not reset the pipeline as there may be uncommitted ops to drive to completion.
|
|
3960
|
-
if (self.pipeline.count > 0)
|
|
3961
|
-
assert(!self.prepare_timeout.ticking);
|
|
3962
|
-
self.prepare_timeout.start();
|
|
3963
|
-
}
|
|
4706
|
+
if (self.pipeline.count > 0) self.prepare_timeout.start();
|
|
3964
4707
|
} else {
|
|
3965
|
-
log.debug("{}:
|
|
4708
|
+
log.debug("{}: transition_to_normal_from_view_change_status: view={} follower", .{
|
|
4709
|
+
self.replica,
|
|
4710
|
+
self.view,
|
|
4711
|
+
});
|
|
4712
|
+
|
|
4713
|
+
assert(!self.prepare_timeout.ticking);
|
|
4714
|
+
assert(!self.recovery_timeout.ticking);
|
|
3966
4715
|
|
|
3967
4716
|
self.ping_timeout.start();
|
|
3968
4717
|
self.commit_timeout.stop();
|
|
@@ -3970,8 +4719,6 @@ pub fn Replica(
|
|
|
3970
4719
|
self.view_change_status_timeout.stop();
|
|
3971
4720
|
self.view_change_message_timeout.stop();
|
|
3972
4721
|
self.repair_timeout.start();
|
|
3973
|
-
|
|
3974
|
-
self.reset_pipeline();
|
|
3975
4722
|
}
|
|
3976
4723
|
|
|
3977
4724
|
self.reset_quorum_start_view_change();
|
|
@@ -3983,17 +4730,18 @@ pub fn Replica(
|
|
|
3983
4730
|
assert(self.nack_prepare_op == null);
|
|
3984
4731
|
}
|
|
3985
4732
|
|
|
3986
|
-
/// A replica i that notices the need for a view change advances its view, sets its status
|
|
3987
|
-
/// view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
|
|
3988
|
-
/// where v identifies the new view. A replica notices the need for a view change either
|
|
3989
|
-
/// on its own timer, or because it receives a start_view_change or do_view_change
|
|
3990
|
-
/// a view with a larger number than its own view.
|
|
4733
|
+
/// A replica i that notices the need for a view change advances its view, sets its status
|
|
4734
|
+
/// to view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
|
|
4735
|
+
/// where v identifies the new view. A replica notices the need for a view change either
|
|
4736
|
+
/// based on its own timer, or because it receives a start_view_change or do_view_change
|
|
4737
|
+
/// message for a view with a larger number than its own view.
|
|
3991
4738
|
fn transition_to_view_change_status(self: *Self, new_view: u32) void {
|
|
3992
4739
|
log.debug("{}: transition_to_view_change_status: view={}..{}", .{
|
|
3993
4740
|
self.replica,
|
|
3994
4741
|
self.view,
|
|
3995
4742
|
new_view,
|
|
3996
4743
|
});
|
|
4744
|
+
assert(self.status == .normal or self.status == .view_change);
|
|
3997
4745
|
assert(new_view > self.view);
|
|
3998
4746
|
self.view = new_view;
|
|
3999
4747
|
self.status = .view_change;
|
|
@@ -4004,13 +4752,14 @@ pub fn Replica(
|
|
|
4004
4752
|
self.view_change_status_timeout.start();
|
|
4005
4753
|
self.view_change_message_timeout.start();
|
|
4006
4754
|
self.repair_timeout.stop();
|
|
4755
|
+
self.prepare_timeout.stop();
|
|
4756
|
+
assert(!self.recovery_timeout.ticking);
|
|
4007
4757
|
|
|
4008
4758
|
// Do not reset quorum counters only on entering a view, assuming that the view will be
|
|
4009
4759
|
// followed only by a single subsequent view change to the next view, because multiple
|
|
4010
4760
|
// successive view changes can fail, e.g. after a view change timeout.
|
|
4011
|
-
// We must therefore reset our counters here to avoid counting messages from an older
|
|
4012
|
-
// which would violate the quorum intersection property essential for correctness.
|
|
4013
|
-
self.reset_pipeline();
|
|
4761
|
+
// We must therefore reset our counters here to avoid counting messages from an older
|
|
4762
|
+
// view, which would violate the quorum intersection property essential for correctness.
|
|
4014
4763
|
self.reset_quorum_start_view_change();
|
|
4015
4764
|
self.reset_quorum_do_view_change();
|
|
4016
4765
|
self.reset_quorum_nack_prepare();
|
|
@@ -4022,19 +4771,6 @@ pub fn Replica(
|
|
|
4022
4771
|
self.send_start_view_change();
|
|
4023
4772
|
}
|
|
4024
4773
|
|
|
4025
|
-
fn unref_prepare_message_and_quorum_messages(
|
|
4026
|
-
self: *Self,
|
|
4027
|
-
prepare: *const Prepare,
|
|
4028
|
-
) void {
|
|
4029
|
-
self.message_bus.unref(prepare.message);
|
|
4030
|
-
for (prepare.ok_from_all_replicas) |received, replica| {
|
|
4031
|
-
if (received) |prepare_ok| {
|
|
4032
|
-
assert(replica < self.replica_count);
|
|
4033
|
-
self.message_bus.unref(prepare_ok);
|
|
4034
|
-
}
|
|
4035
|
-
}
|
|
4036
|
-
}
|
|
4037
|
-
|
|
4038
4774
|
fn update_client_table_entry(self: *Self, reply: *Message) void {
|
|
4039
4775
|
assert(reply.header.command == .reply);
|
|
4040
4776
|
assert(reply.header.operation != .register);
|
|
@@ -4098,21 +4834,21 @@ pub fn Replica(
|
|
|
4098
4834
|
return true;
|
|
4099
4835
|
}
|
|
4100
4836
|
|
|
4101
|
-
/// Returns true if all operations are present, correctly ordered and connected by hash
|
|
4102
|
-
/// between `op_min` and `op_max` (both inclusive).
|
|
4837
|
+
/// Returns true if all operations are present, correctly ordered and connected by hash
|
|
4838
|
+
/// chain, between `op_min` and `op_max` (both inclusive).
|
|
4103
4839
|
fn valid_hash_chain_between(self: *Self, op_min: u64, op_max: u64) bool {
|
|
4104
4840
|
assert(op_min <= op_max);
|
|
4105
4841
|
|
|
4106
|
-
// If we use anything less than self.op then we may commit ops for a forked hash chain
|
|
4107
|
-
// have since been reordered by a new leader.
|
|
4842
|
+
// If we use anything less than self.op then we may commit ops for a forked hash chain
|
|
4843
|
+
// that have since been reordered by a new leader.
|
|
4108
4844
|
assert(op_max == self.op);
|
|
4109
|
-
var b = self.journal.
|
|
4845
|
+
var b = self.journal.header_with_op(op_max).?;
|
|
4110
4846
|
|
|
4111
4847
|
var op = op_max;
|
|
4112
4848
|
while (op > op_min) {
|
|
4113
4849
|
op -= 1;
|
|
4114
4850
|
|
|
4115
|
-
if (self.journal.
|
|
4851
|
+
if (self.journal.header_with_op(op)) |a| {
|
|
4116
4852
|
assert(a.op + 1 == b.op);
|
|
4117
4853
|
if (a.checksum == b.parent) {
|
|
4118
4854
|
assert(ascending_viewstamps(a, b));
|
|
@@ -4131,6 +4867,33 @@ pub fn Replica(
|
|
|
4131
4867
|
return true;
|
|
4132
4868
|
}
|
|
4133
4869
|
|
|
4870
|
+
fn verify_pipeline(self: *Self) void {
|
|
4871
|
+
var op = self.commit_max + 1;
|
|
4872
|
+
var parent = self.journal.header_with_op(self.commit_max).?.checksum;
|
|
4873
|
+
|
|
4874
|
+
var iterator = self.pipeline.iterator();
|
|
4875
|
+
while (iterator.next_ptr()) |prepare| {
|
|
4876
|
+
assert(prepare.message.header.command == .prepare);
|
|
4877
|
+
|
|
4878
|
+
log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
|
|
4879
|
+
self.replica,
|
|
4880
|
+
prepare.message.header.op,
|
|
4881
|
+
prepare.message.header.checksum,
|
|
4882
|
+
prepare.message.header.parent,
|
|
4883
|
+
});
|
|
4884
|
+
|
|
4885
|
+
assert(self.journal.has(prepare.message.header));
|
|
4886
|
+
assert(prepare.message.header.op == op);
|
|
4887
|
+
assert(prepare.message.header.op <= self.op);
|
|
4888
|
+
assert(prepare.message.header.parent == parent);
|
|
4889
|
+
|
|
4890
|
+
parent = prepare.message.header.checksum;
|
|
4891
|
+
op += 1;
|
|
4892
|
+
}
|
|
4893
|
+
assert(self.pipeline.count <= config.pipeline_max);
|
|
4894
|
+
assert(self.commit_max + self.pipeline.count == op - 1);
|
|
4895
|
+
}
|
|
4896
|
+
|
|
4134
4897
|
fn view_jump(self: *Self, header: *const Header) void {
|
|
4135
4898
|
const to: Status = switch (header.command) {
|
|
4136
4899
|
.prepare, .commit => .normal,
|
|
@@ -4226,10 +4989,10 @@ pub fn Replica(
|
|
|
4226
4989
|
return;
|
|
4227
4990
|
}
|
|
4228
4991
|
|
|
4229
|
-
self.journal.write_prepare(
|
|
4992
|
+
self.journal.write_prepare(write_prepare_callback, message, trigger);
|
|
4230
4993
|
}
|
|
4231
4994
|
|
|
4232
|
-
fn
|
|
4995
|
+
fn write_prepare_callback(
|
|
4233
4996
|
self: *Self,
|
|
4234
4997
|
wrote: ?*Message,
|
|
4235
4998
|
trigger: Journal.Write.Trigger,
|
|
@@ -4245,6 +5008,7 @@ pub fn Replica(
|
|
|
4245
5008
|
// If this was a repair, continue immediately to repair the next prepare:
|
|
4246
5009
|
// This is an optimization to eliminate waiting until the next repair timeout.
|
|
4247
5010
|
.repair => self.repair(),
|
|
5011
|
+
.pipeline => self.repair(),
|
|
4248
5012
|
}
|
|
4249
5013
|
}
|
|
4250
5014
|
};
|