tigerbeetle-node 0.4.1 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -5
- package/dist/benchmark.js.map +1 -1
- package/dist/index.d.ts +18 -16
- package/dist/index.js +35 -13
- package/dist/index.js.map +1 -1
- package/dist/test.js +13 -1
- package/dist/test.js.map +1 -1
- package/package.json +12 -12
- package/scripts/postinstall.sh +2 -2
- package/src/benchmark.ts +2 -2
- package/src/index.ts +29 -4
- package/src/node.zig +124 -21
- package/src/test.ts +18 -4
- package/src/tigerbeetle/scripts/install.sh +2 -2
- package/src/tigerbeetle/scripts/install_zig.bat +109 -0
- package/src/tigerbeetle/scripts/install_zig.sh +22 -3
- package/src/tigerbeetle/scripts/lint.zig +8 -2
- package/src/tigerbeetle/scripts/vopr.bat +48 -0
- package/src/tigerbeetle/scripts/vopr.sh +24 -4
- package/src/tigerbeetle/src/benchmark.zig +18 -14
- package/src/tigerbeetle/src/cli.zig +8 -6
- package/src/tigerbeetle/src/config.zig +10 -18
- package/src/tigerbeetle/src/demo.zig +122 -92
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +5 -3
- package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +2 -3
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +5 -3
- package/src/tigerbeetle/src/demo_04_create_transfers_two_phase_commit.zig +5 -3
- package/src/tigerbeetle/src/demo_05_accept_transfers.zig +5 -3
- package/src/tigerbeetle/src/demo_06_reject_transfers.zig +5 -3
- package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +7 -0
- package/src/tigerbeetle/src/fifo.zig +14 -14
- package/src/tigerbeetle/src/io/benchmark.zig +238 -0
- package/src/tigerbeetle/src/{io_darwin.zig → io/darwin.zig} +88 -121
- package/src/tigerbeetle/src/io/linux.zig +933 -0
- package/src/tigerbeetle/src/io/test.zig +621 -0
- package/src/tigerbeetle/src/io.zig +7 -1322
- package/src/tigerbeetle/src/main.zig +22 -13
- package/src/tigerbeetle/src/message_bus.zig +50 -61
- package/src/tigerbeetle/src/message_pool.zig +6 -5
- package/src/tigerbeetle/src/ring_buffer.zig +135 -68
- package/src/tigerbeetle/src/simulator.zig +120 -47
- package/src/tigerbeetle/src/state_machine.zig +853 -27
- package/src/tigerbeetle/src/storage.zig +51 -48
- package/src/tigerbeetle/src/test/cluster.zig +90 -14
- package/src/tigerbeetle/src/test/message_bus.zig +7 -10
- package/src/tigerbeetle/src/test/network.zig +5 -5
- package/src/tigerbeetle/src/test/packet_simulator.zig +188 -32
- package/src/tigerbeetle/src/test/state_checker.zig +3 -3
- package/src/tigerbeetle/src/test/state_machine.zig +6 -4
- package/src/tigerbeetle/src/test/storage.zig +322 -26
- package/src/tigerbeetle/src/test/time.zig +2 -2
- package/src/tigerbeetle/src/tigerbeetle.zig +6 -129
- package/src/tigerbeetle/src/time.zig +6 -5
- package/src/tigerbeetle/src/unit_tests.zig +14 -0
- package/src/tigerbeetle/src/{vr → vsr}/client.zig +21 -21
- package/src/tigerbeetle/src/{vr → vsr}/clock.zig +34 -48
- package/src/tigerbeetle/src/{vr → vsr}/journal.zig +259 -61
- package/src/tigerbeetle/src/{marzullo.zig → vsr/marzullo.zig} +6 -3
- package/src/tigerbeetle/src/{vr → vsr}/replica.zig +711 -349
- package/src/tigerbeetle/src/{vr.zig → vsr.zig} +32 -25
- package/src/translate.zig +55 -55
- package/src/tigerbeetle/src/fixed_array_list.zig +0 -53
- package/src/tigerbeetle/src/io_async.zig +0 -600
- package/src/tigerbeetle/src/test_client.zig +0 -41
|
@@ -7,11 +7,11 @@ const config = @import("../config.zig");
|
|
|
7
7
|
const Message = @import("../message_pool.zig").MessagePool.Message;
|
|
8
8
|
const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
|
|
9
9
|
|
|
10
|
-
const
|
|
11
|
-
const Header =
|
|
12
|
-
const Timeout =
|
|
13
|
-
const Command =
|
|
14
|
-
const Version =
|
|
10
|
+
const vsr = @import("../vsr.zig");
|
|
11
|
+
const Header = vsr.Header;
|
|
12
|
+
const Timeout = vsr.Timeout;
|
|
13
|
+
const Command = vsr.Command;
|
|
14
|
+
const Version = vsr.Version;
|
|
15
15
|
|
|
16
16
|
const log = std.log.scoped(.replica);
|
|
17
17
|
|
|
@@ -21,7 +21,7 @@ pub const Status = enum {
|
|
|
21
21
|
recovering,
|
|
22
22
|
};
|
|
23
23
|
|
|
24
|
-
const ClientTable = std.
|
|
24
|
+
const ClientTable = std.AutoHashMapUnmanaged(u128, ClientTableEntry);
|
|
25
25
|
|
|
26
26
|
/// We found two bugs in the VRR paper relating to the client table:
|
|
27
27
|
///
|
|
@@ -70,10 +70,8 @@ pub fn Replica(
|
|
|
70
70
|
return struct {
|
|
71
71
|
const Self = @This();
|
|
72
72
|
|
|
73
|
-
const Journal =
|
|
74
|
-
const Clock =
|
|
75
|
-
|
|
76
|
-
allocator: *Allocator,
|
|
73
|
+
const Journal = vsr.Journal(Self, Storage);
|
|
74
|
+
const Clock = vsr.Clock(Time);
|
|
77
75
|
|
|
78
76
|
/// The number of the cluster to which this replica belongs:
|
|
79
77
|
cluster: u32,
|
|
@@ -109,13 +107,8 @@ pub fn Replica(
|
|
|
109
107
|
/// The current view, initially 0:
|
|
110
108
|
view: u32,
|
|
111
109
|
|
|
112
|
-
///
|
|
113
|
-
|
|
114
|
-
/// committing to avoid committing ops that may have been changed through a view change.
|
|
115
|
-
/// This is the most crucial aspect of the protocol to get right, especially because it can
|
|
116
|
-
/// slip past any protection provided by the hash chain. For example, we may have a fully
|
|
117
|
-
/// connected hash chain but with uncommitted ops that never survived into the newer view.
|
|
118
|
-
view_jump_barrier: bool = false,
|
|
110
|
+
/// The latest view, in which the replica's status was normal.
|
|
111
|
+
view_normal: u32,
|
|
119
112
|
|
|
120
113
|
/// The current status, either normal, view_change, or recovering:
|
|
121
114
|
/// TODO Don't default to normal, set the starting status according to the journal's health.
|
|
@@ -181,11 +174,11 @@ pub fn Replica(
|
|
|
181
174
|
|
|
182
175
|
/// The number of ticks without hearing from the leader before starting a view change.
|
|
183
176
|
/// This transitions from .normal status to .view_change status.
|
|
184
|
-
|
|
177
|
+
normal_status_timeout: Timeout,
|
|
185
178
|
|
|
186
179
|
/// The number of ticks before a view change is timed out:
|
|
187
180
|
/// This transitions from `view_change` status to `view_change` status but for a newer view.
|
|
188
|
-
|
|
181
|
+
view_change_status_timeout: Timeout,
|
|
189
182
|
|
|
190
183
|
/// The number of ticks before resending a `start_view_change` or `do_view_change` message:
|
|
191
184
|
view_change_message_timeout: Timeout,
|
|
@@ -204,11 +197,11 @@ pub fn Replica(
|
|
|
204
197
|
on_change_state: ?fn (replica: *Self) void = null,
|
|
205
198
|
|
|
206
199
|
pub fn init(
|
|
207
|
-
allocator:
|
|
200
|
+
allocator: Allocator,
|
|
208
201
|
cluster: u32,
|
|
209
202
|
replica_count: u8,
|
|
210
203
|
replica: u8,
|
|
211
|
-
time: Time,
|
|
204
|
+
time: *Time,
|
|
212
205
|
storage: *Storage,
|
|
213
206
|
message_bus: *MessageBus,
|
|
214
207
|
state_machine: *StateMachine,
|
|
@@ -236,14 +229,17 @@ pub fn Replica(
|
|
|
236
229
|
if (replica_count <= 2) {
|
|
237
230
|
assert(quorum_replication == replica_count);
|
|
238
231
|
assert(quorum_view_change == replica_count);
|
|
232
|
+
} else {
|
|
233
|
+
assert(quorum_replication < replica_count);
|
|
234
|
+
assert(quorum_view_change < replica_count);
|
|
239
235
|
}
|
|
240
236
|
|
|
241
237
|
// Flexible quorums are safe if these two quorums intersect so that this relation holds:
|
|
242
238
|
assert(quorum_replication + quorum_view_change > replica_count);
|
|
243
239
|
|
|
244
|
-
var client_table =
|
|
245
|
-
errdefer client_table.deinit();
|
|
246
|
-
try client_table.
|
|
240
|
+
var client_table: ClientTable = .{};
|
|
241
|
+
errdefer client_table.deinit(allocator);
|
|
242
|
+
try client_table.ensureTotalCapacity(allocator, @intCast(u32, config.clients_max));
|
|
247
243
|
assert(client_table.capacity() >= config.clients_max);
|
|
248
244
|
|
|
249
245
|
var init_prepare = Header{
|
|
@@ -267,7 +263,6 @@ pub fn Replica(
|
|
|
267
263
|
init_prepare.set_checksum();
|
|
268
264
|
|
|
269
265
|
var self = Self{
|
|
270
|
-
.allocator = allocator,
|
|
271
266
|
.cluster = cluster,
|
|
272
267
|
.replica_count = replica_count,
|
|
273
268
|
.replica = replica,
|
|
@@ -291,6 +286,7 @@ pub fn Replica(
|
|
|
291
286
|
.state_machine = state_machine,
|
|
292
287
|
.client_table = client_table,
|
|
293
288
|
.view = init_prepare.view,
|
|
289
|
+
.view_normal = init_prepare.view,
|
|
294
290
|
.op = init_prepare.op,
|
|
295
291
|
.commit_min = init_prepare.commit,
|
|
296
292
|
.commit_max = init_prepare.commit,
|
|
@@ -309,13 +305,13 @@ pub fn Replica(
|
|
|
309
305
|
.id = replica,
|
|
310
306
|
.after = 100,
|
|
311
307
|
},
|
|
312
|
-
.
|
|
313
|
-
.name = "
|
|
308
|
+
.normal_status_timeout = Timeout{
|
|
309
|
+
.name = "normal_status_timeout",
|
|
314
310
|
.id = replica,
|
|
315
311
|
.after = 500,
|
|
316
312
|
},
|
|
317
|
-
.
|
|
318
|
-
.name = "
|
|
313
|
+
.view_change_status_timeout = Timeout{
|
|
314
|
+
.name = "view_change_status_timeout",
|
|
319
315
|
.id = replica,
|
|
320
316
|
.after = 500,
|
|
321
317
|
},
|
|
@@ -358,15 +354,52 @@ pub fn Replica(
|
|
|
358
354
|
} else {
|
|
359
355
|
log.debug("{}: init: follower", .{self.replica});
|
|
360
356
|
self.ping_timeout.start();
|
|
361
|
-
self.
|
|
357
|
+
self.normal_status_timeout.start();
|
|
362
358
|
self.repair_timeout.start();
|
|
363
359
|
}
|
|
364
360
|
|
|
365
361
|
return self;
|
|
366
362
|
}
|
|
367
363
|
|
|
368
|
-
|
|
369
|
-
|
|
364
|
+
/// Free all memory and unref all messages held by the replica
|
|
365
|
+
/// This does not deinitialize the StateMachine, MessageBus, Storage, or Time
|
|
366
|
+
pub fn deinit(self: *Self, allocator: Allocator) void {
|
|
367
|
+
self.journal.deinit(allocator);
|
|
368
|
+
self.clock.deinit(allocator);
|
|
369
|
+
|
|
370
|
+
{
|
|
371
|
+
var it = self.client_table.iterator();
|
|
372
|
+
while (it.next()) |entry| {
|
|
373
|
+
self.message_bus.unref(entry.value_ptr.reply);
|
|
374
|
+
}
|
|
375
|
+
self.client_table.deinit(allocator);
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
{
|
|
379
|
+
var it = self.pipeline.iterator();
|
|
380
|
+
while (it.next()) |prepare| {
|
|
381
|
+
self.message_bus.unref(prepare.message);
|
|
382
|
+
for (prepare.ok_from_all_replicas) |message| {
|
|
383
|
+
if (message) |m| self.message_bus.unref(m);
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
if (self.loopback_queue) |loopback_message| {
|
|
389
|
+
assert(loopback_message.next == null);
|
|
390
|
+
self.message_bus.unref(loopback_message);
|
|
391
|
+
self.loopback_queue = null;
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
for (self.start_view_change_from_other_replicas) |message| {
|
|
395
|
+
if (message) |m| self.message_bus.unref(m);
|
|
396
|
+
}
|
|
397
|
+
for (self.do_view_change_from_all_replicas) |message| {
|
|
398
|
+
if (message) |m| self.message_bus.unref(m);
|
|
399
|
+
}
|
|
400
|
+
for (self.nack_prepare_from_other_replicas) |message| {
|
|
401
|
+
if (message) |m| self.message_bus.unref(m);
|
|
402
|
+
}
|
|
370
403
|
}
|
|
371
404
|
|
|
372
405
|
/// Time is measured in logical ticks that are incremented on every call to tick().
|
|
@@ -380,19 +413,26 @@ pub fn Replica(
|
|
|
380
413
|
|
|
381
414
|
self.clock.tick();
|
|
382
415
|
|
|
416
|
+
if (!self.journal.recovered) {
|
|
417
|
+
self.journal.recover();
|
|
418
|
+
return;
|
|
419
|
+
} else {
|
|
420
|
+
assert(!self.journal.recovering);
|
|
421
|
+
}
|
|
422
|
+
|
|
383
423
|
self.ping_timeout.tick();
|
|
384
424
|
self.prepare_timeout.tick();
|
|
385
425
|
self.commit_timeout.tick();
|
|
386
|
-
self.
|
|
387
|
-
self.
|
|
426
|
+
self.normal_status_timeout.tick();
|
|
427
|
+
self.view_change_status_timeout.tick();
|
|
388
428
|
self.view_change_message_timeout.tick();
|
|
389
429
|
self.repair_timeout.tick();
|
|
390
430
|
|
|
391
431
|
if (self.ping_timeout.fired()) self.on_ping_timeout();
|
|
392
432
|
if (self.prepare_timeout.fired()) self.on_prepare_timeout();
|
|
393
433
|
if (self.commit_timeout.fired()) self.on_commit_timeout();
|
|
394
|
-
if (self.
|
|
395
|
-
if (self.
|
|
434
|
+
if (self.normal_status_timeout.fired()) self.on_normal_status_timeout();
|
|
435
|
+
if (self.view_change_status_timeout.fired()) self.on_view_change_status_timeout();
|
|
396
436
|
if (self.view_change_message_timeout.fired()) self.on_view_change_message_timeout();
|
|
397
437
|
if (self.repair_timeout.fired()) self.on_repair_timeout();
|
|
398
438
|
|
|
@@ -412,10 +452,13 @@ pub fn Replica(
|
|
|
412
452
|
});
|
|
413
453
|
|
|
414
454
|
if (message.header.invalid()) |reason| {
|
|
415
|
-
log.
|
|
455
|
+
log.err("{}: on_message: invalid ({s})", .{ self.replica, reason });
|
|
416
456
|
return;
|
|
417
457
|
}
|
|
418
458
|
|
|
459
|
+
// No client or replica should ever send a .reserved message.
|
|
460
|
+
assert(message.header.command != .reserved);
|
|
461
|
+
|
|
419
462
|
if (message.header.cluster != self.cluster) {
|
|
420
463
|
log.warn("{}: on_message: wrong cluster (cluster must be {} not {})", .{
|
|
421
464
|
self.replica,
|
|
@@ -425,6 +468,14 @@ pub fn Replica(
|
|
|
425
468
|
return;
|
|
426
469
|
}
|
|
427
470
|
|
|
471
|
+
if (!self.journal.recovered) {
|
|
472
|
+
self.journal.recover();
|
|
473
|
+
log.debug("{}: on_message: waiting for journal to recover", .{self.replica});
|
|
474
|
+
return;
|
|
475
|
+
} else {
|
|
476
|
+
assert(!self.journal.recovering);
|
|
477
|
+
}
|
|
478
|
+
|
|
428
479
|
assert(message.header.replica < self.replica_count);
|
|
429
480
|
switch (message.header.command) {
|
|
430
481
|
.ping => self.on_ping(message),
|
|
@@ -436,16 +487,26 @@ pub fn Replica(
|
|
|
436
487
|
.start_view_change => self.on_start_view_change(message),
|
|
437
488
|
.do_view_change => self.on_do_view_change(message),
|
|
438
489
|
.start_view => self.on_start_view(message),
|
|
490
|
+
.recovery => self.on_recovery(message),
|
|
491
|
+
.recovery_response => return, // TODO
|
|
439
492
|
.request_start_view => self.on_request_start_view(message),
|
|
440
493
|
.request_prepare => self.on_request_prepare(message),
|
|
441
494
|
.request_headers => self.on_request_headers(message),
|
|
442
495
|
.headers => self.on_headers(message),
|
|
443
496
|
.nack_prepare => self.on_nack_prepare(message),
|
|
444
|
-
|
|
497
|
+
// A replica should never handle misdirected messages intended for a client:
|
|
498
|
+
.eviction, .reply => {
|
|
499
|
+
log.warn("{}: on_message: ignoring misdirected {s} message", .{
|
|
500
|
+
self.replica,
|
|
501
|
+
@tagName(message.header.command),
|
|
502
|
+
});
|
|
503
|
+
return;
|
|
504
|
+
},
|
|
505
|
+
.reserved => unreachable,
|
|
445
506
|
}
|
|
446
507
|
|
|
447
508
|
if (self.loopback_queue) |loopback_message| {
|
|
448
|
-
log.
|
|
509
|
+
log.err("{}: on_message: on_{s}() queued a {s} loopback message with no flush", .{
|
|
449
510
|
self.replica,
|
|
450
511
|
@tagName(message.header.command),
|
|
451
512
|
@tagName(loopback_message.header.command),
|
|
@@ -473,7 +534,15 @@ pub fn Replica(
|
|
|
473
534
|
if (message.header.client > 0) {
|
|
474
535
|
assert(message.header.replica == 0);
|
|
475
536
|
|
|
476
|
-
|
|
537
|
+
// We must only ever send our view number to a client via a pong message if we are
|
|
538
|
+
// in normal status. Otherwise, we may be partitioned from the cluster with a newer
|
|
539
|
+
// view number, leak this to the client, which would then pass this to the cluster
|
|
540
|
+
// in subsequent client requests, which would then ignore these client requests with
|
|
541
|
+
// a newer view number, locking out the client. The principle here is that we must
|
|
542
|
+
// never send view numbers for views that have not yet started.
|
|
543
|
+
if (self.status == .normal) {
|
|
544
|
+
self.send_header_to_client(message.header.client, pong);
|
|
545
|
+
}
|
|
477
546
|
} else if (message.header.replica == self.replica) {
|
|
478
547
|
log.warn("{}: on_ping: ignoring (self)", .{self.replica});
|
|
479
548
|
} else {
|
|
@@ -492,8 +561,7 @@ pub fn Replica(
|
|
|
492
561
|
const t1 = @bitCast(i64, message.header.offset);
|
|
493
562
|
const m2 = self.clock.monotonic();
|
|
494
563
|
|
|
495
|
-
|
|
496
|
-
self.clock.learn(@intCast(u8, message.header.replica), m0, t1, m2);
|
|
564
|
+
self.clock.learn(message.header.replica, m0, t1, m2);
|
|
497
565
|
}
|
|
498
566
|
|
|
499
567
|
/// The primary advances op-number, adds the request to the end of the log, and updates the
|
|
@@ -513,7 +581,7 @@ pub fn Replica(
|
|
|
513
581
|
assert(message.header.view <= self.view); // The client's view may be behind ours.
|
|
514
582
|
|
|
515
583
|
const realtime = self.clock.realtime_synchronized() orelse {
|
|
516
|
-
log.
|
|
584
|
+
log.err("{}: on_request: dropping (clock not synchronized)", .{self.replica});
|
|
517
585
|
return;
|
|
518
586
|
};
|
|
519
587
|
|
|
@@ -531,7 +599,7 @@ pub fn Replica(
|
|
|
531
599
|
message.header.view = self.view;
|
|
532
600
|
message.header.op = self.op + 1;
|
|
533
601
|
message.header.commit = self.commit_max;
|
|
534
|
-
message.header.offset =
|
|
602
|
+
message.header.offset = Journal.next_offset(latest_entry);
|
|
535
603
|
message.header.replica = self.replica;
|
|
536
604
|
message.header.command = .prepare;
|
|
537
605
|
|
|
@@ -597,6 +665,16 @@ pub fn Replica(
|
|
|
597
665
|
return;
|
|
598
666
|
}
|
|
599
667
|
|
|
668
|
+
if (message.header.view < self.view) {
|
|
669
|
+
log.debug("{}: on_prepare: ignoring (older view)", .{self.replica});
|
|
670
|
+
return;
|
|
671
|
+
}
|
|
672
|
+
|
|
673
|
+
if (message.header.view > self.view) {
|
|
674
|
+
log.debug("{}: on_prepare: ignoring (newer view)", .{self.replica});
|
|
675
|
+
return;
|
|
676
|
+
}
|
|
677
|
+
|
|
600
678
|
assert(self.status == .normal);
|
|
601
679
|
assert(message.header.view == self.view);
|
|
602
680
|
assert(self.leader() or self.follower());
|
|
@@ -604,7 +682,7 @@ pub fn Replica(
|
|
|
604
682
|
assert(message.header.op > self.op);
|
|
605
683
|
assert(message.header.op > self.commit_min);
|
|
606
684
|
|
|
607
|
-
if (self.follower()) self.
|
|
685
|
+
if (self.follower()) self.normal_status_timeout.reset();
|
|
608
686
|
|
|
609
687
|
if (message.header.op > self.op + 1) {
|
|
610
688
|
log.debug("{}: on_prepare: newer op", .{self.replica});
|
|
@@ -630,12 +708,6 @@ pub fn Replica(
|
|
|
630
708
|
self.op = message.header.op;
|
|
631
709
|
self.journal.set_entry_as_dirty(message.header);
|
|
632
710
|
|
|
633
|
-
// We have the latest op from the leader and have cleared the view jump barrier:
|
|
634
|
-
if (self.view_jump_barrier) {
|
|
635
|
-
self.view_jump_barrier = false;
|
|
636
|
-
log.debug("{}: on_prepare: cleared view jump barrier", .{self.replica});
|
|
637
|
-
}
|
|
638
|
-
|
|
639
711
|
self.replicate(message);
|
|
640
712
|
self.append(message);
|
|
641
713
|
|
|
@@ -681,6 +753,10 @@ pub fn Replica(
|
|
|
681
753
|
self.commit_pipeline();
|
|
682
754
|
}
|
|
683
755
|
|
|
756
|
+
/// Known issue:
|
|
757
|
+
/// TODO The leader should stand down if it sees too many retries in on_prepare_timeout().
|
|
758
|
+
/// It's possible for the network to be one-way partitioned so that followers don't see the
|
|
759
|
+
/// leader as down, but neither can the leader hear from the followers.
|
|
684
760
|
fn on_commit(self: *Self, message: *const Message) void {
|
|
685
761
|
self.view_jump(message.header);
|
|
686
762
|
|
|
@@ -694,6 +770,11 @@ pub fn Replica(
|
|
|
694
770
|
return;
|
|
695
771
|
}
|
|
696
772
|
|
|
773
|
+
if (message.header.view > self.view) {
|
|
774
|
+
log.debug("{}: on_commit: ignoring (newer view)", .{self.replica});
|
|
775
|
+
return;
|
|
776
|
+
}
|
|
777
|
+
|
|
697
778
|
if (self.leader()) {
|
|
698
779
|
log.warn("{}: on_commit: ignoring (leader)", .{self.replica});
|
|
699
780
|
return;
|
|
@@ -711,13 +792,12 @@ pub fn Replica(
|
|
|
711
792
|
} else if (self.valid_hash_chain("on_commit")) {
|
|
712
793
|
@panic("commit checksum verification failed");
|
|
713
794
|
} else {
|
|
714
|
-
// We may
|
|
715
|
-
// resolving the view jump barrier.
|
|
795
|
+
// We may still be repairing after receiving the start_view message.
|
|
716
796
|
log.debug("{}: on_commit: skipping checksum verification", .{self.replica});
|
|
717
797
|
}
|
|
718
798
|
}
|
|
719
799
|
|
|
720
|
-
self.
|
|
800
|
+
self.normal_status_timeout.reset();
|
|
721
801
|
|
|
722
802
|
self.commit_ops(message.header.commit);
|
|
723
803
|
}
|
|
@@ -768,11 +848,6 @@ pub fn Replica(
|
|
|
768
848
|
return;
|
|
769
849
|
}
|
|
770
850
|
|
|
771
|
-
if (self.view_jump_barrier) {
|
|
772
|
-
log.debug("{}: on_repair: ignoring (view jump barrier)", .{self.replica});
|
|
773
|
-
return;
|
|
774
|
-
}
|
|
775
|
-
|
|
776
851
|
if (self.repair_header(message.header)) {
|
|
777
852
|
assert(self.journal.has_dirty(message.header));
|
|
778
853
|
|
|
@@ -803,6 +878,20 @@ pub fn Replica(
|
|
|
803
878
|
assert(self.status == .view_change);
|
|
804
879
|
assert(message.header.view == self.view);
|
|
805
880
|
|
|
881
|
+
if (self.leader_index(self.view) == self.replica) {
|
|
882
|
+
// If we are the leader of the new view, then wait until we have a message to send a
|
|
883
|
+
// do_view_change message to ourself. The on_do_view_change() handler will panic if
|
|
884
|
+
// we received a start_view_change quorum without a do_view_change to ourself.
|
|
885
|
+
if (self.message_bus.get_message()) |available| {
|
|
886
|
+
self.message_bus.unref(available);
|
|
887
|
+
} else {
|
|
888
|
+
log.err("{}: on_start_view_change: waiting for message for do_view_change", .{
|
|
889
|
+
self.replica,
|
|
890
|
+
});
|
|
891
|
+
return;
|
|
892
|
+
}
|
|
893
|
+
}
|
|
894
|
+
|
|
806
895
|
// Wait until we have `f` messages (excluding ourself) for quorum:
|
|
807
896
|
assert(self.replica_count > 1);
|
|
808
897
|
const threshold = self.quorum_view_change - 1;
|
|
@@ -815,7 +904,10 @@ pub fn Replica(
|
|
|
815
904
|
|
|
816
905
|
assert(count == threshold);
|
|
817
906
|
assert(self.start_view_change_from_other_replicas[self.replica] == null);
|
|
818
|
-
log.debug("{}: on_start_view_change: quorum received", .{
|
|
907
|
+
log.debug("{}: on_start_view_change: view={} quorum received", .{
|
|
908
|
+
self.replica,
|
|
909
|
+
self.view,
|
|
910
|
+
});
|
|
819
911
|
|
|
820
912
|
assert(!self.start_view_change_quorum);
|
|
821
913
|
assert(!self.do_view_change_quorum);
|
|
@@ -872,10 +964,14 @@ pub fn Replica(
|
|
|
872
964
|
|
|
873
965
|
assert(count == threshold);
|
|
874
966
|
assert(self.do_view_change_from_all_replicas[self.replica] != null);
|
|
875
|
-
log.debug("{}: on_do_view_change: quorum received", .{
|
|
967
|
+
log.debug("{}: on_do_view_change: view={} quorum received", .{
|
|
968
|
+
self.replica,
|
|
969
|
+
self.view,
|
|
970
|
+
});
|
|
876
971
|
|
|
877
|
-
var
|
|
972
|
+
var v: ?u32 = null;
|
|
878
973
|
var k: ?u64 = null;
|
|
974
|
+
var latest = Header.reserved();
|
|
879
975
|
|
|
880
976
|
for (self.do_view_change_from_all_replicas) |received, replica| {
|
|
881
977
|
if (received) |m| {
|
|
@@ -884,13 +980,40 @@ pub fn Replica(
|
|
|
884
980
|
assert(m.header.replica == replica);
|
|
885
981
|
assert(m.header.view == self.view);
|
|
886
982
|
|
|
983
|
+
// The latest normal view experienced by this replica:
|
|
984
|
+
// This may be higher than the view in any of the prepare headers.
|
|
985
|
+
var replica_view_normal = @intCast(u32, m.header.offset);
|
|
986
|
+
assert(replica_view_normal < m.header.view);
|
|
987
|
+
|
|
988
|
+
var replica_latest = Header.reserved();
|
|
989
|
+
set_latest_op(self.message_body_as_headers(m), &replica_latest);
|
|
990
|
+
assert(replica_latest.op == m.header.op);
|
|
991
|
+
|
|
992
|
+
log.debug(
|
|
993
|
+
"{}: on_do_view_change: replica={} v'={} op={} commit={} latest={}",
|
|
994
|
+
.{
|
|
995
|
+
self.replica,
|
|
996
|
+
m.header.replica,
|
|
997
|
+
replica_view_normal,
|
|
998
|
+
m.header.op,
|
|
999
|
+
m.header.commit,
|
|
1000
|
+
replica_latest,
|
|
1001
|
+
},
|
|
1002
|
+
);
|
|
1003
|
+
|
|
1004
|
+
if (v == null or replica_view_normal > v.?) {
|
|
1005
|
+
v = replica_view_normal;
|
|
1006
|
+
latest = replica_latest;
|
|
1007
|
+
} else if (replica_view_normal == v.? and replica_latest.op > latest.op) {
|
|
1008
|
+
v = replica_view_normal;
|
|
1009
|
+
latest = replica_latest;
|
|
1010
|
+
}
|
|
1011
|
+
|
|
887
1012
|
if (k == null or m.header.commit > k.?) k = m.header.commit;
|
|
888
|
-
self.set_latest_header(self.message_body_as_headers(m), &latest);
|
|
889
1013
|
}
|
|
890
1014
|
}
|
|
891
1015
|
|
|
892
1016
|
self.set_latest_op_and_k(&latest, k.?, "on_do_view_change");
|
|
893
|
-
assert(!self.view_jump_barrier);
|
|
894
1017
|
|
|
895
1018
|
// Now that we have the latest op in place, repair any other headers:
|
|
896
1019
|
for (self.do_view_change_from_all_replicas) |received| {
|
|
@@ -908,6 +1031,10 @@ pub fn Replica(
|
|
|
908
1031
|
assert(!self.do_view_change_quorum);
|
|
909
1032
|
self.do_view_change_quorum = true;
|
|
910
1033
|
|
|
1034
|
+
self.discard_uncommitted_headers();
|
|
1035
|
+
assert(self.op >= self.commit_max);
|
|
1036
|
+
assert(self.journal.entry_for_op_exact(self.op) != null);
|
|
1037
|
+
|
|
911
1038
|
// Start repairs according to the CTRL protocol:
|
|
912
1039
|
assert(!self.repair_timeout.ticking);
|
|
913
1040
|
self.repair_timeout.start();
|
|
@@ -924,24 +1051,21 @@ pub fn Replica(
|
|
|
924
1051
|
fn on_start_view(self: *Self, message: *const Message) void {
|
|
925
1052
|
if (self.ignore_view_change_message(message)) return;
|
|
926
1053
|
|
|
927
|
-
assert(self.status == .
|
|
1054
|
+
assert(self.status == .view_change or self.status == .normal);
|
|
928
1055
|
assert(message.header.view >= self.view);
|
|
929
1056
|
assert(message.header.replica != self.replica);
|
|
930
1057
|
assert(message.header.replica == self.leader_index(message.header.view));
|
|
931
1058
|
|
|
932
1059
|
self.view_jump(message.header);
|
|
933
1060
|
|
|
934
|
-
|
|
935
|
-
// same view in normal status if a view jump barrier exists that needs to be cleared:
|
|
936
|
-
assert(self.status == .view_change or self.view_jump_barrier);
|
|
1061
|
+
assert(self.status == .view_change);
|
|
937
1062
|
assert(message.header.view == self.view);
|
|
938
1063
|
|
|
939
1064
|
var latest = Header.reserved();
|
|
940
|
-
|
|
1065
|
+
set_latest_op(self.message_body_as_headers(message), &latest);
|
|
941
1066
|
assert(latest.op == message.header.op);
|
|
942
1067
|
|
|
943
1068
|
self.set_latest_op_and_k(&latest, message.header.commit, "on_start_view");
|
|
944
|
-
assert(!self.view_jump_barrier);
|
|
945
1069
|
|
|
946
1070
|
// Now that we have the latest op in place, repair any other headers:
|
|
947
1071
|
for (self.message_body_as_headers(message)) |*h| {
|
|
@@ -974,7 +1098,7 @@ pub fn Replica(
|
|
|
974
1098
|
assert(self.leader());
|
|
975
1099
|
|
|
976
1100
|
const start_view = self.create_view_change_message(.start_view) orelse {
|
|
977
|
-
log.
|
|
1101
|
+
log.err("{}: on_request_start_view: dropping start_view, no message available", .{
|
|
978
1102
|
self.replica,
|
|
979
1103
|
});
|
|
980
1104
|
return;
|
|
@@ -990,6 +1114,69 @@ pub fn Replica(
|
|
|
990
1114
|
self.send_message_to_replica(message.header.replica, start_view);
|
|
991
1115
|
}
|
|
992
1116
|
|
|
1117
|
+
/// TODO This is a work in progress (out of scope for the bounty)
|
|
1118
|
+
fn on_recovery(self: *Self, message: *const Message) void {
|
|
1119
|
+
if (self.status != .normal) {
|
|
1120
|
+
log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
|
|
1121
|
+
return;
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
if (message.header.replica == self.replica) {
|
|
1125
|
+
log.warn("{}: on_recovery: ignoring (self)", .{self.replica});
|
|
1126
|
+
return;
|
|
1127
|
+
}
|
|
1128
|
+
|
|
1129
|
+
const response = self.message_bus.get_message() orelse {
|
|
1130
|
+
log.err("{}: on_recovery: ignoring (waiting for message)", .{self.replica});
|
|
1131
|
+
return;
|
|
1132
|
+
};
|
|
1133
|
+
defer self.message_bus.unref(response);
|
|
1134
|
+
|
|
1135
|
+
response.header.* = .{
|
|
1136
|
+
.command = .recovery_response,
|
|
1137
|
+
.cluster = self.cluster,
|
|
1138
|
+
.context = message.header.context,
|
|
1139
|
+
.replica = self.replica,
|
|
1140
|
+
.view = self.view,
|
|
1141
|
+
.op = self.op,
|
|
1142
|
+
.commit = self.commit_max,
|
|
1143
|
+
};
|
|
1144
|
+
|
|
1145
|
+
const count_max = 8; // The number of prepare headers to include in the body.
|
|
1146
|
+
|
|
1147
|
+
const size_max = @sizeOf(Header) * std.math.min(
|
|
1148
|
+
std.math.max(@divFloor(response.buffer.len, @sizeOf(Header)), 2),
|
|
1149
|
+
1 + count_max,
|
|
1150
|
+
);
|
|
1151
|
+
assert(size_max > @sizeOf(Header));
|
|
1152
|
+
|
|
1153
|
+
const count = self.journal.copy_latest_headers_between(
|
|
1154
|
+
0,
|
|
1155
|
+
self.op,
|
|
1156
|
+
std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
|
|
1157
|
+
);
|
|
1158
|
+
|
|
1159
|
+
// We expect that self.op always exists.
|
|
1160
|
+
assert(count > 0);
|
|
1161
|
+
|
|
1162
|
+
response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
|
|
1163
|
+
|
|
1164
|
+
response.header.set_checksum_body(response.body());
|
|
1165
|
+
response.header.set_checksum();
|
|
1166
|
+
|
|
1167
|
+
assert(self.status == .normal);
|
|
1168
|
+
// The checksum for a recovery message is deterministic, and cannot be used as a nonce:
|
|
1169
|
+
assert(response.header.context != message.header.checksum);
|
|
1170
|
+
|
|
1171
|
+
self.send_message_to_replica(message.header.replica, response);
|
|
1172
|
+
}
|
|
1173
|
+
|
|
1174
|
+
/// TODO This is a work in progress (out of scope for the bounty)
|
|
1175
|
+
fn on_recovery_response(self: *Self, message: *Message) void {
|
|
1176
|
+
_ = self;
|
|
1177
|
+
_ = message;
|
|
1178
|
+
}
|
|
1179
|
+
|
|
993
1180
|
fn on_request_prepare(self: *Self, message: *const Message) void {
|
|
994
1181
|
if (self.ignore_repair_message(message)) return;
|
|
995
1182
|
|
|
@@ -1008,6 +1195,15 @@ pub fn Replica(
|
|
|
1008
1195
|
if (!self.journal.dirty.bit(op)) {
|
|
1009
1196
|
assert(!self.journal.faulty.bit(op));
|
|
1010
1197
|
|
|
1198
|
+
log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
|
|
1199
|
+
self.replica,
|
|
1200
|
+
op,
|
|
1201
|
+
checksum,
|
|
1202
|
+
});
|
|
1203
|
+
|
|
1204
|
+
// TODO Do not reissue the read if we are already reading in order to send to
|
|
1205
|
+
// this particular destination replica.
|
|
1206
|
+
|
|
1011
1207
|
self.journal.read_prepare(
|
|
1012
1208
|
on_request_prepare_read,
|
|
1013
1209
|
op,
|
|
@@ -1018,6 +1214,12 @@ pub fn Replica(
|
|
|
1018
1214
|
// We have guaranteed the prepare and our copy is clean (not safe to nack).
|
|
1019
1215
|
return;
|
|
1020
1216
|
} else if (self.journal.faulty.bit(op)) {
|
|
1217
|
+
log.debug("{}: on_request_prepare: op={} checksum={} faulty", .{
|
|
1218
|
+
self.replica,
|
|
1219
|
+
op,
|
|
1220
|
+
checksum,
|
|
1221
|
+
});
|
|
1222
|
+
|
|
1021
1223
|
// We have guaranteed the prepare but our copy is faulty (not safe to nack).
|
|
1022
1224
|
return;
|
|
1023
1225
|
}
|
|
@@ -1032,6 +1234,13 @@ pub fn Replica(
|
|
|
1032
1234
|
if (self.journal.entry_for_op_exact_with_checksum(op, checksum) != null) {
|
|
1033
1235
|
assert(self.journal.dirty.bit(op) and !self.journal.faulty.bit(op));
|
|
1034
1236
|
}
|
|
1237
|
+
|
|
1238
|
+
log.debug("{}: on_request_prepare: op={} checksum={} nacking", .{
|
|
1239
|
+
self.replica,
|
|
1240
|
+
op,
|
|
1241
|
+
checksum,
|
|
1242
|
+
});
|
|
1243
|
+
|
|
1035
1244
|
self.send_header_to_replica(message.header.replica, .{
|
|
1036
1245
|
.command = .nack_prepare,
|
|
1037
1246
|
.context = checksum.?,
|
|
@@ -1044,7 +1253,17 @@ pub fn Replica(
|
|
|
1044
1253
|
}
|
|
1045
1254
|
|
|
1046
1255
|
fn on_request_prepare_read(self: *Self, prepare: ?*Message, destination_replica: ?u8) void {
|
|
1047
|
-
const message = prepare orelse
|
|
1256
|
+
const message = prepare orelse {
|
|
1257
|
+
log.debug("{}: on_request_prepare_read: prepare=null", .{self.replica});
|
|
1258
|
+
return;
|
|
1259
|
+
};
|
|
1260
|
+
|
|
1261
|
+
log.debug("{}: on_request_prepare_read: op={} checksum={} sending to replica={}", .{
|
|
1262
|
+
self.replica,
|
|
1263
|
+
message.header.op,
|
|
1264
|
+
message.header.checksum,
|
|
1265
|
+
destination_replica.?,
|
|
1266
|
+
});
|
|
1048
1267
|
|
|
1049
1268
|
assert(destination_replica.? != self.replica);
|
|
1050
1269
|
self.send_message_to_replica(destination_replica.?, message);
|
|
@@ -1058,8 +1277,10 @@ pub fn Replica(
|
|
|
1058
1277
|
assert(message.header.replica != self.replica);
|
|
1059
1278
|
|
|
1060
1279
|
const response = self.message_bus.get_message() orelse {
|
|
1061
|
-
log.
|
|
1280
|
+
log.err("{}: on_request_headers: ignoring (op={}..{}, no message available)", .{
|
|
1062
1281
|
self.replica,
|
|
1282
|
+
message.header.commit,
|
|
1283
|
+
message.header.op,
|
|
1063
1284
|
});
|
|
1064
1285
|
return;
|
|
1065
1286
|
};
|
|
@@ -1095,7 +1316,11 @@ pub fn Replica(
|
|
|
1095
1316
|
);
|
|
1096
1317
|
|
|
1097
1318
|
if (count == 0) {
|
|
1098
|
-
log.debug("{}: on_request_headers: no headers
|
|
1319
|
+
log.debug("{}: on_request_headers: ignoring (op={}..{}, no headers)", .{
|
|
1320
|
+
self.replica,
|
|
1321
|
+
op_min,
|
|
1322
|
+
op_max,
|
|
1323
|
+
});
|
|
1099
1324
|
return;
|
|
1100
1325
|
}
|
|
1101
1326
|
|
|
@@ -1130,17 +1355,59 @@ pub fn Replica(
|
|
|
1130
1355
|
return;
|
|
1131
1356
|
}
|
|
1132
1357
|
|
|
1358
|
+
if (message.header.context != checksum) {
|
|
1359
|
+
log.debug("{}: on_nack_prepare: ignoring (repairing another checksum)", .{
|
|
1360
|
+
self.replica,
|
|
1361
|
+
});
|
|
1362
|
+
return;
|
|
1363
|
+
}
|
|
1364
|
+
|
|
1133
1365
|
// Followers may not send a `nack_prepare` for a different checksum:
|
|
1134
|
-
//
|
|
1366
|
+
// However our op may change in between sending the request and getting the nack.
|
|
1367
|
+
assert(message.header.op == op);
|
|
1135
1368
|
assert(message.header.context == checksum);
|
|
1136
1369
|
|
|
1137
|
-
//
|
|
1138
|
-
//
|
|
1370
|
+
// Here is what our nack quorums look like, if we know our op is faulty:
|
|
1371
|
+
// These are for various replication quorums under Flexible Paxos.
|
|
1372
|
+
// We need to have enough nacks to guarantee that `quorum_replication` was not reached,
|
|
1373
|
+
// because if the replication quorum was reached, then it may have been committed.
|
|
1374
|
+
// We add `1` in each case because our op is faulty and may have been counted.
|
|
1375
|
+
//
|
|
1376
|
+
// replica_count=2 - quorum_replication=2 + 1 = 0 + 1 = 1 nacks required
|
|
1377
|
+
// replica_count=3 - quorum_replication=2 + 1 = 1 + 1 = 2 nacks required
|
|
1378
|
+
// replica_count=4 - quorum_replication=2 + 1 = 2 + 1 = 3 nacks required
|
|
1379
|
+
// replica_count=4 - quorum_replication=3 + 1 = 1 + 1 = 2 nacks required
|
|
1380
|
+
// replica_count=5 - quorum_replication=2 + 1 = 3 + 1 = 4 nacks required
|
|
1381
|
+
// replica_count=5 - quorum_replication=3 + 1 = 2 + 1 = 3 nacks required
|
|
1382
|
+
//
|
|
1383
|
+
// Otherwise, if we know we do not have the op, then we can exclude ourselves.
|
|
1139
1384
|
assert(self.replica_count > 1);
|
|
1385
|
+
|
|
1140
1386
|
const threshold = if (self.journal.faulty.bit(op))
|
|
1141
|
-
self.
|
|
1387
|
+
self.replica_count - self.quorum_replication + 1
|
|
1142
1388
|
else
|
|
1143
|
-
self.
|
|
1389
|
+
self.replica_count - self.quorum_replication;
|
|
1390
|
+
|
|
1391
|
+
if (threshold == 0) {
|
|
1392
|
+
assert(self.replica_count == 2);
|
|
1393
|
+
assert(!self.journal.faulty.bit(op));
|
|
1394
|
+
|
|
1395
|
+
// This is a special case for a cluster-of-two, handled in `repair_prepare()`.
|
|
1396
|
+
log.debug("{}: on_nack_prepare: ignoring (cluster-of-two, not faulty)", .{
|
|
1397
|
+
self.replica,
|
|
1398
|
+
});
|
|
1399
|
+
return;
|
|
1400
|
+
}
|
|
1401
|
+
|
|
1402
|
+
log.debug("{}: on_nack_prepare: quorum_replication={} threshold={}", .{
|
|
1403
|
+
self.replica,
|
|
1404
|
+
self.quorum_replication,
|
|
1405
|
+
threshold,
|
|
1406
|
+
});
|
|
1407
|
+
|
|
1408
|
+
// We should never expect to receive a nack from ourselves:
|
|
1409
|
+
// Detect if we ever set `threshold` to `quorum_view_change` for a cluster-of-two again.
|
|
1410
|
+
assert(threshold < self.replica_count);
|
|
1144
1411
|
|
|
1145
1412
|
// Wait until we have `threshold` messages for quorum:
|
|
1146
1413
|
const count = self.add_message_and_receive_quorum_exactly_once(
|
|
@@ -1153,32 +1420,8 @@ pub fn Replica(
|
|
|
1153
1420
|
assert(self.nack_prepare_from_other_replicas[self.replica] == null);
|
|
1154
1421
|
log.debug("{}: on_nack_prepare: quorum received", .{self.replica});
|
|
1155
1422
|
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
assert(op > self.commit_max);
|
|
1159
|
-
assert(op <= self.op);
|
|
1160
|
-
assert(self.journal.entry_for_op_exact_with_checksum(op, checksum) != null);
|
|
1161
|
-
assert(self.journal.dirty.bit(op));
|
|
1162
|
-
|
|
1163
|
-
log.debug("{}: on_nack_prepare: discarding uncommitted ops={}..{}", .{
|
|
1164
|
-
self.replica,
|
|
1165
|
-
op,
|
|
1166
|
-
self.op,
|
|
1167
|
-
});
|
|
1168
|
-
|
|
1169
|
-
self.journal.remove_entries_from(op);
|
|
1170
|
-
self.op = op - 1;
|
|
1171
|
-
|
|
1172
|
-
assert(self.journal.entry_for_op(op) == null);
|
|
1173
|
-
assert(!self.journal.dirty.bit(op));
|
|
1174
|
-
assert(!self.journal.faulty.bit(op));
|
|
1175
|
-
|
|
1176
|
-
// We require that `self.op` always exists. Rewinding `self.op` could change that.
|
|
1177
|
-
// However, we do this only as the leader within a view change, with all headers intact.
|
|
1178
|
-
assert(self.journal.entry_for_op_exact(self.op) != null);
|
|
1179
|
-
|
|
1423
|
+
self.discard_uncommitted_ops_from(op, checksum);
|
|
1180
1424
|
self.reset_quorum_nack_prepare();
|
|
1181
|
-
|
|
1182
1425
|
self.repair();
|
|
1183
1426
|
}
|
|
1184
1427
|
|
|
@@ -1192,11 +1435,6 @@ pub fn Replica(
|
|
|
1192
1435
|
// We expect at least one header in the body, or otherwise no response to our request.
|
|
1193
1436
|
assert(message.header.size > @sizeOf(Header));
|
|
1194
1437
|
|
|
1195
|
-
if (self.view_jump_barrier) {
|
|
1196
|
-
log.debug("{}: on_headers: ignoring (view jump barrier)", .{self.replica});
|
|
1197
|
-
return;
|
|
1198
|
-
}
|
|
1199
|
-
|
|
1200
1438
|
var op_min: ?u64 = null;
|
|
1201
1439
|
var op_max: ?u64 = null;
|
|
1202
1440
|
for (self.message_body_as_headers(message)) |*h| {
|
|
@@ -1232,7 +1470,7 @@ pub fn Replica(
|
|
|
1232
1470
|
assert(self.status == .normal);
|
|
1233
1471
|
assert(self.leader());
|
|
1234
1472
|
|
|
1235
|
-
const prepare = self.pipeline.
|
|
1473
|
+
const prepare = self.pipeline.head_ptr().?;
|
|
1236
1474
|
assert(prepare.message.header.command == .prepare);
|
|
1237
1475
|
|
|
1238
1476
|
if (prepare.ok_quorum_received) {
|
|
@@ -1261,10 +1499,24 @@ pub fn Replica(
|
|
|
1261
1499
|
|
|
1262
1500
|
log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
|
|
1263
1501
|
assert(prepare.ok_from_all_replicas[self.replica] == null);
|
|
1502
|
+
|
|
1503
|
+
// We may be slow and waiting for the write to complete.
|
|
1504
|
+
//
|
|
1505
|
+
// We may even have maxed out our IO depth and been unable to initiate the write,
|
|
1506
|
+
// which can happen if `config.pipelining_max` exceeds `config.io_depth_write`.
|
|
1507
|
+
// This can lead to deadlock for a cluster of one or two (if we do not retry here),
|
|
1508
|
+
// since there is no other way for the leader to repair the dirty op because no
|
|
1509
|
+
// other replica has it.
|
|
1510
|
+
//
|
|
1511
|
+
// Retry the write through `on_repair()` which will work out which is which.
|
|
1512
|
+
// We do expect that the op would have been run through `on_prepare()` already.
|
|
1513
|
+
assert(prepare.message.header.op <= self.op);
|
|
1514
|
+
self.on_repair(prepare.message);
|
|
1515
|
+
|
|
1264
1516
|
return;
|
|
1265
1517
|
}
|
|
1266
1518
|
|
|
1267
|
-
self.prepare_timeout.backoff(
|
|
1519
|
+
self.prepare_timeout.backoff(self.prng.random());
|
|
1268
1520
|
|
|
1269
1521
|
assert(waiting_len <= self.replica_count);
|
|
1270
1522
|
for (waiting[0..waiting_len]) |replica| {
|
|
@@ -1277,7 +1529,7 @@ pub fn Replica(
|
|
|
1277
1529
|
}
|
|
1278
1530
|
|
|
1279
1531
|
// Cycle through the list to reach live replicas and get around partitions:
|
|
1280
|
-
assert
|
|
1532
|
+
// We do not assert `prepare_timeout.attempts > 0` since the counter may wrap back to 0.
|
|
1281
1533
|
const replica = waiting[self.prepare_timeout.attempts % waiting_len];
|
|
1282
1534
|
assert(replica != self.replica);
|
|
1283
1535
|
|
|
@@ -1305,13 +1557,13 @@ pub fn Replica(
|
|
|
1305
1557
|
});
|
|
1306
1558
|
}
|
|
1307
1559
|
|
|
1308
|
-
fn
|
|
1560
|
+
fn on_normal_status_timeout(self: *Self) void {
|
|
1309
1561
|
assert(self.status == .normal);
|
|
1310
1562
|
assert(self.follower());
|
|
1311
1563
|
self.transition_to_view_change_status(self.view + 1);
|
|
1312
1564
|
}
|
|
1313
1565
|
|
|
1314
|
-
fn
|
|
1566
|
+
fn on_view_change_status_timeout(self: *Self) void {
|
|
1315
1567
|
assert(self.status == .view_change);
|
|
1316
1568
|
self.transition_to_view_change_status(self.view + 1);
|
|
1317
1569
|
}
|
|
@@ -1433,7 +1685,6 @@ pub fn Replica(
|
|
|
1433
1685
|
|
|
1434
1686
|
/// Returns whether `b` succeeds `a` by having a newer view or same view and newer op.
|
|
1435
1687
|
fn ascending_viewstamps(
|
|
1436
|
-
self: *Self,
|
|
1437
1688
|
a: *const Header,
|
|
1438
1689
|
b: *const Header,
|
|
1439
1690
|
) bool {
|
|
@@ -1475,6 +1726,8 @@ pub fn Replica(
|
|
|
1475
1726
|
}
|
|
1476
1727
|
|
|
1477
1728
|
/// Commit ops up to commit number `commit` (inclusive).
|
|
1729
|
+
/// A function which calls `commit_ops()` to set `commit_max` must first call `view_jump()`.
|
|
1730
|
+
/// Otherwise, we may fork the log.
|
|
1478
1731
|
fn commit_ops(self: *Self, commit: u64) void {
|
|
1479
1732
|
// TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
|
|
1480
1733
|
// Be careful of concurrency when doing this, as successive view changes can happen quickly.
|
|
@@ -1504,9 +1757,15 @@ pub fn Replica(
|
|
|
1504
1757
|
return;
|
|
1505
1758
|
}
|
|
1506
1759
|
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
|
|
1760
|
+
// We check the hash chain before we read each op, rather than once upfront, because
|
|
1761
|
+
// it's possible for `commit_max` to change while we read asynchronously, after we
|
|
1762
|
+
// validate the hash chain.
|
|
1763
|
+
//
|
|
1764
|
+
// We therefore cannot keep committing until we reach `commit_max`. We need to verify
|
|
1765
|
+
// the hash chain before each read. Once verified (before the read) we can commit in the
|
|
1766
|
+
// callback after the read, but if we see a change we need to stop committing any
|
|
1767
|
+
// further ops, because `commit_max` may have been bumped and may refer to a different
|
|
1768
|
+
// op.
|
|
1510
1769
|
|
|
1511
1770
|
assert(!self.committing);
|
|
1512
1771
|
self.committing = true;
|
|
@@ -1520,6 +1779,12 @@ pub fn Replica(
|
|
|
1520
1779
|
assert(self.commit_min <= self.commit_max);
|
|
1521
1780
|
assert(self.commit_min <= self.op);
|
|
1522
1781
|
|
|
1782
|
+
if (!self.valid_hash_chain("commit_ops_read")) {
|
|
1783
|
+
self.committing = false;
|
|
1784
|
+
return;
|
|
1785
|
+
}
|
|
1786
|
+
assert(self.op >= self.commit_max);
|
|
1787
|
+
|
|
1523
1788
|
// We may receive commit numbers for ops we do not yet have (`commit_max > self.op`):
|
|
1524
1789
|
// Even a naive state transfer may fail to correct for this.
|
|
1525
1790
|
if (self.commit_min < self.commit_max and self.commit_min < self.op) {
|
|
@@ -1571,7 +1836,7 @@ pub fn Replica(
|
|
|
1571
1836
|
|
|
1572
1837
|
// TODO We can optimize this to commit into the client table reply if it exists.
|
|
1573
1838
|
const reply = self.message_bus.get_message() orelse {
|
|
1574
|
-
log.
|
|
1839
|
+
log.err("{}: commit_ops_commit: waiting for message", .{self.replica});
|
|
1575
1840
|
return;
|
|
1576
1841
|
};
|
|
1577
1842
|
defer self.message_bus.unref(reply);
|
|
@@ -1594,12 +1859,17 @@ pub fn Replica(
|
|
|
1594
1859
|
assert(prepare.header.op == self.commit_min + 1);
|
|
1595
1860
|
assert(prepare.header.op <= self.op);
|
|
1596
1861
|
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1862
|
+
// If we are a follower committing through `commit_ops()` then a view change may have
|
|
1863
|
+
// happened since we last checked in `commit_ops_read()`. However, this would relate to
|
|
1864
|
+
// subsequent ops, since by now we have already verified the hash chain for this commit.
|
|
1600
1865
|
|
|
1601
|
-
|
|
1866
|
+
assert(self.journal.entry_for_op_exact(self.commit_min).?.checksum ==
|
|
1867
|
+
prepare.header.parent);
|
|
1868
|
+
|
|
1869
|
+
log.debug("{}: commit_op: executing view={} {} op={} checksum={} ({s})", .{
|
|
1602
1870
|
self.replica,
|
|
1871
|
+
self.view,
|
|
1872
|
+
self.leader_index(self.view) == self.replica,
|
|
1603
1873
|
prepare.header.op,
|
|
1604
1874
|
prepare.header.checksum,
|
|
1605
1875
|
@tagName(prepare.header.operation.cast(StateMachine)),
|
|
@@ -1657,7 +1927,7 @@ pub fn Replica(
|
|
|
1657
1927
|
assert(self.leader());
|
|
1658
1928
|
assert(self.pipeline.count > 0);
|
|
1659
1929
|
|
|
1660
|
-
while (self.pipeline.
|
|
1930
|
+
while (self.pipeline.head_ptr()) |prepare| {
|
|
1661
1931
|
assert(self.pipeline.count > 0);
|
|
1662
1932
|
assert(self.commit_min == self.commit_max);
|
|
1663
1933
|
assert(self.commit_max + self.pipeline.count == self.op);
|
|
@@ -1679,7 +1949,7 @@ pub fn Replica(
|
|
|
1679
1949
|
// TODO We can optimize this to commit into the client table reply if it exists.
|
|
1680
1950
|
const reply = self.message_bus.get_message() orelse {
|
|
1681
1951
|
// Eventually handled by on_prepare_timeout().
|
|
1682
|
-
log.
|
|
1952
|
+
log.err("{}: commit_pipeline: waiting for message", .{self.replica});
|
|
1683
1953
|
return;
|
|
1684
1954
|
};
|
|
1685
1955
|
defer self.message_bus.unref(reply);
|
|
@@ -1760,13 +2030,14 @@ pub fn Replica(
|
|
|
1760
2030
|
assert(request == 0);
|
|
1761
2031
|
|
|
1762
2032
|
// For correctness, it's critical that all replicas evict deterministically:
|
|
1763
|
-
// We cannot depend on `HashMap.capacity()` since `HashMap.
|
|
1764
|
-
// across
|
|
1765
|
-
// which must be the same across all replicas, and must not change after
|
|
1766
|
-
//
|
|
1767
|
-
//
|
|
2033
|
+
// We cannot depend on `HashMap.capacity()` since `HashMap.ensureTotalCapacity()` may
|
|
2034
|
+
// change across versions of the Zig std lib. We therefore rely on `config.clients_max`,
|
|
2035
|
+
// which must be the same across all replicas, and must not change after initializing a
|
|
2036
|
+
// cluster.
|
|
2037
|
+
// We also do not depend on `HashMap.valueIterator()` being deterministic here. However,
|
|
2038
|
+
// we do require that all entries have different commit numbers and are iterated.
|
|
1768
2039
|
// This ensures that we will always pick the entry with the oldest commit number.
|
|
1769
|
-
// We also
|
|
2040
|
+
// We also check that a client has only one entry in the hash map (or it's buggy).
|
|
1770
2041
|
const clients = self.client_table.count();
|
|
1771
2042
|
assert(clients <= config.clients_max);
|
|
1772
2043
|
if (clients == config.clients_max) {
|
|
@@ -1791,7 +2062,7 @@ pub fn Replica(
|
|
|
1791
2062
|
}
|
|
1792
2063
|
}
|
|
1793
2064
|
assert(iterated == clients);
|
|
1794
|
-
log.
|
|
2065
|
+
log.err("{}: create_client_table_entry: clients={}/{} evicting client={}", .{
|
|
1795
2066
|
self.replica,
|
|
1796
2067
|
clients,
|
|
1797
2068
|
config.clients_max,
|
|
@@ -1833,11 +2104,23 @@ pub fn Replica(
|
|
|
1833
2104
|
.cluster = self.cluster,
|
|
1834
2105
|
.replica = self.replica,
|
|
1835
2106
|
.view = self.view,
|
|
2107
|
+
// The latest normal view (as specified in the 2012 paper) is different to the view
|
|
2108
|
+
// number contained in the prepare headers we include in the body. The former shows
|
|
2109
|
+
// how recent a view change the replica participated in, which may be much higher.
|
|
2110
|
+
// We use the `offset` field to send this in addition to the current view number:
|
|
2111
|
+
.offset = if (command == .do_view_change) self.view_normal else 0,
|
|
1836
2112
|
.op = self.op,
|
|
1837
2113
|
.commit = self.commit_max,
|
|
1838
2114
|
};
|
|
1839
2115
|
|
|
1840
|
-
|
|
2116
|
+
// CRITICAL: The number of prepare headers to include in the body:
|
|
2117
|
+
// We must provide enough headers to cover all uncommitted headers so that the new
|
|
2118
|
+
// leader (if we are in a view change) can decide whether to discard uncommitted headers
|
|
2119
|
+
// that cannot be repaired because they are gaps, and this must be relative to the
|
|
2120
|
+
// cluster as a whole (not relative to the difference between our op and commit number)
|
|
2121
|
+
// as otherwise we would break correctness.
|
|
2122
|
+
const count_max = config.pipelining_max;
|
|
2123
|
+
assert(count_max > 0);
|
|
1841
2124
|
|
|
1842
2125
|
const size_max = @sizeOf(Header) * std.math.min(
|
|
1843
2126
|
std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
|
|
@@ -1865,7 +2148,7 @@ pub fn Replica(
|
|
|
1865
2148
|
/// The caller owns the returned message, if any, which has exactly 1 reference.
|
|
1866
2149
|
fn create_message_from_header(self: *Self, header: Header) ?*Message {
|
|
1867
2150
|
assert(header.replica == self.replica);
|
|
1868
|
-
assert(header.view == self.view);
|
|
2151
|
+
assert(header.view == self.view or header.command == .request_start_view);
|
|
1869
2152
|
assert(header.size == @sizeOf(Header));
|
|
1870
2153
|
|
|
1871
2154
|
const message = self.message_bus.pool.get_header_only_message() orelse return null;
|
|
@@ -1878,6 +2161,112 @@ pub fn Replica(
|
|
|
1878
2161
|
return message.ref();
|
|
1879
2162
|
}
|
|
1880
2163
|
|
|
2164
|
+
/// Discards uncommitted headers during a view change before the new leader starts the view.
|
|
2165
|
+
/// This is required to maximize availability in the presence of storage faults.
|
|
2166
|
+
/// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
|
|
2167
|
+
///
|
|
2168
|
+
/// It's possible for the new leader to have done an op jump in a previous view, and so
|
|
2169
|
+
/// introduced a header gap for an op, which was then discarded by another leader during a
|
|
2170
|
+
/// newer view change, before surviving into this view as a gap because our latest op was
|
|
2171
|
+
/// set as the latest op for the quorum.
|
|
2172
|
+
///
|
|
2173
|
+
/// In this case, it may be impossible for the new leader to repair the missing header since
|
|
2174
|
+
/// the rest of the cluster may have already discarded it. We therefore iterate over our
|
|
2175
|
+
/// uncommitted header gaps and compare them with the quorum of do_view_change messages
|
|
2176
|
+
/// received from other replicas, before starting the new view, to discard any that may be
|
|
2177
|
+
/// impossible to repair.
|
|
2178
|
+
fn discard_uncommitted_headers(self: *Self) void {
|
|
2179
|
+
assert(self.status == .view_change);
|
|
2180
|
+
assert(self.leader_index(self.view) == self.replica);
|
|
2181
|
+
assert(self.do_view_change_quorum);
|
|
2182
|
+
assert(!self.repair_timeout.ticking);
|
|
2183
|
+
assert(self.op >= self.commit_max);
|
|
2184
|
+
assert(self.replica_count > 1);
|
|
2185
|
+
|
|
2186
|
+
const threshold = self.replica_count - self.quorum_replication;
|
|
2187
|
+
if (threshold == 0) {
|
|
2188
|
+
assert(self.replica_count == 2);
|
|
2189
|
+
return;
|
|
2190
|
+
}
|
|
2191
|
+
|
|
2192
|
+
var op = self.op;
|
|
2193
|
+
while (op > self.commit_max) : (op -= 1) {
|
|
2194
|
+
if (self.journal.entry_for_op_exact(op) != null) continue;
|
|
2195
|
+
|
|
2196
|
+
log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });
|
|
2197
|
+
|
|
2198
|
+
var nacks: usize = 0;
|
|
2199
|
+
for (self.do_view_change_from_all_replicas) |received, replica| {
|
|
2200
|
+
if (received) |m| {
|
|
2201
|
+
assert(m.header.command == .do_view_change);
|
|
2202
|
+
assert(m.header.cluster == self.cluster);
|
|
2203
|
+
assert(m.header.replica == replica);
|
|
2204
|
+
assert(m.header.view == self.view);
|
|
2205
|
+
|
|
2206
|
+
if (replica != self.replica) {
|
|
2207
|
+
if (m.header.op < op) nacks += 1;
|
|
2208
|
+
|
|
2209
|
+
log.debug("{}: discard_uncommitted_headers: replica={} op={}", .{
|
|
2210
|
+
self.replica,
|
|
2211
|
+
m.header.replica,
|
|
2212
|
+
m.header.op,
|
|
2213
|
+
});
|
|
2214
|
+
}
|
|
2215
|
+
}
|
|
2216
|
+
}
|
|
2217
|
+
|
|
2218
|
+
log.debug("{}: discard_uncommitted_headers: op={} nacks={} threshold={}", .{
|
|
2219
|
+
self.replica,
|
|
2220
|
+
op,
|
|
2221
|
+
nacks,
|
|
2222
|
+
threshold,
|
|
2223
|
+
});
|
|
2224
|
+
|
|
2225
|
+
if (nacks >= threshold) {
|
|
2226
|
+
self.journal.remove_entries_from(op);
|
|
2227
|
+
self.op = op - 1;
|
|
2228
|
+
|
|
2229
|
+
assert(self.journal.entry_for_op(op) == null);
|
|
2230
|
+
assert(!self.journal.dirty.bit(op));
|
|
2231
|
+
assert(!self.journal.faulty.bit(op));
|
|
2232
|
+
}
|
|
2233
|
+
}
|
|
2234
|
+
}
|
|
2235
|
+
|
|
2236
|
+
/// Discards uncommitted ops during a view change from after and including `op`.
|
|
2237
|
+
/// This is required to maximize availability in the presence of storage faults.
|
|
2238
|
+
/// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
|
|
2239
|
+
fn discard_uncommitted_ops_from(self: *Self, op: u64, checksum: u128) void {
|
|
2240
|
+
assert(self.status == .view_change);
|
|
2241
|
+
assert(self.leader_index(self.view) == self.replica);
|
|
2242
|
+
assert(self.repairs_allowed());
|
|
2243
|
+
|
|
2244
|
+
assert(self.valid_hash_chain("discard_uncommitted_ops_from"));
|
|
2245
|
+
|
|
2246
|
+
assert(op > self.commit_max);
|
|
2247
|
+
assert(op <= self.op);
|
|
2248
|
+
assert(self.journal.entry_for_op_exact_with_checksum(op, checksum) != null);
|
|
2249
|
+
assert(self.journal.dirty.bit(op));
|
|
2250
|
+
|
|
2251
|
+
log.debug("{}: discard_uncommitted_ops_from: ops={}..{} view={}", .{
|
|
2252
|
+
self.replica,
|
|
2253
|
+
op,
|
|
2254
|
+
self.op,
|
|
2255
|
+
self.view,
|
|
2256
|
+
});
|
|
2257
|
+
|
|
2258
|
+
self.journal.remove_entries_from(op);
|
|
2259
|
+
self.op = op - 1;
|
|
2260
|
+
|
|
2261
|
+
assert(self.journal.entry_for_op(op) == null);
|
|
2262
|
+
assert(!self.journal.dirty.bit(op));
|
|
2263
|
+
assert(!self.journal.faulty.bit(op));
|
|
2264
|
+
|
|
2265
|
+
// We require that `self.op` always exists. Rewinding `self.op` could change that.
|
|
2266
|
+
// However, we do this only as the leader within a view change, with all headers intact.
|
|
2267
|
+
assert(self.journal.entry_for_op_exact(self.op) != null);
|
|
2268
|
+
}
|
|
2269
|
+
|
|
1881
2270
|
/// Returns whether the replica is a follower for the current view.
|
|
1882
2271
|
/// This may be used only when the replica status is normal.
|
|
1883
2272
|
fn follower(self: *Self) bool {
|
|
@@ -1898,6 +2287,7 @@ pub fn Replica(
|
|
|
1898
2287
|
if (self.loopback_queue) |message| {
|
|
1899
2288
|
defer self.message_bus.unref(message);
|
|
1900
2289
|
|
|
2290
|
+
assert(message.next == null);
|
|
1901
2291
|
self.loopback_queue = null;
|
|
1902
2292
|
assert(message.header.replica == self.replica);
|
|
1903
2293
|
self.on_message(message);
|
|
@@ -1954,8 +2344,6 @@ pub fn Replica(
|
|
|
1954
2344
|
return true;
|
|
1955
2345
|
}
|
|
1956
2346
|
|
|
1957
|
-
// We should never view jump unless we know what our status should be after the jump:
|
|
1958
|
-
// Otherwise we may be normal before the leader, or in a view change that has completed.
|
|
1959
2347
|
if (message.header.view > self.view) {
|
|
1960
2348
|
log.debug("{}: on_{s}: ignoring (newer view)", .{ self.replica, command });
|
|
1961
2349
|
return true;
|
|
@@ -2068,11 +2456,11 @@ pub fn Replica(
|
|
|
2068
2456
|
// Fall through below to check if we should resend the .register session reply.
|
|
2069
2457
|
} else if (entry.session > message.header.context) {
|
|
2070
2458
|
// The client must not reuse the ephemeral client ID when registering a new session.
|
|
2071
|
-
log.
|
|
2459
|
+
log.err("{}: on_request: ignoring older session (client bug)", .{self.replica});
|
|
2072
2460
|
return true;
|
|
2073
2461
|
} else if (entry.session < message.header.context) {
|
|
2074
2462
|
// This cannot be because of a partition since we check the client's view number.
|
|
2075
|
-
log.
|
|
2463
|
+
log.err("{}: on_request: ignoring newer session (client bug)", .{self.replica});
|
|
2076
2464
|
return true;
|
|
2077
2465
|
}
|
|
2078
2466
|
|
|
@@ -2087,7 +2475,7 @@ pub fn Replica(
|
|
|
2087
2475
|
self.message_bus.send_message_to_client(message.header.client, entry.reply);
|
|
2088
2476
|
return true;
|
|
2089
2477
|
} else {
|
|
2090
|
-
log.
|
|
2478
|
+
log.err("{}: on_request: request collision (client bug)", .{self.replica});
|
|
2091
2479
|
return true;
|
|
2092
2480
|
}
|
|
2093
2481
|
} else if (entry.reply.header.request + 1 == message.header.request) {
|
|
@@ -2097,16 +2485,24 @@ pub fn Replica(
|
|
|
2097
2485
|
return false;
|
|
2098
2486
|
} else {
|
|
2099
2487
|
// The client may have only one request inflight at a time.
|
|
2100
|
-
log.
|
|
2488
|
+
log.err("{}: on_request: ignoring new request (client bug)", .{self.replica});
|
|
2101
2489
|
return true;
|
|
2102
2490
|
}
|
|
2103
2491
|
} else {
|
|
2104
|
-
log.
|
|
2492
|
+
log.err("{}: on_request: ignoring newer request (client bug)", .{self.replica});
|
|
2105
2493
|
return true;
|
|
2106
2494
|
}
|
|
2107
2495
|
} else if (message.header.operation == .register) {
|
|
2108
2496
|
log.debug("{}: on_request: new session", .{self.replica});
|
|
2109
2497
|
return false;
|
|
2498
|
+
} else if (self.pipeline_prepare_for_client(message.header.client)) |_| {
|
|
2499
|
+
// The client registered with the previous leader, which committed and replied back
|
|
2500
|
+
// to the client before the view change, after which the register operation was
|
|
2501
|
+
// reloaded into the pipeline to be driven to completion by the new leader, which
|
|
2502
|
+
// now receives a request from the client that appears to have no session.
|
|
2503
|
+
// However, the session is about to be registered, so we must wait for it to commit.
|
|
2504
|
+
log.debug("{}: on_request: waiting for session to commit", .{self.replica});
|
|
2505
|
+
return true;
|
|
2110
2506
|
} else {
|
|
2111
2507
|
// We must have all commits to know whether a session has been evicted. For example,
|
|
2112
2508
|
// there is the risk of sending an eviction message (even as the leader) if we are
|
|
@@ -2180,7 +2576,7 @@ pub fn Replica(
|
|
|
2180
2576
|
log.debug("{}: on_request: ignoring (already preparing)", .{self.replica});
|
|
2181
2577
|
return true;
|
|
2182
2578
|
} else {
|
|
2183
|
-
log.
|
|
2579
|
+
log.err("{}: on_request: ignoring (client forked)", .{self.replica});
|
|
2184
2580
|
return true;
|
|
2185
2581
|
}
|
|
2186
2582
|
}
|
|
@@ -2216,12 +2612,8 @@ pub fn Replica(
|
|
|
2216
2612
|
}
|
|
2217
2613
|
|
|
2218
2614
|
if (message.header.view == self.view and self.status == .normal) {
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
} else {
|
|
2222
|
-
log.debug("{}: on_{s}: ignoring (view started)", .{ self.replica, command });
|
|
2223
|
-
return true;
|
|
2224
|
-
}
|
|
2615
|
+
log.debug("{}: on_{s}: ignoring (view started)", .{ self.replica, command });
|
|
2616
|
+
return true;
|
|
2225
2617
|
}
|
|
2226
2618
|
|
|
2227
2619
|
// These may be caused by faults in the network topology.
|
|
@@ -2293,7 +2685,7 @@ pub fn Replica(
|
|
|
2293
2685
|
assert(self.op + 1 == header.op);
|
|
2294
2686
|
}
|
|
2295
2687
|
|
|
2296
|
-
fn message_body_as_headers(
|
|
2688
|
+
fn message_body_as_headers(_: *Self, message: *const Message) []Header {
|
|
2297
2689
|
// TODO Assert message commands that we expect this to be called for.
|
|
2298
2690
|
assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
|
|
2299
2691
|
return std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..message.header.size]);
|
|
@@ -2312,8 +2704,8 @@ pub fn Replica(
|
|
|
2312
2704
|
if (a.view == b.view and a.op + 1 == b.op and a.checksum != b.parent) {
|
|
2313
2705
|
assert(a.valid_checksum());
|
|
2314
2706
|
assert(b.valid_checksum());
|
|
2315
|
-
log.
|
|
2316
|
-
log.
|
|
2707
|
+
log.err("{}: panic_if_hash_chain_would_break: a: {}", .{ self.replica, a });
|
|
2708
|
+
log.err("{}: panic_if_hash_chain_would_break: b: {}", .{ self.replica, b });
|
|
2317
2709
|
@panic("hash chain would break");
|
|
2318
2710
|
}
|
|
2319
2711
|
}
|
|
@@ -2332,6 +2724,8 @@ pub fn Replica(
|
|
|
2332
2724
|
assert(prepare.message.header.op == op);
|
|
2333
2725
|
assert(prepare.message.header.parent == parent);
|
|
2334
2726
|
|
|
2727
|
+
// A client may have multiple requests in the pipeline if these were committed by
|
|
2728
|
+
// the previous leader and were reloaded into the pipeline after a view change.
|
|
2335
2729
|
if (prepare.message.header.client == client) return prepare;
|
|
2336
2730
|
|
|
2337
2731
|
parent = prepare.message.header.checksum;
|
|
@@ -2399,26 +2793,10 @@ pub fn Replica(
|
|
|
2399
2793
|
assert(self.commit_min <= self.op);
|
|
2400
2794
|
assert(self.commit_min <= self.commit_max);
|
|
2401
2795
|
|
|
2402
|
-
// TODO Handle case where we are requesting reordered headers that no longer exist.
|
|
2403
|
-
|
|
2404
2796
|
// We expect these always to exist:
|
|
2405
2797
|
assert(self.journal.entry_for_op_exact(self.commit_min) != null);
|
|
2406
2798
|
assert(self.journal.entry_for_op_exact(self.op) != null);
|
|
2407
2799
|
|
|
2408
|
-
// Resolve any view jump by requesting the leader's latest op:
|
|
2409
|
-
if (self.view_jump_barrier) {
|
|
2410
|
-
assert(self.status == .normal);
|
|
2411
|
-
assert(self.follower());
|
|
2412
|
-
log.debug("{}: repair: resolving view jump barrier", .{self.replica});
|
|
2413
|
-
self.send_header_to_replica(self.leader_index(self.view), .{
|
|
2414
|
-
.command = .request_start_view,
|
|
2415
|
-
.cluster = self.cluster,
|
|
2416
|
-
.replica = self.replica,
|
|
2417
|
-
.view = self.view,
|
|
2418
|
-
});
|
|
2419
|
-
return;
|
|
2420
|
-
}
|
|
2421
|
-
|
|
2422
2800
|
// Request outstanding committed prepares to advance our op number:
|
|
2423
2801
|
// This handles the case of an idle cluster, where a follower will not otherwise advance.
|
|
2424
2802
|
// This is not required for correctness, but for durability.
|
|
@@ -2450,14 +2828,15 @@ pub fn Replica(
|
|
|
2450
2828
|
|
|
2451
2829
|
// Request any missing or disconnected headers:
|
|
2452
2830
|
// TODO Snapshots: Ensure that self.commit_min op always exists in the journal.
|
|
2453
|
-
assert(!self.view_jump_barrier);
|
|
2454
2831
|
var broken = self.journal.find_latest_headers_break_between(self.commit_min, self.op);
|
|
2455
2832
|
if (broken) |range| {
|
|
2456
|
-
log.debug("{}: repair:
|
|
2833
|
+
log.debug("{}: repair: break: view={} op_min={} op_max={} (commit={}..{} op={})", .{
|
|
2457
2834
|
self.replica,
|
|
2835
|
+
self.view,
|
|
2458
2836
|
range.op_min,
|
|
2459
2837
|
range.op_max,
|
|
2460
2838
|
self.commit_min,
|
|
2839
|
+
self.commit_max,
|
|
2461
2840
|
self.op,
|
|
2462
2841
|
});
|
|
2463
2842
|
assert(range.op_min > self.commit_min);
|
|
@@ -2480,7 +2859,6 @@ pub fn Replica(
|
|
|
2480
2859
|
}
|
|
2481
2860
|
|
|
2482
2861
|
// Assert that all headers are now present and connected with a perfect hash chain:
|
|
2483
|
-
assert(!self.view_jump_barrier);
|
|
2484
2862
|
assert(self.op >= self.commit_max);
|
|
2485
2863
|
assert(self.valid_hash_chain_between(self.commit_min, self.op));
|
|
2486
2864
|
|
|
@@ -2514,15 +2892,13 @@ pub fn Replica(
|
|
|
2514
2892
|
/// * The latest op makes sense of everything else and must not be replaced with a different
|
|
2515
2893
|
/// op or advanced except by the leader in the current view.
|
|
2516
2894
|
///
|
|
2517
|
-
/// * Do not jump to a view in normal status without
|
|
2518
|
-
///
|
|
2519
|
-
/// * Do not commit before resolving the view jump barrier with the leader.
|
|
2895
|
+
/// * Do not jump to a view in normal status without receiving a start_view message.
|
|
2520
2896
|
///
|
|
2521
2897
|
/// * Do not commit until the hash chain between `self.commit_min` and `self.op` is fully
|
|
2522
2898
|
/// connected, to ensure that all the ops in this range are correct.
|
|
2523
2899
|
///
|
|
2524
2900
|
/// * Ensure that `self.commit_max` is never advanced for a newer view without first
|
|
2525
|
-
///
|
|
2901
|
+
/// receiving a start_view message, otherwise `self.commit_max` may refer to different ops.
|
|
2526
2902
|
///
|
|
2527
2903
|
/// * Ensure that `self.op` is never advanced by a repair since repairs may occur in a view
|
|
2528
2904
|
/// change where the view has not yet started.
|
|
@@ -2534,16 +2910,13 @@ pub fn Replica(
|
|
|
2534
2910
|
/// http://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf.
|
|
2535
2911
|
///
|
|
2536
2912
|
fn repair_header(self: *Self, header: *const Header) bool {
|
|
2537
|
-
// Do not try to do any repairs while we cannot trust `self.op`:
|
|
2538
|
-
assert(!self.view_jump_barrier);
|
|
2539
|
-
|
|
2540
2913
|
assert(header.valid_checksum());
|
|
2541
2914
|
assert(header.invalid() == null);
|
|
2542
2915
|
assert(header.command == .prepare);
|
|
2543
2916
|
|
|
2544
2917
|
switch (self.status) {
|
|
2545
2918
|
.normal => assert(header.view <= self.view),
|
|
2546
|
-
.view_change => assert(header.view
|
|
2919
|
+
.view_change => assert(header.view <= self.view),
|
|
2547
2920
|
else => unreachable,
|
|
2548
2921
|
}
|
|
2549
2922
|
|
|
@@ -2557,7 +2930,10 @@ pub fn Replica(
|
|
|
2557
2930
|
// that breaks the chain with self.op. In this case, we must skip the repair to
|
|
2558
2931
|
// avoid overwriting any overlapping op.
|
|
2559
2932
|
} else {
|
|
2560
|
-
log.debug("{}: repair_header: false (changes self.op)", .{
|
|
2933
|
+
log.debug("{}: repair_header: false (changes self.op={})", .{
|
|
2934
|
+
self.replica,
|
|
2935
|
+
self.op,
|
|
2936
|
+
});
|
|
2561
2937
|
return false;
|
|
2562
2938
|
}
|
|
2563
2939
|
}
|
|
@@ -2667,9 +3043,6 @@ pub fn Replica(
|
|
|
2667
3043
|
/// The mistake in this example was not that we ignored the break to the left, which we must
|
|
2668
3044
|
/// do to repair reordered ops, but that we did not check for connection to the right.
|
|
2669
3045
|
fn repair_header_would_connect_hash_chain(self: *Self, header: *const Header) bool {
|
|
2670
|
-
// We must be able to trust `self.op` if this function is to be reliable.
|
|
2671
|
-
assert(!self.view_jump_barrier);
|
|
2672
|
-
|
|
2673
3046
|
var entry = header;
|
|
2674
3047
|
|
|
2675
3048
|
while (entry.op < self.op) {
|
|
@@ -2701,7 +3074,7 @@ pub fn Replica(
|
|
|
2701
3074
|
while (op > 0) {
|
|
2702
3075
|
op -= 1;
|
|
2703
3076
|
if (self.journal.entry_for_op(op)) |neighbor| {
|
|
2704
|
-
if (
|
|
3077
|
+
if (Journal.next_offset(neighbor) > header.offset) return true;
|
|
2705
3078
|
break;
|
|
2706
3079
|
}
|
|
2707
3080
|
}
|
|
@@ -2711,7 +3084,7 @@ pub fn Replica(
|
|
|
2711
3084
|
var op: u64 = header.op + 1;
|
|
2712
3085
|
while (op <= self.op) : (op += 1) {
|
|
2713
3086
|
if (self.journal.entry_for_op(op)) |neighbor| {
|
|
2714
|
-
if (
|
|
3087
|
+
if (Journal.next_offset(header) > neighbor.offset) return true;
|
|
2715
3088
|
break;
|
|
2716
3089
|
}
|
|
2717
3090
|
}
|
|
@@ -2845,22 +3218,12 @@ pub fn Replica(
|
|
|
2845
3218
|
assert(self.repairs_allowed());
|
|
2846
3219
|
assert(self.journal.dirty.len > 0);
|
|
2847
3220
|
|
|
2848
|
-
if (self.journal.writes.available() == 0) {
|
|
2849
|
-
log.debug("{}: repair_prepares: waiting for available IOP", .{self.replica});
|
|
2850
|
-
return;
|
|
2851
|
-
}
|
|
2852
|
-
|
|
2853
|
-
// We may be appending to or repairing the journal concurrently.
|
|
2854
|
-
// We do not want to re-request any of these prepares unnecessarily.
|
|
2855
|
-
// TODO Add journal.writing bits to clear this up (and needed anyway - why?).
|
|
2856
|
-
if (self.journal.writes.executing() > 0) {
|
|
2857
|
-
log.debug("{}: repair_prepares: waiting for dirty bits to settle", .{self.replica});
|
|
2858
|
-
return;
|
|
2859
|
-
}
|
|
2860
|
-
|
|
2861
3221
|
// Request enough prepares to utilize our max IO depth:
|
|
2862
3222
|
var budget = self.journal.writes.available();
|
|
2863
|
-
|
|
3223
|
+
if (budget == 0) {
|
|
3224
|
+
log.debug("{}: repair_prepares: waiting for IOP", .{self.replica});
|
|
3225
|
+
return;
|
|
3226
|
+
}
|
|
2864
3227
|
|
|
2865
3228
|
var op = self.op + 1;
|
|
2866
3229
|
while (op > 0) {
|
|
@@ -2871,20 +3234,21 @@ pub fn Replica(
|
|
|
2871
3234
|
// then we will `request_prepare` from the cluster, set `nack_prepare_op`,
|
|
2872
3235
|
// and stop repairing any further prepares:
|
|
2873
3236
|
// This will also rebroadcast any `request_prepare` every `repair_timeout` tick.
|
|
2874
|
-
self.repair_prepare(op)
|
|
2875
|
-
|
|
2876
|
-
|
|
2877
|
-
|
|
2878
|
-
|
|
2879
|
-
|
|
2880
|
-
|
|
2881
|
-
|
|
3237
|
+
if (self.repair_prepare(op)) {
|
|
3238
|
+
if (self.nack_prepare_op) |nack_prepare_op| {
|
|
3239
|
+
assert(nack_prepare_op == op);
|
|
3240
|
+
assert(self.status == .view_change);
|
|
3241
|
+
assert(self.leader_index(self.view) == self.replica);
|
|
3242
|
+
assert(op > self.commit_max);
|
|
3243
|
+
return;
|
|
3244
|
+
}
|
|
2882
3245
|
|
|
2883
|
-
|
|
2884
|
-
|
|
2885
|
-
|
|
2886
|
-
|
|
2887
|
-
|
|
3246
|
+
// Otherwise, we continue to request prepares until our budget is used:
|
|
3247
|
+
budget -= 1;
|
|
3248
|
+
if (budget == 0) {
|
|
3249
|
+
log.debug("{}: repair_prepares: request budget used", .{self.replica});
|
|
3250
|
+
return;
|
|
3251
|
+
}
|
|
2888
3252
|
}
|
|
2889
3253
|
} else {
|
|
2890
3254
|
assert(!self.journal.faulty.bit(op));
|
|
@@ -2908,16 +3272,29 @@ pub fn Replica(
|
|
|
2908
3272
|
///
|
|
2909
3273
|
/// This is effectively "many-to-one" repair, where a single replica recovers using the
|
|
2910
3274
|
/// resources of many replicas, for faster recovery.
|
|
2911
|
-
fn repair_prepare(self: *Self, op: u64)
|
|
3275
|
+
fn repair_prepare(self: *Self, op: u64) bool {
|
|
2912
3276
|
assert(self.status == .normal or self.status == .view_change);
|
|
2913
3277
|
assert(self.repairs_allowed());
|
|
2914
3278
|
assert(self.journal.dirty.bit(op));
|
|
2915
3279
|
|
|
3280
|
+
const checksum = self.journal.entry_for_op_exact(op).?.checksum;
|
|
3281
|
+
|
|
3282
|
+
// We may be appending to or repairing the journal concurrently.
|
|
3283
|
+
// We do not want to re-request any of these prepares unnecessarily.
|
|
3284
|
+
if (self.journal.writing(op, checksum)) {
|
|
3285
|
+
log.debug("{}: repair_prepare: already writing op={} checksum={}", .{
|
|
3286
|
+
self.replica,
|
|
3287
|
+
op,
|
|
3288
|
+
checksum,
|
|
3289
|
+
});
|
|
3290
|
+
return false;
|
|
3291
|
+
}
|
|
3292
|
+
|
|
2916
3293
|
const request_prepare = Header{
|
|
2917
3294
|
.command = .request_prepare,
|
|
2918
3295
|
// If we request a prepare from a follower, as below, it is critical to pass a checksum:
|
|
2919
3296
|
// Otherwise we could receive different prepares for the same op number.
|
|
2920
|
-
.context =
|
|
3297
|
+
.context = checksum,
|
|
2921
3298
|
.cluster = self.cluster,
|
|
2922
3299
|
.replica = self.replica,
|
|
2923
3300
|
.view = self.view,
|
|
@@ -2928,6 +3305,29 @@ pub fn Replica(
|
|
|
2928
3305
|
// Only the leader is allowed to do repairs in a view change:
|
|
2929
3306
|
assert(self.leader_index(self.view) == self.replica);
|
|
2930
3307
|
|
|
3308
|
+
const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
|
|
3309
|
+
log.debug(
|
|
3310
|
+
"{}: repair_prepare: op={} checksum={} (uncommitted, {s}, view_change)",
|
|
3311
|
+
.{
|
|
3312
|
+
self.replica,
|
|
3313
|
+
op,
|
|
3314
|
+
checksum,
|
|
3315
|
+
reason,
|
|
3316
|
+
},
|
|
3317
|
+
);
|
|
3318
|
+
|
|
3319
|
+
if (self.replica_count == 2 and !self.journal.faulty.bit(op)) {
|
|
3320
|
+
// This is required to avoid a liveness issue for a cluster-of-two where a new
|
|
3321
|
+
// leader learns of an op during a view change but where the op is faulty on
|
|
3322
|
+
// the old leader. We must immediately roll back the op since it could not have
|
|
3323
|
+
// been committed by the old leader if we know we do not have it, and because
|
|
3324
|
+
// the old leader cannot send a nack_prepare for its faulty copy.
|
|
3325
|
+
// For this to be correct, the recovery protocol must set all headers as faulty,
|
|
3326
|
+
// not only as dirty.
|
|
3327
|
+
self.discard_uncommitted_ops_from(op, checksum);
|
|
3328
|
+
return false;
|
|
3329
|
+
}
|
|
3330
|
+
|
|
2931
3331
|
// Initialize the `nack_prepare` quorum counter for this uncommitted op:
|
|
2932
3332
|
// It is also possible that we may start repairing a lower uncommitted op, having
|
|
2933
3333
|
// initialized `nack_prepare_op` before we learn of a higher uncommitted dirty op,
|
|
@@ -2948,25 +3348,33 @@ pub fn Replica(
|
|
|
2948
3348
|
.nack_prepare,
|
|
2949
3349
|
);
|
|
2950
3350
|
}
|
|
2951
|
-
|
|
2952
|
-
self.replica,
|
|
2953
|
-
op,
|
|
2954
|
-
});
|
|
3351
|
+
|
|
2955
3352
|
assert(self.nack_prepare_op.? == op);
|
|
2956
|
-
assert(request_prepare.context
|
|
3353
|
+
assert(request_prepare.context == checksum);
|
|
2957
3354
|
self.send_header_to_other_replicas(request_prepare);
|
|
2958
3355
|
} else {
|
|
2959
|
-
|
|
3356
|
+
const nature = if (op > self.commit_max) "uncommitted" else "committed";
|
|
3357
|
+
const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
|
|
3358
|
+
log.debug("{}: repair_prepare: op={} checksum={} ({s}, {s})", .{
|
|
3359
|
+
self.replica,
|
|
3360
|
+
op,
|
|
3361
|
+
checksum,
|
|
3362
|
+
nature,
|
|
3363
|
+
reason,
|
|
3364
|
+
});
|
|
3365
|
+
|
|
2960
3366
|
// We expect that `repair_prepare()` is called in reverse chronological order:
|
|
2961
3367
|
// Any uncommitted ops should have already been dealt with.
|
|
2962
3368
|
// We never roll back committed ops, and thus never regard `nack_prepare` responses.
|
|
2963
3369
|
// Alternatively, we may not be the leader, in which case we do not distinguish anyway.
|
|
2964
3370
|
assert(self.nack_prepare_op == null);
|
|
2965
|
-
assert(request_prepare.context
|
|
3371
|
+
assert(request_prepare.context == checksum);
|
|
2966
3372
|
if (self.choose_any_other_replica()) |replica| {
|
|
2967
3373
|
self.send_header_to_replica(replica, request_prepare);
|
|
2968
3374
|
}
|
|
2969
3375
|
}
|
|
3376
|
+
|
|
3377
|
+
return true;
|
|
2970
3378
|
}
|
|
2971
3379
|
|
|
2972
3380
|
fn repairs_allowed(self: *Self) bool {
|
|
@@ -2987,7 +3395,7 @@ pub fn Replica(
|
|
|
2987
3395
|
/// Replicates to the next replica in the configuration (until we get back to the leader):
|
|
2988
3396
|
/// Replication starts and ends with the leader, we never forward back to the leader.
|
|
2989
3397
|
/// Does not flood the network with prepares that have already committed.
|
|
2990
|
-
/// TODO Use recent heartbeat data for next replica to leapfrog if faulty.
|
|
3398
|
+
/// TODO Use recent heartbeat data for next replica to leapfrog if faulty (optimization).
|
|
2991
3399
|
fn replicate(self: *Self, message: *Message) void {
|
|
2992
3400
|
assert(self.status == .normal);
|
|
2993
3401
|
assert(message.header.command == .prepare);
|
|
@@ -3176,7 +3584,7 @@ pub fn Replica(
|
|
|
3176
3584
|
assert(count_start_view_change >= self.quorum_view_change - 1);
|
|
3177
3585
|
|
|
3178
3586
|
const message = self.create_view_change_message(.do_view_change) orelse {
|
|
3179
|
-
log.
|
|
3587
|
+
log.err("{}: send_do_view_change: waiting for message", .{self.replica});
|
|
3180
3588
|
return;
|
|
3181
3589
|
};
|
|
3182
3590
|
defer self.message_bus.unref(message);
|
|
@@ -3195,7 +3603,7 @@ pub fn Replica(
|
|
|
3195
3603
|
assert(self.status == .normal);
|
|
3196
3604
|
assert(self.leader());
|
|
3197
3605
|
|
|
3198
|
-
log.
|
|
3606
|
+
log.err("{}: too many sessions, sending eviction message to client={}", .{
|
|
3199
3607
|
self.replica,
|
|
3200
3608
|
client,
|
|
3201
3609
|
});
|
|
@@ -3211,7 +3619,7 @@ pub fn Replica(
|
|
|
3211
3619
|
|
|
3212
3620
|
fn send_header_to_client(self: *Self, client: u128, header: Header) void {
|
|
3213
3621
|
const message = self.create_message_from_header(header) orelse {
|
|
3214
|
-
log.
|
|
3622
|
+
log.err("{}: no header-only message available, dropping message to client {}", .{
|
|
3215
3623
|
self.replica,
|
|
3216
3624
|
client,
|
|
3217
3625
|
});
|
|
@@ -3224,7 +3632,7 @@ pub fn Replica(
|
|
|
3224
3632
|
|
|
3225
3633
|
fn send_header_to_other_replicas(self: *Self, header: Header) void {
|
|
3226
3634
|
const message = self.create_message_from_header(header) orelse {
|
|
3227
|
-
log.
|
|
3635
|
+
log.err("{}: no header-only message available, dropping message to replicas", .{
|
|
3228
3636
|
self.replica,
|
|
3229
3637
|
});
|
|
3230
3638
|
return;
|
|
@@ -3241,7 +3649,7 @@ pub fn Replica(
|
|
|
3241
3649
|
|
|
3242
3650
|
fn send_header_to_replica(self: *Self, replica: u8, header: Header) void {
|
|
3243
3651
|
const message = self.create_message_from_header(header) orelse {
|
|
3244
|
-
log.
|
|
3652
|
+
log.err("{}: no header-only message available, dropping message to replica {}", .{
|
|
3245
3653
|
self.replica,
|
|
3246
3654
|
replica,
|
|
3247
3655
|
});
|
|
@@ -3270,7 +3678,7 @@ pub fn Replica(
|
|
|
3270
3678
|
});
|
|
3271
3679
|
|
|
3272
3680
|
if (message.header.invalid()) |reason| {
|
|
3273
|
-
log.
|
|
3681
|
+
log.err("{}: send_message_to_replica: invalid ({s})", .{ self.replica, reason });
|
|
3274
3682
|
@panic("send_message_to_replica: invalid message");
|
|
3275
3683
|
}
|
|
3276
3684
|
|
|
@@ -3311,6 +3719,7 @@ pub fn Replica(
|
|
|
3311
3719
|
assert(!self.do_view_change_quorum);
|
|
3312
3720
|
assert(message.header.view == self.view);
|
|
3313
3721
|
assert(message.header.replica == self.replica);
|
|
3722
|
+
assert(message.header.op == self.op);
|
|
3314
3723
|
assert(replica == self.leader_index(self.view));
|
|
3315
3724
|
},
|
|
3316
3725
|
.start_view => switch (self.status) {
|
|
@@ -3358,7 +3767,7 @@ pub fn Replica(
|
|
|
3358
3767
|
assert(replica == self.leader_index(self.view));
|
|
3359
3768
|
},
|
|
3360
3769
|
else => {
|
|
3361
|
-
log.
|
|
3770
|
+
log.info("{}: send_message_to_replica: TODO {s}", .{
|
|
3362
3771
|
self.replica,
|
|
3363
3772
|
@tagName(message.header.command),
|
|
3364
3773
|
});
|
|
@@ -3373,7 +3782,9 @@ pub fn Replica(
|
|
|
3373
3782
|
}
|
|
3374
3783
|
}
|
|
3375
3784
|
|
|
3376
|
-
|
|
3785
|
+
/// Finds the header with the highest op number in a slice of headers from a replica.
|
|
3786
|
+
/// Searches only by op number to find the highest `self.op for the replica.
|
|
3787
|
+
fn set_latest_op(headers: []Header, latest: *Header) void {
|
|
3377
3788
|
switch (latest.command) {
|
|
3378
3789
|
.reserved, .prepare => assert(latest.valid_checksum()),
|
|
3379
3790
|
else => unreachable,
|
|
@@ -3384,11 +3795,9 @@ pub fn Replica(
|
|
|
3384
3795
|
assert(header.invalid() == null);
|
|
3385
3796
|
assert(header.command == .prepare);
|
|
3386
3797
|
|
|
3387
|
-
if (latest.command == .reserved) {
|
|
3388
|
-
latest
|
|
3389
|
-
|
|
3390
|
-
latest.* = header;
|
|
3391
|
-
} else if (header.view == latest.view and header.op > latest.op) {
|
|
3798
|
+
if (latest.command == .reserved or header.op > latest.op) {
|
|
3799
|
+
// We are simply trying to find the latest `self.op` in the replica's log.
|
|
3800
|
+
// We therefore do not compare views here.
|
|
3392
3801
|
latest.* = header;
|
|
3393
3802
|
}
|
|
3394
3803
|
}
|
|
@@ -3400,18 +3809,15 @@ pub fn Replica(
|
|
|
3400
3809
|
k: u64,
|
|
3401
3810
|
method: []const u8,
|
|
3402
3811
|
) void {
|
|
3403
|
-
assert(self.status == .view_change
|
|
3812
|
+
assert(self.status == .view_change);
|
|
3404
3813
|
|
|
3405
3814
|
assert(latest.valid_checksum());
|
|
3406
3815
|
assert(latest.invalid() == null);
|
|
3407
3816
|
assert(latest.command == .prepare);
|
|
3408
3817
|
assert(latest.cluster == self.cluster);
|
|
3409
|
-
|
|
3410
|
-
|
|
3411
|
-
|
|
3412
|
-
assert(latest.view <= self.view);
|
|
3413
|
-
assert(self.view_jump_barrier);
|
|
3414
|
-
}
|
|
3818
|
+
|
|
3819
|
+
// The view may have started already, so we can have a prepare in the same view:
|
|
3820
|
+
assert(latest.view <= self.view);
|
|
3415
3821
|
|
|
3416
3822
|
log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={} offset={}", .{
|
|
3417
3823
|
self.replica,
|
|
@@ -3462,7 +3868,8 @@ pub fn Replica(
|
|
|
3462
3868
|
assert(self.op >= self.commit_max);
|
|
3463
3869
|
|
|
3464
3870
|
// Do not set the latest op as dirty if we already have it exactly:
|
|
3465
|
-
// Otherwise, this would trigger a repair and delay the view change
|
|
3871
|
+
// Otherwise, this would trigger a repair and delay the view change, or worse, it would
|
|
3872
|
+
// prevent us from assisting another replica to recover when we do in fact have the op.
|
|
3466
3873
|
if (self.journal.entry_for_op_exact_with_checksum(latest.op, latest.checksum)) |_| {
|
|
3467
3874
|
log.debug("{}: {s}: latest op exists exactly", .{ self.replica, method });
|
|
3468
3875
|
} else {
|
|
@@ -3472,11 +3879,6 @@ pub fn Replica(
|
|
|
3472
3879
|
assert(self.op == latest.op);
|
|
3473
3880
|
self.journal.remove_entries_from(self.op + 1);
|
|
3474
3881
|
assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
|
|
3475
|
-
|
|
3476
|
-
if (self.view_jump_barrier) {
|
|
3477
|
-
self.view_jump_barrier = false;
|
|
3478
|
-
log.debug("{}: {s}: resolved view jump barrier", .{ self.replica, method });
|
|
3479
|
-
}
|
|
3480
3882
|
}
|
|
3481
3883
|
|
|
3482
3884
|
fn start_view_as_the_new_leader(self: *Self) void {
|
|
@@ -3484,9 +3886,6 @@ pub fn Replica(
|
|
|
3484
3886
|
assert(self.leader_index(self.view) == self.replica);
|
|
3485
3887
|
assert(self.do_view_change_quorum);
|
|
3486
3888
|
|
|
3487
|
-
// TODO Do one last count of our do_view_change quorum messages.
|
|
3488
|
-
|
|
3489
|
-
assert(!self.view_jump_barrier);
|
|
3490
3889
|
assert(!self.committing);
|
|
3491
3890
|
assert(!self.repairing_pipeline);
|
|
3492
3891
|
|
|
@@ -3514,7 +3913,7 @@ pub fn Replica(
|
|
|
3514
3913
|
assert(self.nack_prepare_op == null);
|
|
3515
3914
|
|
|
3516
3915
|
const start_view = self.create_view_change_message(.start_view) orelse {
|
|
3517
|
-
log.
|
|
3916
|
+
log.err("{}: start_view_as_the_new_leader: waiting for message", .{self.replica});
|
|
3518
3917
|
return;
|
|
3519
3918
|
};
|
|
3520
3919
|
defer self.message_bus.unref(start_view);
|
|
@@ -3544,6 +3943,7 @@ pub fn Replica(
|
|
|
3544
3943
|
// For example, this could happen after a state transfer triggered by an op jump.
|
|
3545
3944
|
assert(new_view >= self.view);
|
|
3546
3945
|
self.view = new_view;
|
|
3946
|
+
self.view_normal = new_view;
|
|
3547
3947
|
self.status = .normal;
|
|
3548
3948
|
|
|
3549
3949
|
if (self.leader()) {
|
|
@@ -3551,8 +3951,8 @@ pub fn Replica(
|
|
|
3551
3951
|
|
|
3552
3952
|
self.ping_timeout.start();
|
|
3553
3953
|
self.commit_timeout.start();
|
|
3554
|
-
self.
|
|
3555
|
-
self.
|
|
3954
|
+
self.normal_status_timeout.stop();
|
|
3955
|
+
self.view_change_status_timeout.stop();
|
|
3556
3956
|
self.view_change_message_timeout.stop();
|
|
3557
3957
|
self.repair_timeout.start();
|
|
3558
3958
|
|
|
@@ -3566,8 +3966,8 @@ pub fn Replica(
|
|
|
3566
3966
|
|
|
3567
3967
|
self.ping_timeout.start();
|
|
3568
3968
|
self.commit_timeout.stop();
|
|
3569
|
-
self.
|
|
3570
|
-
self.
|
|
3969
|
+
self.normal_status_timeout.start();
|
|
3970
|
+
self.view_change_status_timeout.stop();
|
|
3571
3971
|
self.view_change_message_timeout.stop();
|
|
3572
3972
|
self.repair_timeout.start();
|
|
3573
3973
|
|
|
@@ -3589,15 +3989,19 @@ pub fn Replica(
|
|
|
3589
3989
|
/// on its own timer, or because it receives a start_view_change or do_view_change message for
|
|
3590
3990
|
/// a view with a larger number than its own view.
|
|
3591
3991
|
fn transition_to_view_change_status(self: *Self, new_view: u32) void {
|
|
3592
|
-
log.debug("{}: transition_to_view_change_status: view={}", .{
|
|
3992
|
+
log.debug("{}: transition_to_view_change_status: view={}..{}", .{
|
|
3993
|
+
self.replica,
|
|
3994
|
+
self.view,
|
|
3995
|
+
new_view,
|
|
3996
|
+
});
|
|
3593
3997
|
assert(new_view > self.view);
|
|
3594
3998
|
self.view = new_view;
|
|
3595
3999
|
self.status = .view_change;
|
|
3596
4000
|
|
|
3597
4001
|
self.ping_timeout.stop();
|
|
3598
4002
|
self.commit_timeout.stop();
|
|
3599
|
-
self.
|
|
3600
|
-
self.
|
|
4003
|
+
self.normal_status_timeout.stop();
|
|
4004
|
+
self.view_change_status_timeout.start();
|
|
3601
4005
|
self.view_change_message_timeout.start();
|
|
3602
4006
|
self.repair_timeout.stop();
|
|
3603
4007
|
|
|
@@ -3673,16 +4077,6 @@ pub fn Replica(
|
|
|
3673
4077
|
/// Returns true if the hash chain is valid and up to date for the current view.
|
|
3674
4078
|
/// This is a stronger guarantee than `valid_hash_chain_between()` below.
|
|
3675
4079
|
fn valid_hash_chain(self: *Self, method: []const u8) bool {
|
|
3676
|
-
// If we know we have uncommitted ops that may have been reordered through a view change
|
|
3677
|
-
// then wait until the latest of these has been resolved with the leader:
|
|
3678
|
-
if (self.view_jump_barrier) {
|
|
3679
|
-
log.debug("{}: {s}: waiting to resolve view jump barrier", .{
|
|
3680
|
-
self.replica,
|
|
3681
|
-
method,
|
|
3682
|
-
});
|
|
3683
|
-
return false;
|
|
3684
|
-
}
|
|
3685
|
-
|
|
3686
4080
|
// If we know we could validate the hash chain even further, then wait until we can:
|
|
3687
4081
|
// This is partial defense-in-depth in case `self.op` is ever advanced by a reordered op.
|
|
3688
4082
|
if (self.op < self.commit_max) {
|
|
@@ -3721,7 +4115,7 @@ pub fn Replica(
|
|
|
3721
4115
|
if (self.journal.entry_for_op_exact(op)) |a| {
|
|
3722
4116
|
assert(a.op + 1 == b.op);
|
|
3723
4117
|
if (a.checksum == b.parent) {
|
|
3724
|
-
assert(
|
|
4118
|
+
assert(ascending_viewstamps(a, b));
|
|
3725
4119
|
b = a;
|
|
3726
4120
|
} else {
|
|
3727
4121
|
log.debug("{}: valid_hash_chain_between: break: A: {}", .{ self.replica, a });
|
|
@@ -3738,7 +4132,7 @@ pub fn Replica(
|
|
|
3738
4132
|
}
|
|
3739
4133
|
|
|
3740
4134
|
fn view_jump(self: *Self, header: *const Header) void {
|
|
3741
|
-
const
|
|
4135
|
+
const to: Status = switch (header.command) {
|
|
3742
4136
|
.prepare, .commit => .normal,
|
|
3743
4137
|
.start_view_change, .do_view_change, .start_view => .view_change,
|
|
3744
4138
|
else => unreachable,
|
|
@@ -3750,19 +4144,19 @@ pub fn Replica(
|
|
|
3750
4144
|
|
|
3751
4145
|
// Compare status transitions and decide whether to view jump or ignore:
|
|
3752
4146
|
switch (self.status) {
|
|
3753
|
-
.normal => switch (
|
|
4147
|
+
.normal => switch (to) {
|
|
3754
4148
|
// If the transition is to `.normal`, then ignore if for the same view:
|
|
3755
4149
|
.normal => if (header.view == self.view) return,
|
|
3756
4150
|
// If the transition is to `.view_change`, then ignore if the view has started:
|
|
3757
4151
|
.view_change => if (header.view == self.view) return,
|
|
3758
4152
|
else => unreachable,
|
|
3759
4153
|
},
|
|
3760
|
-
.view_change => switch (
|
|
4154
|
+
.view_change => switch (to) {
|
|
3761
4155
|
// This is an interesting special case:
|
|
3762
4156
|
// If the transition is to `.normal` in the same view, then we missed the
|
|
3763
4157
|
// `start_view` message and we must also consider this a view jump:
|
|
3764
|
-
// If we don't
|
|
3765
|
-
// will disrupt the cluster
|
|
4158
|
+
// If we don't handle this below then our `view_change_status_timeout` will fire
|
|
4159
|
+
// and we will disrupt the cluster with another view change for a newer view.
|
|
3766
4160
|
.normal => {},
|
|
3767
4161
|
// If the transition is to `.view_change`, then ignore if for the same view:
|
|
3768
4162
|
.view_change => if (header.view == self.view) return,
|
|
@@ -3771,84 +4165,39 @@ pub fn Replica(
|
|
|
3771
4165
|
else => unreachable,
|
|
3772
4166
|
}
|
|
3773
4167
|
|
|
3774
|
-
|
|
3775
|
-
|
|
3776
|
-
|
|
3777
|
-
|
|
3778
|
-
log.debug("{}: view_jump: exiting view change", .{self.replica});
|
|
3779
|
-
} else {
|
|
3780
|
-
assert(header.view > self.view);
|
|
3781
|
-
assert(self.status == .normal or self.status == .view_change);
|
|
3782
|
-
|
|
3783
|
-
log.debug("{}: view_jump: jumping into newer view", .{self.replica});
|
|
3784
|
-
}
|
|
3785
|
-
} else if (into == .view_change) {
|
|
3786
|
-
assert(header.view > self.view);
|
|
3787
|
-
assert(self.status == .normal or self.status == .view_change);
|
|
3788
|
-
|
|
3789
|
-
if (header.view == self.view + 1) {
|
|
3790
|
-
log.debug("{}: view_jump: jumping into view change", .{self.replica});
|
|
3791
|
-
} else {
|
|
3792
|
-
log.debug("{}: view_jump: jumping into next view change", .{self.replica});
|
|
3793
|
-
}
|
|
4168
|
+
switch (to) {
|
|
4169
|
+
.normal => {
|
|
4170
|
+
if (header.view == self.view) {
|
|
4171
|
+
assert(self.status == .view_change);
|
|
3794
4172
|
|
|
3795
|
-
|
|
3796
|
-
|
|
3797
|
-
|
|
3798
|
-
|
|
3799
|
-
//
|
|
3800
|
-
// However, that does not mean that we may clear any view jump barrier here, because
|
|
3801
|
-
// it may not be reimposed if we double-jump into a normal view with our op number
|
|
3802
|
-
// at that point older than the cluster commit number.
|
|
3803
|
-
//
|
|
3804
|
-
// Furthermore, even if we are transitioning from normal status into the very next
|
|
3805
|
-
// view through a view change, we must still impose a view jump barrier, because we
|
|
3806
|
-
// may never receive the start_view message, and because again, if we wait until the
|
|
3807
|
-
// next view jump then our op number may no longer exceed the cluster commit number.
|
|
3808
|
-
} else {
|
|
3809
|
-
unreachable;
|
|
3810
|
-
}
|
|
4173
|
+
log.debug("{}: view_jump: waiting to exit view change", .{self.replica});
|
|
4174
|
+
} else {
|
|
4175
|
+
assert(header.view > self.view);
|
|
4176
|
+
assert(self.status == .view_change or self.status == .normal);
|
|
3811
4177
|
|
|
3812
|
-
|
|
3813
|
-
|
|
3814
|
-
// through any view change(s) in which we did/do not receive the start_view message.
|
|
3815
|
-
//
|
|
3816
|
-
// A commit number from the new leader may now refer to a different op than what we
|
|
3817
|
-
// have in our log, even if our hash chain is fully intact.
|
|
3818
|
-
//
|
|
3819
|
-
// CRITICAL: If we were to commit despite this ambiguity, we would fork the log.
|
|
3820
|
-
//
|
|
3821
|
-
// In Section 5.2, the VRR paper deals with this scenario by simply removing
|
|
3822
|
-
// the uncommitted ops and doing a state transfer.
|
|
3823
|
-
//
|
|
3824
|
-
// However, while strictly safe, this impairs safety in terms of durability, and
|
|
3825
|
-
// adds unnecessary repair overhead if it turns out that the ops were in fact
|
|
3826
|
-
// committed.
|
|
3827
|
-
//
|
|
3828
|
-
// We rather impose a view jump barrier to keep from committing, for as long as
|
|
3829
|
-
// there is ambiguity around what specific op a commit number represents.
|
|
3830
|
-
// This preserves and maximizes durability and minimizes repair traffic.
|
|
3831
|
-
//
|
|
3832
|
-
// This view jump barrier is cleared or resolved, respectively, as soon as:
|
|
3833
|
-
// 1. we receive a new prepare from the leader that advances our latest op, or
|
|
3834
|
-
// 2. we request and receive a `start_view` message from the leader of the view.
|
|
3835
|
-
//
|
|
3836
|
-
// This is safe because advancing our latest op in the current view or receiving
|
|
3837
|
-
// it from the leader ensures that we have the latest hash chain head, from which we
|
|
3838
|
-
// can work backwards to disambiguate any ops.
|
|
3839
|
-
log.debug("{}: view_jump: imposing view jump barrier", .{self.replica});
|
|
3840
|
-
self.view_jump_barrier = true;
|
|
3841
|
-
} else {
|
|
3842
|
-
assert(self.op <= self.commit_max);
|
|
4178
|
+
log.debug("{}: view_jump: waiting to jump to newer view", .{self.replica});
|
|
4179
|
+
}
|
|
3843
4180
|
|
|
3844
|
-
|
|
3845
|
-
|
|
3846
|
-
|
|
3847
|
-
|
|
4181
|
+
// TODO Debounce and decouple this from `on_message()` by moving into `tick()`:
|
|
4182
|
+
log.debug("{}: view_jump: requesting start_view message", .{self.replica});
|
|
4183
|
+
self.send_header_to_replica(self.leader_index(header.view), .{
|
|
4184
|
+
.command = .request_start_view,
|
|
4185
|
+
.cluster = self.cluster,
|
|
4186
|
+
.replica = self.replica,
|
|
4187
|
+
.view = header.view,
|
|
4188
|
+
});
|
|
4189
|
+
},
|
|
4190
|
+
.view_change => {
|
|
4191
|
+
assert(header.view > self.view);
|
|
4192
|
+
assert(self.status == .view_change or self.status == .normal);
|
|
3848
4193
|
|
|
3849
|
-
|
|
3850
|
-
|
|
3851
|
-
|
|
4194
|
+
if (header.view == self.view + 1) {
|
|
4195
|
+
log.debug("{}: view_jump: jumping to view change", .{self.replica});
|
|
4196
|
+
} else {
|
|
4197
|
+
log.debug("{}: view_jump: jumping to next view change", .{self.replica});
|
|
4198
|
+
}
|
|
4199
|
+
self.transition_to_view_change_status(header.view);
|
|
4200
|
+
},
|
|
3852
4201
|
else => unreachable,
|
|
3853
4202
|
}
|
|
3854
4203
|
}
|
|
@@ -3860,7 +4209,20 @@ pub fn Replica(
|
|
|
3860
4209
|
assert(message.header.op <= self.op);
|
|
3861
4210
|
|
|
3862
4211
|
if (!self.journal.has(message.header)) {
|
|
3863
|
-
log.debug("{}: write_prepare: ignoring (header changed)", .{
|
|
4212
|
+
log.debug("{}: write_prepare: ignoring op={} checksum={} (header changed)", .{
|
|
4213
|
+
self.replica,
|
|
4214
|
+
message.header.op,
|
|
4215
|
+
message.header.checksum,
|
|
4216
|
+
});
|
|
4217
|
+
return;
|
|
4218
|
+
}
|
|
4219
|
+
|
|
4220
|
+
if (self.journal.writing(message.header.op, message.header.checksum)) {
|
|
4221
|
+
log.debug("{}: write_prepare: ignoring op={} checksum={} (already writing)", .{
|
|
4222
|
+
self.replica,
|
|
4223
|
+
message.header.op,
|
|
4224
|
+
message.header.checksum,
|
|
4225
|
+
});
|
|
3864
4226
|
return;
|
|
3865
4227
|
}
|
|
3866
4228
|
|