tigerbeetle-node 0.3.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/README.md +21 -7
  2. package/dist/benchmark.js +1 -1
  3. package/dist/benchmark.js.map +1 -1
  4. package/dist/index.d.ts +22 -20
  5. package/dist/index.js +40 -18
  6. package/dist/index.js.map +1 -1
  7. package/dist/test.js +13 -1
  8. package/dist/test.js.map +1 -1
  9. package/package.json +12 -12
  10. package/scripts/postinstall.sh +2 -2
  11. package/src/benchmark.ts +4 -4
  12. package/src/index.ts +35 -9
  13. package/src/node.zig +139 -28
  14. package/src/test.ts +19 -5
  15. package/src/tigerbeetle/scripts/benchmark.sh +10 -3
  16. package/src/tigerbeetle/scripts/install.sh +2 -2
  17. package/src/tigerbeetle/scripts/install_zig.bat +109 -0
  18. package/src/tigerbeetle/scripts/install_zig.sh +21 -4
  19. package/src/tigerbeetle/scripts/vopr.bat +48 -0
  20. package/src/tigerbeetle/scripts/vopr.sh +33 -0
  21. package/src/tigerbeetle/src/benchmark.zig +74 -42
  22. package/src/tigerbeetle/src/cli.zig +136 -83
  23. package/src/tigerbeetle/src/config.zig +80 -26
  24. package/src/tigerbeetle/src/demo.zig +101 -78
  25. package/src/tigerbeetle/src/demo_01_create_accounts.zig +2 -7
  26. package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +2 -7
  27. package/src/tigerbeetle/src/demo_03_create_transfers.zig +2 -7
  28. package/src/tigerbeetle/src/demo_04_create_transfers_two_phase_commit.zig +2 -5
  29. package/src/tigerbeetle/src/demo_05_accept_transfers.zig +2 -7
  30. package/src/tigerbeetle/src/demo_06_reject_transfers.zig +2 -7
  31. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +8 -0
  32. package/src/tigerbeetle/src/fifo.zig +20 -11
  33. package/src/tigerbeetle/src/io.zig +35 -22
  34. package/src/tigerbeetle/src/io_darwin.zig +701 -0
  35. package/src/tigerbeetle/src/main.zig +72 -25
  36. package/src/tigerbeetle/src/message_bus.zig +379 -456
  37. package/src/tigerbeetle/src/message_pool.zig +3 -3
  38. package/src/tigerbeetle/src/ring_buffer.zig +192 -37
  39. package/src/tigerbeetle/src/simulator.zig +317 -0
  40. package/src/tigerbeetle/src/state_machine.zig +846 -38
  41. package/src/tigerbeetle/src/storage.zig +488 -90
  42. package/src/tigerbeetle/src/test/cluster.zig +221 -0
  43. package/src/tigerbeetle/src/test/message_bus.zig +92 -0
  44. package/src/tigerbeetle/src/test/network.zig +182 -0
  45. package/src/tigerbeetle/src/test/packet_simulator.zig +371 -0
  46. package/src/tigerbeetle/src/test/state_checker.zig +142 -0
  47. package/src/tigerbeetle/src/test/state_machine.zig +71 -0
  48. package/src/tigerbeetle/src/test/storage.zig +375 -0
  49. package/src/tigerbeetle/src/test/time.zig +84 -0
  50. package/src/tigerbeetle/src/tigerbeetle.zig +6 -3
  51. package/src/tigerbeetle/src/time.zig +65 -0
  52. package/src/tigerbeetle/src/unit_tests.zig +14 -0
  53. package/src/tigerbeetle/src/vsr/client.zig +519 -0
  54. package/src/tigerbeetle/src/vsr/clock.zig +829 -0
  55. package/src/tigerbeetle/src/vsr/journal.zig +1368 -0
  56. package/src/tigerbeetle/src/vsr/marzullo.zig +306 -0
  57. package/src/tigerbeetle/src/vsr/replica.zig +4248 -0
  58. package/src/tigerbeetle/src/vsr.zig +601 -0
  59. package/src/tigerbeetle/LICENSE +0 -177
  60. package/src/tigerbeetle/README.md +0 -116
  61. package/src/tigerbeetle/src/client.zig +0 -319
  62. package/src/tigerbeetle/src/concurrent_ranges.zig +0 -162
  63. package/src/tigerbeetle/src/fixed_array_list.zig +0 -53
  64. package/src/tigerbeetle/src/io_async.zig +0 -600
  65. package/src/tigerbeetle/src/journal.zig +0 -567
  66. package/src/tigerbeetle/src/test_client.zig +0 -41
  67. package/src/tigerbeetle/src/test_main.zig +0 -118
  68. package/src/tigerbeetle/src/test_message_bus.zig +0 -132
  69. package/src/tigerbeetle/src/vr/journal.zig +0 -672
  70. package/src/tigerbeetle/src/vr/replica.zig +0 -3061
  71. package/src/tigerbeetle/src/vr.zig +0 -374
@@ -0,0 +1,4248 @@
1
+ const std = @import("std");
2
+ const Allocator = std.mem.Allocator;
3
+ const assert = std.debug.assert;
4
+
5
+ const config = @import("../config.zig");
6
+
7
+ const Message = @import("../message_pool.zig").MessagePool.Message;
8
+ const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
9
+
10
+ const vsr = @import("../vsr.zig");
11
+ const Header = vsr.Header;
12
+ const Timeout = vsr.Timeout;
13
+ const Command = vsr.Command;
14
+ const Version = vsr.Version;
15
+
16
+ const log = std.log.scoped(.replica);
17
+
18
/// The replica's protocol status, which determines which messages and timeouts it acts on:
pub const Status = enum {
    /// Processing and replicating operations (the steady state; also the default on init —
    /// see the TODO on the `status` field about deriving this from journal health instead).
    normal,
    /// Participating in a view change to establish a new view (and leader).
    view_change,
    /// Presumably entered while replica state is being recovered — not exercised in this
    /// chunk; TODO confirm against the recovery protocol handlers.
    recovering,
};
23
+
24
/// Maps a client id (u128) to its session entry. Unmanaged: the allocator is passed explicitly
/// on every mutating call (see `init()`/`deinit()`).
const ClientTable = std.AutoHashMapUnmanaged(u128, ClientTableEntry);

/// We found two bugs in the VRR paper relating to the client table:
///
/// 1. a correctness bug, where successive client crashes may cause request numbers to collide for
/// different request payloads, resulting in requests receiving the wrong reply, and
///
/// 2. a liveness bug, where if the client table is updated for request and prepare messages with
/// the client's latest request number, then the client may be locked out from the cluster if the
/// request is ever reordered through a view change.
///
/// We therefore take a different approach with the implementation of our client table, to:
///
/// 1. register client sessions explicitly through the state machine to ensure that client session
/// numbers always increase, and
///
/// 2. make a more careful distinction between uncommitted and committed request numbers,
/// considering that uncommitted requests may not survive a view change.
const ClientTableEntry = struct {
    /// The client's session number as committed to the cluster by a register request.
    session: u64,

    /// The reply sent to the client's latest committed request.
    /// Referenced (ref-counted) by this table; unref'd in `deinit()`.
    reply: *Message,
};
49
+
50
/// A slot in the leader's pipeline: an inflight prepare together with its prepare_ok votes.
const Prepare = struct {
    /// The current prepare message (used to cross-check prepare_ok messages, and for resending).
    message: *Message,

    /// Unique prepare_ok messages for the same view, op number and checksum from ALL replicas.
    ok_from_all_replicas: QuorumMessages = QuorumMessagesNull,

    /// Whether a quorum of prepare_ok messages has been received for this prepare.
    ok_quorum_received: bool = false,
};

/// One optional message slot per possible replica, for collecting quorum votes:
const QuorumMessages = [config.replicas_max]?*Message;

/// The all-null initializer for a `QuorumMessages` array:
const QuorumMessagesNull = [_]?*Message{null} ** config.replicas_max;
63
+
64
+ pub fn Replica(
65
+ comptime StateMachine: type,
66
+ comptime MessageBus: type,
67
+ comptime Storage: type,
68
+ comptime Time: type,
69
+ ) type {
70
+ return struct {
71
+ const Self = @This();
72
+
73
+ const Journal = vsr.Journal(Self, Storage);
74
+ const Clock = vsr.Clock(Time);
75
+
76
+ /// The number of the cluster to which this replica belongs:
77
+ cluster: u32,
78
+
79
+ /// The number of replicas in the cluster:
80
+ replica_count: u8,
81
+
82
+ /// The index of this replica's address in the configuration array held by the MessageBus:
83
+ replica: u8,
84
+
85
+ /// The minimum number of replicas required to form a replication quorum:
86
+ quorum_replication: u8,
87
+
88
+ /// The minimum number of replicas required to form a view change quorum:
89
+ quorum_view_change: u8,
90
+
91
+ /// A distributed fault-tolerant clock for lower and upper bounds on the leader's wall clock:
92
+ clock: Clock,
93
+
94
+ /// The persistent log of hash-chained journal entries:
95
+ journal: Journal,
96
+
97
+ /// An abstraction to send messages from the replica to another replica or client.
98
+ /// The message bus will also deliver messages to this replica by calling `on_message()`.
99
+ message_bus: *MessageBus,
100
+
101
+ /// For executing service up-calls after an operation has been committed:
102
+ state_machine: *StateMachine,
103
+
104
+ /// The client table records for each client the latest session and the latest committed reply.
105
+ client_table: ClientTable,
106
+
107
+ /// The current view, initially 0:
108
+ view: u32,
109
+
110
+ /// The latest view, in which the replica's status was normal.
111
+ view_normal: u32,
112
+
113
+ /// The current status, either normal, view_change, or recovering:
114
+ /// TODO Don't default to normal, set the starting status according to the journal's health.
115
+ status: Status = .normal,
116
+
117
+ /// The op number assigned to the most recently prepared operation:
118
+ op: u64,
119
+
120
+ /// The op number of the latest committed and executed operation (according to the replica):
121
+ /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
122
+ commit_min: u64,
123
+
124
+ /// The op number of the latest committed operation (according to the cluster):
125
+ /// This is the commit number in terms of the VRR paper.
126
+ commit_max: u64,
127
+
128
+ /// Whether we are reading a prepare from storage in order to commit.
129
+ committing: bool = false,
130
+
131
+ /// Whether we are reading a prepare from storage in order to push to the pipeline.
132
+ repairing_pipeline: bool = false,
133
+
134
+ /// The leader's pipeline of inflight prepares waiting to commit in FIFO order.
135
+ /// This allows us to pipeline without the complexity of out-of-order commits.
136
+ pipeline: RingBuffer(Prepare, config.pipelining_max) = .{},
137
+
138
+ /// In some cases, a replica may send a message to itself. We do not submit these messages
139
+ /// to the message bus but rather queue them here for guaranteed immediate delivery, which
140
+ /// we require and assert in our protocol implementation.
141
+ loopback_queue: ?*Message = null,
142
+
143
+ /// Unique start_view_change messages for the same view from OTHER replicas (excluding ourself).
144
+ start_view_change_from_other_replicas: QuorumMessages = QuorumMessagesNull,
145
+
146
+ /// Unique do_view_change messages for the same view from ALL replicas (including ourself).
147
+ do_view_change_from_all_replicas: QuorumMessages = QuorumMessagesNull,
148
+
149
+ /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
150
+ nack_prepare_from_other_replicas: QuorumMessages = QuorumMessagesNull,
151
+
152
+ /// Whether a replica has received a quorum of start_view_change messages for the view change:
153
+ start_view_change_quorum: bool = false,
154
+
155
+ /// Whether the leader has received a quorum of do_view_change messages for the view change:
156
+ /// Determines whether the leader may effect repairs according to the CTRL protocol.
157
+ do_view_change_quorum: bool = false,
158
+
159
+ /// Whether the leader is expecting to receive a nack_prepare and for which op:
160
+ nack_prepare_op: ?u64 = null,
161
+
162
+ /// The number of ticks before a leader or follower broadcasts a ping to other replicas.
163
+ /// TODO Explain why we need this (MessageBus handshaking, leapfrogging faulty replicas,
164
+ /// deciding whether starting a view change would be detrimental under some network partitions).
165
+ ping_timeout: Timeout,
166
+
167
+ /// The number of ticks without enough prepare_ok's before the leader resends a prepare.
168
+ prepare_timeout: Timeout,
169
+
170
+ /// The number of ticks before the leader sends a commit heartbeat:
171
+ /// The leader always sends a commit heartbeat irrespective of when it last sent a prepare.
172
+ /// This improves liveness when prepare messages cannot be replicated fully due to partitions.
173
+ commit_timeout: Timeout,
174
+
175
+ /// The number of ticks without hearing from the leader before starting a view change.
176
+ /// This transitions from .normal status to .view_change status.
177
+ normal_status_timeout: Timeout,
178
+
179
+ /// The number of ticks before a view change is timed out:
180
+ /// This transitions from `view_change` status to `view_change` status but for a newer view.
181
+ view_change_status_timeout: Timeout,
182
+
183
+ /// The number of ticks before resending a `start_view_change` or `do_view_change` message:
184
+ view_change_message_timeout: Timeout,
185
+
186
+ /// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
187
+ repair_timeout: Timeout,
188
+
189
+ /// Used to provide deterministic entropy to `choose_any_other_replica()`.
190
+ /// Incremented whenever `choose_any_other_replica()` is called.
191
+ choose_any_other_replica_ticks: u64 = 0,
192
+
193
+ /// Used to calculate exponential backoff with random jitter.
194
+ /// Seeded with the replica's index number.
195
+ prng: std.rand.DefaultPrng,
196
+
197
+ on_change_state: ?fn (replica: *Self) void = null,
198
+
199
/// Initializes a replica:
/// - derives the replication and view change quorums from `replica_count` (flexible quorums:
///   the two quorums must intersect),
/// - allocates the client table,
/// - constructs the canonical `init` prepare header that seeds the journal, and
/// - starts the timeouts appropriate to this replica's starting role (leader or follower).
///
/// The replica does not take ownership of `time`, `storage`, `message_bus` or `state_machine`
/// (see `deinit()`, which explicitly does not deinitialize them).
pub fn init(
    allocator: *Allocator,
    cluster: u32,
    replica_count: u8,
    replica: u8,
    time: *Time,
    storage: *Storage,
    message_bus: *MessageBus,
    state_machine: *StateMachine,
) !Self {
    assert(replica_count > 0);
    assert(replica < replica_count);

    // A simple majority of the cluster:
    const majority = (replica_count / 2) + 1;
    assert(majority <= replica_count);

    // The replication quorum is capped by config so replication stays cheap in large clusters:
    assert(config.quorum_replication_max >= 2);
    const quorum_replication = std.math.min(config.quorum_replication_max, majority);
    assert(quorum_replication >= 2 or quorum_replication == replica_count);

    const quorum_view_change = std.math.max(
        replica_count - quorum_replication + 1,
        majority,
    );
    // The view change quorum may be more expensive to make the replication quorum cheaper.
    // The insight is that the replication phase is by far more common than the view change.
    // This trade-off allows us to optimize for the common case.
    // See the comments in `config.zig` for further explanation.
    assert(quorum_view_change >= majority);

    if (replica_count <= 2) {
        assert(quorum_replication == replica_count);
        assert(quorum_view_change == replica_count);
    } else {
        assert(quorum_replication < replica_count);
        assert(quorum_view_change < replica_count);
    }

    // Flexible quorums are safe if these two quorums intersect so that this relation holds:
    assert(quorum_replication + quorum_view_change > replica_count);

    var client_table: ClientTable = .{};
    errdefer client_table.deinit(allocator);
    try client_table.ensureCapacity(allocator, @intCast(u32, config.clients_max));
    assert(client_table.capacity() >= config.clients_max);

    // The deterministic "root" prepare (op 0, view 0) that every replica's journal starts from:
    var init_prepare = Header{
        .parent = 0,
        .client = 0,
        .context = 0,
        .request = 0,
        .cluster = cluster,
        .epoch = 0,
        .view = 0,
        .op = 0,
        .commit = 0,
        .offset = 0,
        .size = @sizeOf(Header),
        .replica = 0,
        .command = .prepare,
        .operation = .init,
        .version = Version,
    };
    init_prepare.set_checksum_body(&[0]u8{});
    init_prepare.set_checksum();

    // NOTE(review): if `Journal.init` fails below, the Clock allocated just before it leaks
    // (no errdefer covers it) — consider hoisting both out of the literal with errdefers;
    // confirm first that Clock/Journal are safe to move by value.
    var self = Self{
        .cluster = cluster,
        .replica_count = replica_count,
        .replica = replica,
        .quorum_replication = quorum_replication,
        .quorum_view_change = quorum_view_change,
        .clock = try Clock.init(
            allocator,
            replica_count,
            replica,
            time,
        ),
        .journal = try Journal.init(
            allocator,
            storage,
            replica,
            config.journal_size_max,
            config.journal_headers_max,
            &init_prepare,
        ),
        .message_bus = message_bus,
        .state_machine = state_machine,
        .client_table = client_table,
        .view = init_prepare.view,
        .view_normal = init_prepare.view,
        .op = init_prepare.op,
        .commit_min = init_prepare.commit,
        .commit_max = init_prepare.commit,
        .ping_timeout = Timeout{
            .name = "ping_timeout",
            .id = replica,
            .after = 100,
        },
        .prepare_timeout = Timeout{
            .name = "prepare_timeout",
            .id = replica,
            .after = 50,
        },
        .commit_timeout = Timeout{
            .name = "commit_timeout",
            .id = replica,
            .after = 100,
        },
        .normal_status_timeout = Timeout{
            .name = "normal_status_timeout",
            .id = replica,
            .after = 500,
        },
        .view_change_status_timeout = Timeout{
            .name = "view_change_status_timeout",
            .id = replica,
            .after = 500,
        },
        .view_change_message_timeout = Timeout{
            .name = "view_change_message_timeout",
            .id = replica,
            .after = 50,
        },
        .repair_timeout = Timeout{
            .name = "repair_timeout",
            .id = replica,
            .after = 50,
        },
        // Seeded with the replica's index for deterministic-per-replica jitter:
        .prng = std.rand.DefaultPrng.init(replica),
    };

    log.debug("{}: init: replica_count={} quorum_view_change={} quorum_replication={}", .{
        self.replica,
        self.replica_count,
        self.quorum_view_change,
        self.quorum_replication,
    });

    // To reduce the probability of clustering, for efficient linear probing, the hash map will
    // always overallocate capacity by a factor of two.
    log.debug("{}: init: client_table.capacity()={} for config.clients_max={} entries", .{
        self.replica,
        self.client_table.capacity(),
        config.clients_max,
    });

    // We must initialize timeouts here, not in tick() on the first tick, because on_message()
    // can race with tick()... before timeouts have been initialized:
    assert(self.status == .normal);
    if (self.leader()) {
        log.debug("{}: init: leader", .{self.replica});
        self.ping_timeout.start();
        self.commit_timeout.start();
        self.repair_timeout.start();
    } else {
        log.debug("{}: init: follower", .{self.replica});
        self.ping_timeout.start();
        self.normal_status_timeout.start();
        self.repair_timeout.start();
    }

    return self;
}
363
+
364
/// Free all memory and unref all messages held by the replica
/// This does not deinitialize the StateMachine, MessageBus, Storage, or Time
pub fn deinit(self: *Self, allocator: *Allocator) void {
    self.journal.deinit(allocator);
    self.clock.deinit(allocator);

    // Unref the latest committed reply retained for each client session:
    {
        var it = self.client_table.iterator();
        while (it.next()) |entry| {
            self.message_bus.unref(entry.value_ptr.reply);
        }
        self.client_table.deinit(allocator);
    }

    // Unref every inflight prepare in the pipeline, plus its collected prepare_ok votes:
    {
        var it = self.pipeline.iterator();
        while (it.next()) |prepare| {
            self.message_bus.unref(prepare.message);
            for (prepare.ok_from_all_replicas) |message| {
                if (message) |m| self.message_bus.unref(m);
            }
        }
    }

    // Release any loopback message still queued for immediate self-delivery:
    if (self.loopback_queue) |loopback_message| {
        assert(loopback_message.next == null);
        self.message_bus.unref(loopback_message);
        self.loopback_queue = null;
    }

    // Unref any view change and repair protocol messages still held:
    for (self.start_view_change_from_other_replicas) |message| {
        if (message) |m| self.message_bus.unref(m);
    }
    for (self.do_view_change_from_all_replicas) |message| {
        if (message) |m| self.message_bus.unref(m);
    }
    for (self.nack_prepare_from_other_replicas) |message| {
        if (message) |m| self.message_bus.unref(m);
    }
}
404
+
405
/// Time is measured in logical ticks that are incremented on every call to tick().
/// This eliminates a dependency on the system time and enables deterministic testing.
pub fn tick(self: *Self) void {
    // Ensure that all asynchronous IO callbacks flushed the loopback queue as needed.
    // If an IO callback queues a loopback message without flushing the queue then this will
    // delay the delivery of messages (e.g. a prepare_ok from the leader to itself) and
    // decrease throughput significantly.
    assert(self.loopback_queue == null);

    self.clock.tick();

    // Drive journal recovery to completion before acting on any protocol timeouts:
    if (!self.journal.recovered) {
        self.journal.recover();
        return;
    } else {
        assert(!self.journal.recovering);
    }

    // Advance every timeout, then dispatch any that have fired:
    self.ping_timeout.tick();
    self.prepare_timeout.tick();
    self.commit_timeout.tick();
    self.normal_status_timeout.tick();
    self.view_change_status_timeout.tick();
    self.view_change_message_timeout.tick();
    self.repair_timeout.tick();

    if (self.ping_timeout.fired()) self.on_ping_timeout();
    if (self.prepare_timeout.fired()) self.on_prepare_timeout();
    if (self.commit_timeout.fired()) self.on_commit_timeout();
    if (self.normal_status_timeout.fired()) self.on_normal_status_timeout();
    if (self.view_change_status_timeout.fired()) self.on_view_change_status_timeout();
    if (self.view_change_message_timeout.fired()) self.on_view_change_message_timeout();
    if (self.repair_timeout.fired()) self.on_repair_timeout();

    // None of the on_timeout() functions above should send a message to this replica.
    assert(self.loopback_queue == null);
}
442
+
443
/// Called by the MessageBus to deliver a message to the replica.
/// Validates the header, rejects messages from the wrong cluster, defers everything until
/// the journal has recovered, then dispatches on the message's command.
pub fn on_message(self: *Self, message: *Message) void {
    assert(self.loopback_queue == null);

    log.debug("{}: on_message: view={} status={s} {}", .{
        self.replica,
        self.view,
        @tagName(self.status),
        message.header,
    });

    // Drop structurally invalid headers outright:
    if (message.header.invalid()) |reason| {
        log.alert("{}: on_message: invalid ({s})", .{ self.replica, reason });
        return;
    }

    // No client or replica should ever send a .reserved message.
    assert(message.header.command != .reserved);

    if (message.header.cluster != self.cluster) {
        log.warn("{}: on_message: wrong cluster (cluster must be {} not {})", .{
            self.replica,
            self.cluster,
            message.header.cluster,
        });
        return;
    }

    // Defer message processing until the journal has recovered:
    if (!self.journal.recovered) {
        self.journal.recover();
        log.debug("{}: on_message: waiting for journal to recover", .{self.replica});
        return;
    } else {
        assert(!self.journal.recovering);
    }

    assert(message.header.replica < self.replica_count);
    switch (message.header.command) {
        .ping => self.on_ping(message),
        .pong => self.on_pong(message),
        .request => self.on_request(message),
        .prepare => self.on_prepare(message),
        .prepare_ok => self.on_prepare_ok(message),
        .commit => self.on_commit(message),
        .start_view_change => self.on_start_view_change(message),
        .do_view_change => self.on_do_view_change(message),
        .start_view => self.on_start_view(message),
        .recovery => self.on_recovery(message),
        .recovery_response => return, // TODO
        .request_start_view => self.on_request_start_view(message),
        .request_prepare => self.on_request_prepare(message),
        .request_headers => self.on_request_headers(message),
        .headers => self.on_headers(message),
        .nack_prepare => self.on_nack_prepare(message),
        // A replica should never handle misdirected messages intended for a client:
        .eviction, .reply => {
            log.warn("{}: on_message: ignoring misdirected {s} message", .{
                self.replica,
                @tagName(message.header.command),
            });
            return;
        },
        .reserved => unreachable,
    }

    // Diagnose which handler queued a loopback message without flushing it:
    if (self.loopback_queue) |loopback_message| {
        log.emerg("{}: on_message: on_{s}() queued a {s} loopback message with no flush", .{
            self.replica,
            @tagName(message.header.command),
            @tagName(loopback_message.header.command),
        });
    }

    // Any message handlers that loopback must take responsibility for the flush.
    assert(self.loopback_queue == null);
}
519
+
520
/// Replies to a ping with a pong: to a client, the pong carries only our view (and only in
/// normal status); to another replica, the pong echoes the ping's monotonic timestamp and
/// adds our wall clock sample, for clock synchronization (see `on_pong()`).
fn on_ping(self: *Self, message: *const Message) void {
    if (self.status != .normal and self.status != .view_change) return;

    assert(self.status == .normal or self.status == .view_change);

    // TODO Drop pings that were not addressed to us.

    var pong = Header{
        .command = .pong,
        .cluster = self.cluster,
        .replica = self.replica,
        .view = self.view,
    };

    if (message.header.client > 0) {
        assert(message.header.replica == 0);

        // We must only ever send our view number to a client via a pong message if we are
        // in normal status. Otherwise, we may be partitioned from the cluster with a newer
        // view number, leak this to the client, which would then pass this to the cluster
        // in subsequent client requests, which would then ignore these client requests with
        // a newer view number, locking out the client. The principle here is that we must
        // never send view numbers for views that have not yet started.
        if (self.status == .normal) {
            self.send_header_to_client(message.header.client, pong);
        }
    } else if (message.header.replica == self.replica) {
        log.warn("{}: on_ping: ignoring (self)", .{self.replica});
    } else {
        // Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
        pong.op = message.header.op;
        pong.offset = @bitCast(u64, self.clock.realtime());
        self.send_header_to_replica(message.header.replica, pong);
    }
}
555
+
556
/// Learns from a pong: feeds the peer's clock samples into our fault-tolerant clock.
fn on_pong(self: *Self, message: *const Message) void {
    // Pongs from clients carry no clock samples, and our own pongs teach us nothing:
    if (message.header.client > 0 or message.header.replica == self.replica) return;

    // The pong echoes our original monotonic timestamp in `op`, and carries the peer's
    // wall clock sample (bit-cast to u64 for transport) in `offset` (see `on_ping()`):
    const m0_monotonic = message.header.op;
    const t1_realtime = @bitCast(i64, message.header.offset);
    const m2_monotonic = self.clock.monotonic();

    self.clock.learn(message.header.replica, m0_monotonic, t1_realtime, m2_monotonic);
}
566
+
567
/// The primary advances op-number, adds the request to the end of the log, and updates the
/// information for this client in the client-table to contain the new request number, s.
/// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the current
/// view-number, m is the message it received from the client, n is the op-number it assigned to
/// the request, and k is the commit-number.
///
/// Note: the request message is converted in place into the prepare (header fields are
/// rewritten and the checksums recomputed) and then pushed onto the pipeline with a ref.
fn on_request(self: *Self, message: *Message) void {
    if (self.ignore_request_message(message)) return;

    assert(self.status == .normal);
    assert(self.leader());
    assert(self.commit_min == self.commit_max);
    assert(self.commit_max + self.pipeline.count == self.op);

    assert(message.header.command == .request);
    assert(message.header.view <= self.view); // The client's view may be behind ours.

    // Prepares are timestamped; drop the request if we cannot produce a synchronized time:
    const realtime = self.clock.realtime_synchronized() orelse {
        log.alert("{}: on_request: dropping (clock not synchronized)", .{self.replica});
        return;
    };

    log.debug("{}: on_request: request {}", .{ self.replica, message.header.checksum });

    self.state_machine.prepare(
        realtime,
        message.header.operation.cast(StateMachine),
        message.body(),
    );

    // Convert the request into a prepare, hash-chained onto the latest journal entry:
    // (`const`: the entry is only read — previously a `var`, which was never mutated.)
    const latest_entry = self.journal.entry_for_op_exact(self.op).?;
    message.header.parent = latest_entry.checksum;
    message.header.context = message.header.checksum;
    message.header.view = self.view;
    message.header.op = self.op + 1;
    message.header.commit = self.commit_max;
    message.header.offset = self.journal.next_offset(latest_entry);
    message.header.replica = self.replica;
    message.header.command = .prepare;

    message.header.set_checksum_body(message.body());
    message.header.set_checksum();

    log.debug("{}: on_request: prepare {}", .{ self.replica, message.header.checksum });

    self.pipeline.push(.{ .message = message.ref() }) catch unreachable;
    assert(self.pipeline.count >= 1);

    if (self.pipeline.count == 1) {
        // This is the only prepare in the pipeline, start the timeout:
        assert(!self.prepare_timeout.ticking);
        self.prepare_timeout.start();
    } else {
        // Do not restart the prepare timeout as it is already ticking for another prepare.
        assert(self.prepare_timeout.ticking);
    }

    self.on_prepare(message);

    // We expect `on_prepare()` to increment `self.op` to match the leader's latest prepare:
    // This is critical to ensure that pipelined prepares do not receive the same op number.
    assert(self.op == message.header.op);
}
629
+
630
/// Replication is simple, with a single code path for the leader and followers.
///
/// The leader starts by sending a prepare message to itself.
///
/// Each replica (including the leader) then forwards this prepare message to the next replica
/// in the configuration, in parallel to writing to its own journal, closing the circle until
/// the next replica is back to the leader, in which case the replica does not forward.
///
/// This keeps the leader's outgoing bandwidth limited (one-for-one) to incoming bandwidth,
/// since the leader need only replicate to the next replica. Otherwise, the leader would need
/// to replicate to multiple followers, dividing available bandwidth.
///
/// This does not impact latency, since with Flexible Paxos we need only one remote prepare_ok.
/// It is ideal if this synchronous replication to one remote replica is to the next replica,
/// since that is the replica next in line to be leader, which will need to be up-to-date before
/// it can start the next view.
///
/// At the same time, asynchronous replication keeps going, so that if our local disk is slow,
/// then any latency spike will be masked by more remote prepare_ok messages as they come in.
/// This gives automatic tail latency tolerance for storage latency spikes.
///
/// The remaining problem then is tail latency tolerance for network latency spikes.
/// If the next replica is down or partitioned, then the leader's prepare timeout will fire,
/// and the leader will resend but to another replica, until it receives enough prepare_ok's.
fn on_prepare(self: *Self, message: *Message) void {
    self.view_jump(message.header);

    // Route repairs (see `is_repair()`) to the repair path, not the replication path:
    if (self.is_repair(message)) {
        log.debug("{}: on_prepare: ignoring (repair)", .{self.replica});
        self.on_repair(message);
        return;
    }

    if (self.status != .normal) {
        log.debug("{}: on_prepare: ignoring ({})", .{ self.replica, self.status });
        return;
    }

    if (message.header.view < self.view) {
        log.debug("{}: on_prepare: ignoring (older view)", .{self.replica});
        return;
    }

    if (message.header.view > self.view) {
        log.debug("{}: on_prepare: ignoring (newer view)", .{self.replica});
        return;
    }

    assert(self.status == .normal);
    assert(message.header.view == self.view);
    assert(self.leader() or self.follower());
    assert(message.header.replica == self.leader_index(message.header.view));
    assert(message.header.op > self.op);
    assert(message.header.op > self.commit_min);

    // Hearing a valid prepare from the leader means the view is healthy:
    if (self.follower()) self.normal_status_timeout.reset();

    // If we have missed intervening prepares, jump forward before accepting this one:
    if (message.header.op > self.op + 1) {
        log.debug("{}: on_prepare: newer op", .{self.replica});
        self.jump_to_newer_op_in_normal_status(message.header);
    }

    if (self.journal.previous_entry(message.header)) |previous| {
        // Any previous entry may be a whole journal's worth of ops behind due to wrapping.
        // We therefore do not do any further op, offset or checksum assertions beyond this:
        self.panic_if_hash_chain_would_break_in_the_same_view(previous, message.header);
    }

    // We must advance our op and set the header as dirty before replicating and journalling.
    // The leader needs this before its journal is outrun by any prepare_ok quorum:
    log.debug("{}: on_prepare: advancing: op={}..{} checksum={}..{}", .{
        self.replica,
        self.op,
        message.header.op,
        message.header.parent,
        message.header.checksum,
    });
    assert(message.header.op == self.op + 1);
    self.op = message.header.op;
    self.journal.set_entry_as_dirty(message.header);

    // Forward to the next replica and write to our own journal, in parallel:
    self.replicate(message);
    self.append(message);

    if (self.follower()) {
        // A prepare may already be committed if requested by repair() so take the max:
        self.commit_ops(std.math.max(message.header.commit, self.commit_max));
        assert(self.commit_max >= message.header.commit);
    }
}
720
+
721
+ fn on_prepare_ok(self: *Self, message: *Message) void {
722
+ if (self.ignore_prepare_ok(message)) return;
723
+
724
+ assert(self.status == .normal);
725
+ assert(message.header.view == self.view);
726
+ assert(self.leader());
727
+
728
+ const prepare = self.pipeline_prepare_for_prepare_ok(message) orelse return;
729
+
730
+ assert(prepare.message.header.checksum == message.header.context);
731
+ assert(prepare.message.header.op >= self.commit_max + 1);
732
+ assert(prepare.message.header.op <= self.commit_max + self.pipeline.count);
733
+ assert(prepare.message.header.op <= self.op);
734
+
735
+ // Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
736
+ const threshold = self.quorum_replication;
737
+
738
+ const count = self.add_message_and_receive_quorum_exactly_once(
739
+ &prepare.ok_from_all_replicas,
740
+ message,
741
+ threshold,
742
+ ) orelse return;
743
+
744
+ assert(count == threshold);
745
+ assert(!prepare.ok_quorum_received);
746
+ prepare.ok_quorum_received = true;
747
+
748
+ log.debug("{}: on_prepare_ok: quorum received, context={}", .{
749
+ self.replica,
750
+ prepare.message.header.checksum,
751
+ });
752
+
753
+ self.commit_pipeline();
754
+ }
755
+
756
+ /// Known issue:
757
+ /// TODO The leader should stand down if it sees too many retries in on_prepare_timeout().
758
+ /// It's possible for the network to be one-way partitioned so that followers don't see the
759
+ /// leader as down, but neither can the leader hear from the followers.
760
+ fn on_commit(self: *Self, message: *const Message) void {
761
+ self.view_jump(message.header);
762
+
763
+ if (self.status != .normal) {
764
+ log.debug("{}: on_commit: ignoring ({})", .{ self.replica, self.status });
765
+ return;
766
+ }
767
+
768
+ if (message.header.view < self.view) {
769
+ log.debug("{}: on_commit: ignoring (older view)", .{self.replica});
770
+ return;
771
+ }
772
+
773
+ if (message.header.view > self.view) {
774
+ log.debug("{}: on_commit: ignoring (newer view)", .{self.replica});
775
+ return;
776
+ }
777
+
778
+ if (self.leader()) {
779
+ log.warn("{}: on_commit: ignoring (leader)", .{self.replica});
780
+ return;
781
+ }
782
+
783
+ assert(self.status == .normal);
784
+ assert(self.follower());
785
+ assert(message.header.view == self.view);
786
+ assert(message.header.replica == self.leader_index(message.header.view));
787
+
788
+ // We may not always have the latest commit entry but if we do our checksum must match:
789
+ if (self.journal.entry_for_op_exact(message.header.commit)) |commit_entry| {
790
+ if (commit_entry.checksum == message.header.context) {
791
+ log.debug("{}: on_commit: checksum verified", .{self.replica});
792
+ } else if (self.valid_hash_chain("on_commit")) {
793
+ @panic("commit checksum verification failed");
794
+ } else {
795
+ // We may still be repairing after receiving the start_view message.
796
+ log.debug("{}: on_commit: skipping checksum verification", .{self.replica});
797
+ }
798
+ }
799
+
800
+ self.normal_status_timeout.reset();
801
+
802
+ self.commit_ops(message.header.commit);
803
+ }
804
+
805
+ fn on_repair(self: *Self, message: *Message) void {
806
+ assert(message.header.command == .prepare);
807
+
808
+ if (self.status != .normal and self.status != .view_change) {
809
+ log.debug("{}: on_repair: ignoring ({})", .{ self.replica, self.status });
810
+ return;
811
+ }
812
+
813
+ if (message.header.view > self.view) {
814
+ log.debug("{}: on_repair: ignoring (newer view)", .{self.replica});
815
+ return;
816
+ }
817
+
818
+ if (self.status == .view_change and message.header.view == self.view) {
819
+ log.debug("{}: on_repair: ignoring (view started)", .{self.replica});
820
+ return;
821
+ }
822
+
823
+ if (self.status == .view_change and self.leader_index(self.view) != self.replica) {
824
+ log.debug("{}: on_repair: ignoring (view change, follower)", .{self.replica});
825
+ return;
826
+ }
827
+
828
+ if (self.status == .view_change and !self.do_view_change_quorum) {
829
+ log.debug("{}: on_repair: ignoring (view change, waiting for quorum)", .{self.replica});
830
+ return;
831
+ }
832
+
833
+ if (message.header.op > self.op) {
834
+ assert(message.header.view < self.view);
835
+ log.debug("{}: on_repair: ignoring (would advance self.op)", .{self.replica});
836
+ return;
837
+ }
838
+
839
+ assert(self.status == .normal or self.status == .view_change);
840
+ assert(self.repairs_allowed());
841
+ assert(message.header.view <= self.view);
842
+ assert(message.header.op <= self.op); // Repairs may never advance `self.op`.
843
+
844
+ if (self.journal.has_clean(message.header)) {
845
+ log.debug("{}: on_repair: ignoring (duplicate)", .{self.replica});
846
+ self.send_prepare_ok(message.header);
847
+ defer self.flush_loopback_queue();
848
+ return;
849
+ }
850
+
851
+ if (self.repair_header(message.header)) {
852
+ assert(self.journal.has_dirty(message.header));
853
+
854
+ if (self.nack_prepare_op) |nack_prepare_op| {
855
+ if (nack_prepare_op == message.header.op) {
856
+ log.debug("{}: on_repair: repairing uncommitted op={}", .{
857
+ self.replica,
858
+ message.header.op,
859
+ });
860
+ self.reset_quorum_nack_prepare();
861
+ }
862
+ }
863
+
864
+ log.debug("{}: on_repair: repairing journal", .{self.replica});
865
+ self.write_prepare(message, .repair);
866
+ }
867
+ }
868
+
869
+ fn on_start_view_change(self: *Self, message: *Message) void {
870
+ if (self.ignore_view_change_message(message)) return;
871
+
872
+ assert(self.status == .normal or self.status == .view_change);
873
+ assert(message.header.view >= self.view);
874
+ assert(message.header.replica != self.replica);
875
+
876
+ self.view_jump(message.header);
877
+
878
+ assert(self.status == .view_change);
879
+ assert(message.header.view == self.view);
880
+
881
+ if (self.leader_index(self.view) == self.replica) {
882
+ // If we are the leader of the new view, then wait until we have a message to send a
883
+ // do_view_change message to ourself. The on_do_view_change() handler will panic if
884
+ // we received a start_view_change quorum without a do_view_change to ourself.
885
+ if (self.message_bus.get_message()) |available| {
886
+ self.message_bus.unref(available);
887
+ } else {
888
+ log.alert("{}: on_start_view_change: waiting for message for do_view_change", .{
889
+ self.replica,
890
+ });
891
+ return;
892
+ }
893
+ }
894
+
895
+ // Wait until we have `f` messages (excluding ourself) for quorum:
896
+ assert(self.replica_count > 1);
897
+ const threshold = self.quorum_view_change - 1;
898
+
899
+ const count = self.add_message_and_receive_quorum_exactly_once(
900
+ &self.start_view_change_from_other_replicas,
901
+ message,
902
+ threshold,
903
+ ) orelse return;
904
+
905
+ assert(count == threshold);
906
+ assert(self.start_view_change_from_other_replicas[self.replica] == null);
907
+ log.debug("{}: on_start_view_change: view={} quorum received", .{
908
+ self.replica,
909
+ self.view,
910
+ });
911
+
912
+ assert(!self.start_view_change_quorum);
913
+ assert(!self.do_view_change_quorum);
914
+ self.start_view_change_quorum = true;
915
+
916
+ // When replica i receives start_view_change messages for its view from f other replicas,
917
+ // it sends a ⟨do_view_change v, l, v’, n, k, i⟩ message to the node that will be the
918
+ // primary in the new view. Here v is its view, l is its log, v′ is the view number of the
919
+ // latest view in which its status was normal, n is the op number, and k is the commit
920
+ // number.
921
+ self.send_do_view_change();
922
+ defer self.flush_loopback_queue();
923
+ }
924
+
925
+ /// When the new primary receives f + 1 do_view_change messages from different replicas
926
+ /// (including itself), it sets its view number to that in the messages and selects as the
927
+ /// new log the one contained in the message with the largest v′; if several messages have
928
+ /// the same v′ it selects the one among them with the largest n. It sets its op number to
929
+ /// that of the topmost entry in the new log, sets its commit number to the largest such
930
+ /// number it received in the do_view_change messages, changes its status to normal, and
931
+ /// informs the other replicas of the completion of the view change by sending
932
+ /// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
933
+ /// op number, and k is the commit number.
934
+ fn on_do_view_change(self: *Self, message: *Message) void {
935
+ if (self.ignore_view_change_message(message)) return;
936
+
937
+ assert(self.status == .normal or self.status == .view_change);
938
+ assert(message.header.view >= self.view);
939
+ assert(self.leader_index(message.header.view) == self.replica);
940
+
941
+ self.view_jump(message.header);
942
+
943
+ assert(self.status == .view_change);
944
+ assert(message.header.view == self.view);
945
+
946
+ // We may receive a `do_view_change` quorum from other replicas, which already have a
947
+ // `start_view_change_quorum`, before we receive a `start_view_change_quorum`:
948
+ if (!self.start_view_change_quorum) {
949
+ log.debug("{}: on_do_view_change: waiting for start_view_change quorum", .{
950
+ self.replica,
951
+ });
952
+ return;
953
+ }
954
+
955
+ // Wait until we have `f + 1` messages (including ourself) for quorum:
956
+ assert(self.replica_count > 1);
957
+ const threshold = self.quorum_view_change;
958
+
959
+ const count = self.add_message_and_receive_quorum_exactly_once(
960
+ &self.do_view_change_from_all_replicas,
961
+ message,
962
+ threshold,
963
+ ) orelse return;
964
+
965
+ assert(count == threshold);
966
+ assert(self.do_view_change_from_all_replicas[self.replica] != null);
967
+ log.debug("{}: on_do_view_change: view={} quorum received", .{
968
+ self.replica,
969
+ self.view,
970
+ });
971
+
972
+ var v: ?u32 = null;
973
+ var k: ?u64 = null;
974
+ var latest = Header.reserved();
975
+
976
+ for (self.do_view_change_from_all_replicas) |received, replica| {
977
+ if (received) |m| {
978
+ assert(m.header.command == .do_view_change);
979
+ assert(m.header.cluster == self.cluster);
980
+ assert(m.header.replica == replica);
981
+ assert(m.header.view == self.view);
982
+
983
+ // The latest normal view experienced by this replica:
984
+ // This may be higher than the view in any of the prepare headers.
985
+ var replica_view_normal = @intCast(u32, m.header.offset);
986
+ assert(replica_view_normal < m.header.view);
987
+
988
+ var replica_latest = Header.reserved();
989
+ self.set_latest_op(self.message_body_as_headers(m), &replica_latest);
990
+ assert(replica_latest.op == m.header.op);
991
+
992
+ log.debug(
993
+ "{}: on_do_view_change: replica={} v'={} op={} commit={} latest={}",
994
+ .{
995
+ self.replica,
996
+ m.header.replica,
997
+ replica_view_normal,
998
+ m.header.op,
999
+ m.header.commit,
1000
+ replica_latest,
1001
+ },
1002
+ );
1003
+
1004
+ if (v == null or replica_view_normal > v.?) {
1005
+ v = replica_view_normal;
1006
+ latest = replica_latest;
1007
+ } else if (replica_view_normal == v.? and replica_latest.op > latest.op) {
1008
+ v = replica_view_normal;
1009
+ latest = replica_latest;
1010
+ }
1011
+
1012
+ if (k == null or m.header.commit > k.?) k = m.header.commit;
1013
+ }
1014
+ }
1015
+
1016
+ self.set_latest_op_and_k(&latest, k.?, "on_do_view_change");
1017
+
1018
+ // Now that we have the latest op in place, repair any other headers:
1019
+ for (self.do_view_change_from_all_replicas) |received| {
1020
+ if (received) |m| {
1021
+ for (self.message_body_as_headers(m)) |*h| {
1022
+ _ = self.repair_header(h);
1023
+ }
1024
+ }
1025
+ }
1026
+
1027
+ // Verify that the repairs above have not replaced or advanced the latest op:
1028
+ assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
1029
+
1030
+ assert(self.start_view_change_quorum);
1031
+ assert(!self.do_view_change_quorum);
1032
+ self.do_view_change_quorum = true;
1033
+
1034
+ self.discard_uncommitted_headers();
1035
+ assert(self.op >= self.commit_max);
1036
+ assert(self.journal.entry_for_op_exact(self.op) != null);
1037
+
1038
+ // Start repairs according to the CTRL protocol:
1039
+ assert(!self.repair_timeout.ticking);
1040
+ self.repair_timeout.start();
1041
+ self.repair();
1042
+ }
1043
+
1044
+ /// When other replicas receive the start_view message, they replace their log with the one
1045
+ /// in the message, set their op number to that of the latest entry in the log, set their
1046
+ /// view number to the view number in the message, change their status to normal, and update
1047
+ /// the information in their client table. If there are non-committed operations in the log,
1048
+ /// they send a ⟨prepare_ok v, n, i⟩ message to the primary; here n is the op-number. Then
1049
+ /// they execute all operations known to be committed that they haven’t executed previously,
1050
+ /// advance their commit number, and update the information in their client table.
1051
+ fn on_start_view(self: *Self, message: *const Message) void {
1052
+ if (self.ignore_view_change_message(message)) return;
1053
+
1054
+ assert(self.status == .view_change or self.status == .normal);
1055
+ assert(message.header.view >= self.view);
1056
+ assert(message.header.replica != self.replica);
1057
+ assert(message.header.replica == self.leader_index(message.header.view));
1058
+
1059
+ self.view_jump(message.header);
1060
+
1061
+ assert(self.status == .view_change);
1062
+ assert(message.header.view == self.view);
1063
+
1064
+ var latest = Header.reserved();
1065
+ self.set_latest_op(self.message_body_as_headers(message), &latest);
1066
+ assert(latest.op == message.header.op);
1067
+
1068
+ self.set_latest_op_and_k(&latest, message.header.commit, "on_start_view");
1069
+
1070
+ // Now that we have the latest op in place, repair any other headers:
1071
+ for (self.message_body_as_headers(message)) |*h| {
1072
+ _ = self.repair_header(h);
1073
+ }
1074
+
1075
+ // Verify that the repairs above have not replaced or advanced the latest op:
1076
+ assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
1077
+
1078
+ if (self.status == .view_change) {
1079
+ self.transition_to_normal_status(message.header.view);
1080
+ self.send_prepare_oks_after_view_change();
1081
+ }
1082
+
1083
+ assert(self.status == .normal);
1084
+ assert(message.header.view == self.view);
1085
+ assert(self.follower());
1086
+
1087
+ self.commit_ops(self.commit_max);
1088
+
1089
+ self.repair();
1090
+ }
1091
+
1092
+ fn on_request_start_view(self: *Self, message: *const Message) void {
1093
+ if (self.ignore_repair_message(message)) return;
1094
+
1095
+ assert(self.status == .normal);
1096
+ assert(message.header.view == self.view);
1097
+ assert(message.header.replica != self.replica);
1098
+ assert(self.leader());
1099
+
1100
+ const start_view = self.create_view_change_message(.start_view) orelse {
1101
+ log.alert("{}: on_request_start_view: dropping start_view, no message available", .{
1102
+ self.replica,
1103
+ });
1104
+ return;
1105
+ };
1106
+ defer self.message_bus.unref(start_view);
1107
+
1108
+ assert(start_view.references == 1);
1109
+ assert(start_view.header.command == .start_view);
1110
+ assert(start_view.header.view == self.view);
1111
+ assert(start_view.header.op == self.op);
1112
+ assert(start_view.header.commit == self.commit_max);
1113
+
1114
+ self.send_message_to_replica(message.header.replica, start_view);
1115
+ }
1116
+
1117
+ /// TODO This is a work in progress (out of scope for the bounty)
1118
+ fn on_recovery(self: *Self, message: *const Message) void {
1119
+ if (self.status != .normal) {
1120
+ log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
1121
+ return;
1122
+ }
1123
+
1124
+ if (message.header.replica == self.replica) {
1125
+ log.warn("{}: on_recovery: ignoring (self)", .{self.replica});
1126
+ return;
1127
+ }
1128
+
1129
+ const response = self.message_bus.get_message() orelse {
1130
+ log.alert("{}: on_recovery: ignoring (waiting for message)", .{self.replica});
1131
+ return;
1132
+ };
1133
+ defer self.message_bus.unref(response);
1134
+
1135
+ response.header.* = .{
1136
+ .command = .recovery_response,
1137
+ .cluster = self.cluster,
1138
+ .context = message.header.context,
1139
+ .replica = self.replica,
1140
+ .view = self.view,
1141
+ .op = self.op,
1142
+ .commit = self.commit_max,
1143
+ };
1144
+
1145
+ const count_max = 8; // The number of prepare headers to include in the body.
1146
+
1147
+ const size_max = @sizeOf(Header) * std.math.min(
1148
+ std.math.max(@divFloor(response.buffer.len, @sizeOf(Header)), 2),
1149
+ 1 + count_max,
1150
+ );
1151
+ assert(size_max > @sizeOf(Header));
1152
+
1153
+ const count = self.journal.copy_latest_headers_between(
1154
+ 0,
1155
+ self.op,
1156
+ std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
1157
+ );
1158
+
1159
+ // We expect that self.op always exists.
1160
+ assert(count > 0);
1161
+
1162
+ response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
1163
+
1164
+ response.header.set_checksum_body(response.body());
1165
+ response.header.set_checksum();
1166
+
1167
+ assert(self.status == .normal);
1168
+ // The checksum for a recovery message is deterministic, and cannot be used as a nonce:
1169
+ assert(response.header.context != message.header.checksum);
1170
+
1171
+ self.send_message_to_replica(message.header.replica, response);
1172
+ }
1173
+
1174
+ /// TODO This is a work in progress (out of scope for the bounty)
1175
+ fn on_recovery_response(self: *Self, message: *Message) void {}
1176
+
1177
+ fn on_request_prepare(self: *Self, message: *const Message) void {
1178
+ if (self.ignore_repair_message(message)) return;
1179
+
1180
+ assert(self.status == .normal or self.status == .view_change);
1181
+ assert(message.header.view == self.view);
1182
+ assert(message.header.replica != self.replica);
1183
+
1184
+ const op = message.header.op;
1185
+ var checksum: ?u128 = message.header.context;
1186
+ if (self.leader_index(self.view) == self.replica and checksum.? == 0) checksum = null;
1187
+
1188
+ if (self.journal.entry_for_op_exact_with_checksum(op, checksum)) |entry| {
1189
+ assert(entry.op == op);
1190
+ assert(checksum == null or entry.checksum == checksum.?);
1191
+
1192
+ if (!self.journal.dirty.bit(op)) {
1193
+ assert(!self.journal.faulty.bit(op));
1194
+
1195
+ log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
1196
+ self.replica,
1197
+ op,
1198
+ checksum,
1199
+ });
1200
+
1201
+ // TODO Do not reissue the read if we are already reading in order to send to
1202
+ // this particular destination replica.
1203
+
1204
+ self.journal.read_prepare(
1205
+ on_request_prepare_read,
1206
+ op,
1207
+ entry.checksum,
1208
+ message.header.replica,
1209
+ );
1210
+
1211
+ // We have guaranteed the prepare and our copy is clean (not safe to nack).
1212
+ return;
1213
+ } else if (self.journal.faulty.bit(op)) {
1214
+ log.debug("{}: on_request_prepare: op={} checksum={} faulty", .{
1215
+ self.replica,
1216
+ op,
1217
+ checksum,
1218
+ });
1219
+
1220
+ // We have gauranteed the prepare but our copy is faulty (not safe to nack).
1221
+ return;
1222
+ }
1223
+
1224
+ // We know of the prepare but we have yet to write or guarantee it (safe to nack).
1225
+ // Continue through below...
1226
+ }
1227
+
1228
+ if (self.status == .view_change) {
1229
+ assert(message.header.replica == self.leader_index(self.view));
1230
+ assert(checksum != null);
1231
+ if (self.journal.entry_for_op_exact_with_checksum(op, checksum) != null) {
1232
+ assert(self.journal.dirty.bit(op) and !self.journal.faulty.bit(op));
1233
+ }
1234
+
1235
+ log.debug("{}: on_request_prepare: op={} checksum={} nacking", .{
1236
+ self.replica,
1237
+ op,
1238
+ checksum,
1239
+ });
1240
+
1241
+ self.send_header_to_replica(message.header.replica, .{
1242
+ .command = .nack_prepare,
1243
+ .context = checksum.?,
1244
+ .cluster = self.cluster,
1245
+ .replica = self.replica,
1246
+ .view = self.view,
1247
+ .op = op,
1248
+ });
1249
+ }
1250
+ }
1251
+
1252
+ fn on_request_prepare_read(self: *Self, prepare: ?*Message, destination_replica: ?u8) void {
1253
+ const message = prepare orelse {
1254
+ log.debug("{}: on_request_prepare_read: prepare=null", .{self.replica});
1255
+ return;
1256
+ };
1257
+
1258
+ log.debug("{}: on_request_prepare_read: op={} checksum={} sending to replica={}", .{
1259
+ self.replica,
1260
+ message.header.op,
1261
+ message.header.checksum,
1262
+ destination_replica.?,
1263
+ });
1264
+
1265
+ assert(destination_replica.? != self.replica);
1266
+ self.send_message_to_replica(destination_replica.?, message);
1267
+ }
1268
+
1269
+ fn on_request_headers(self: *Self, message: *const Message) void {
1270
+ if (self.ignore_repair_message(message)) return;
1271
+
1272
+ assert(self.status == .normal or self.status == .view_change);
1273
+ assert(message.header.view == self.view);
1274
+ assert(message.header.replica != self.replica);
1275
+
1276
+ const response = self.message_bus.get_message() orelse {
1277
+ log.alert("{}: on_request_headers: ignoring (op={}..{}, no message available)", .{
1278
+ self.replica,
1279
+ message.header.commit,
1280
+ message.header.op,
1281
+ });
1282
+ return;
1283
+ };
1284
+ defer self.message_bus.unref(response);
1285
+
1286
+ response.header.* = .{
1287
+ .command = .headers,
1288
+ // We echo the context back to the replica so that they can match up our response:
1289
+ .context = message.header.context,
1290
+ .cluster = self.cluster,
1291
+ .replica = self.replica,
1292
+ .view = self.view,
1293
+ };
1294
+
1295
+ const op_min = message.header.commit;
1296
+ const op_max = message.header.op;
1297
+ assert(op_max >= op_min);
1298
+
1299
+ // We must add 1 because op_max and op_min are both inclusive:
1300
+ const count_max = @intCast(u32, std.math.min(64, op_max - op_min + 1));
1301
+ assert(count_max > 0);
1302
+
1303
+ const size_max = @sizeOf(Header) * std.math.min(
1304
+ std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
1305
+ 1 + count_max,
1306
+ );
1307
+ assert(size_max > @sizeOf(Header));
1308
+
1309
+ const count = self.journal.copy_latest_headers_between(
1310
+ op_min,
1311
+ op_max,
1312
+ std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
1313
+ );
1314
+
1315
+ if (count == 0) {
1316
+ log.debug("{}: on_request_headers: ignoring (op={}..{}, no headers)", .{
1317
+ self.replica,
1318
+ op_min,
1319
+ op_max,
1320
+ });
1321
+ return;
1322
+ }
1323
+
1324
+ response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
1325
+
1326
+ response.header.set_checksum_body(response.body());
1327
+ response.header.set_checksum();
1328
+
1329
+ self.send_message_to_replica(message.header.replica, response);
1330
+ }
1331
+
1332
+ fn on_nack_prepare(self: *Self, message: *Message) void {
1333
+ if (self.ignore_repair_message(message)) return;
1334
+
1335
+ assert(self.status == .view_change);
1336
+ assert(message.header.view == self.view);
1337
+ assert(message.header.replica != self.replica);
1338
+ assert(self.leader_index(self.view) == self.replica);
1339
+ assert(self.do_view_change_quorum);
1340
+ assert(self.repairs_allowed());
1341
+
1342
+ if (self.nack_prepare_op == null) {
1343
+ log.debug("{}: on_nack_prepare: ignoring (no longer expected)", .{self.replica});
1344
+ return;
1345
+ }
1346
+
1347
+ const op = self.nack_prepare_op.?;
1348
+ const checksum = self.journal.entry_for_op_exact(op).?.checksum;
1349
+
1350
+ if (message.header.op != op) {
1351
+ log.debug("{}: on_nack_prepare: ignoring (repairing another op)", .{self.replica});
1352
+ return;
1353
+ }
1354
+
1355
+ if (message.header.context != checksum) {
1356
+ log.debug("{}: on_nack_prepare: ignoring (repairing another checksum)", .{
1357
+ self.replica,
1358
+ });
1359
+ return;
1360
+ }
1361
+
1362
+ // Followers may not send a `nack_prepare` for a different checksum:
1363
+ // However our op may change in between sending the request and getting the nack.
1364
+ assert(message.header.op == op);
1365
+ assert(message.header.context == checksum);
1366
+
1367
+ // Here are what our nack quorums look like, if we know our op is faulty:
1368
+ // These are for various replication quorums under Flexible Paxos.
1369
+ // We need to have enough nacks to guarantee that `quorum_replication` was not reached,
1370
+ // because if the replication quorum was reached, then it may have been committed.
1371
+ // We add `1` in each case because our op is faulty and may have been counted.
1372
+ //
1373
+ // replica_count=2 - quorum_replication=2 + 1 = 0 + 1 = 1 nacks required
1374
+ // replica_count=3 - quorum_replication=2 + 1 = 1 + 1 = 2 nacks required
1375
+ // replica_count=4 - quorum_replication=2 + 1 = 2 + 1 = 3 nacks required
1376
+ // replica_count=4 - quorum_replication=3 + 1 = 1 + 1 = 2 nacks required
1377
+ // replica_count=5 - quorum_replication=2 + 1 = 3 + 1 = 4 nacks required
1378
+ // replica_count=5 - quorum_replication=3 + 1 = 2 + 1 = 3 nacks required
1379
+ //
1380
+ // Otherwise, if we know we do not have the op, then we can exclude ourselves.
1381
+ assert(self.replica_count > 1);
1382
+
1383
+ const threshold = if (self.journal.faulty.bit(op))
1384
+ self.replica_count - self.quorum_replication + 1
1385
+ else
1386
+ self.replica_count - self.quorum_replication;
1387
+
1388
+ if (threshold == 0) {
1389
+ assert(self.replica_count == 2);
1390
+ assert(!self.journal.faulty.bit(op));
1391
+
1392
+ // This is a special case for a cluster-of-two, handled in `repair_prepare()`.
1393
+ log.debug("{}: on_nack_prepare: ignoring (cluster-of-two, not faulty)", .{
1394
+ self.replica,
1395
+ });
1396
+ return;
1397
+ }
1398
+
1399
+ log.debug("{}: on_nack_prepare: quorum_replication={} threshold={}", .{
1400
+ self.replica,
1401
+ self.quorum_replication,
1402
+ threshold,
1403
+ });
1404
+
1405
+ // We should never expect to receive a nack from ourselves:
1406
+ // Detect if we ever set `threshold` to `quorum_view_change` for a cluster-of-two again.
1407
+ assert(threshold < self.replica_count);
1408
+
1409
+ // Wait until we have `threshold` messages for quorum:
1410
+ const count = self.add_message_and_receive_quorum_exactly_once(
1411
+ &self.nack_prepare_from_other_replicas,
1412
+ message,
1413
+ threshold,
1414
+ ) orelse return;
1415
+
1416
+ assert(count == threshold);
1417
+ assert(self.nack_prepare_from_other_replicas[self.replica] == null);
1418
+ log.debug("{}: on_nack_prepare: quorum received", .{self.replica});
1419
+
1420
+ self.discard_uncommitted_ops_from(op, checksum);
1421
+ self.reset_quorum_nack_prepare();
1422
+ self.repair();
1423
+ }
1424
+
1425
+ fn on_headers(self: *Self, message: *const Message) void {
1426
+ if (self.ignore_repair_message(message)) return;
1427
+
1428
+ assert(self.status == .normal or self.status == .view_change);
1429
+ assert(message.header.view == self.view);
1430
+ assert(message.header.replica != self.replica);
1431
+
1432
+ // We expect at least one header in the body, or otherwise no response to our request.
1433
+ assert(message.header.size > @sizeOf(Header));
1434
+
1435
+ var op_min: ?u64 = null;
1436
+ var op_max: ?u64 = null;
1437
+ for (self.message_body_as_headers(message)) |*h| {
1438
+ if (op_min == null or h.op < op_min.?) op_min = h.op;
1439
+ if (op_max == null or h.op > op_max.?) op_max = h.op;
1440
+ _ = self.repair_header(h);
1441
+ }
1442
+ assert(op_max.? >= op_min.?);
1443
+
1444
+ self.repair();
1445
+ }
1446
+
1447
+ fn on_ping_timeout(self: *Self) void {
1448
+ self.ping_timeout.reset();
1449
+
1450
+ // TODO We may want to ping for connectivity during a view change.
1451
+ assert(self.status == .normal);
1452
+ assert(self.leader() or self.follower());
1453
+
1454
+ var ping = Header{
1455
+ .command = .ping,
1456
+ .cluster = self.cluster,
1457
+ .replica = self.replica,
1458
+ .view = self.view,
1459
+ .op = self.clock.monotonic(),
1460
+ };
1461
+
1462
+ self.send_header_to_other_replicas(ping);
1463
+ }
1464
+
1465
+ fn on_prepare_timeout(self: *Self) void {
1466
+ // We will decide below whether to reset or backoff the timeout.
1467
+ assert(self.status == .normal);
1468
+ assert(self.leader());
1469
+
1470
+ const prepare = self.pipeline.head_ptr().?;
1471
+ assert(prepare.message.header.command == .prepare);
1472
+
1473
+ if (prepare.ok_quorum_received) {
1474
+ self.prepare_timeout.reset();
1475
+
1476
+ // We were unable to commit at the time because we were waiting for a message.
1477
+ log.debug("{}: on_prepare_timeout: quorum already received, retrying commit", .{
1478
+ self.replica,
1479
+ });
1480
+ self.commit_pipeline();
1481
+ return;
1482
+ }
1483
+
1484
+ // The list of remote replicas yet to send a prepare_ok:
1485
+ var waiting: [config.replicas_max]u8 = undefined;
1486
+ var waiting_len: usize = 0;
1487
+ for (prepare.ok_from_all_replicas[0..self.replica_count]) |received, replica| {
1488
+ if (received == null and replica != self.replica) {
1489
+ waiting[waiting_len] = @intCast(u8, replica);
1490
+ waiting_len += 1;
1491
+ }
1492
+ }
1493
+
1494
+ if (waiting_len == 0) {
1495
+ self.prepare_timeout.reset();
1496
+
1497
+ log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
1498
+ assert(prepare.ok_from_all_replicas[self.replica] == null);
1499
+
1500
+ // We may be slow and waiting for the write to complete.
1501
+ //
1502
+ // We may even have maxed out our IO depth and been unable to initiate the write,
1503
+ // which can happen if `config.pipelining_max` exceeds `config.io_depth_write`.
1504
+ // This can lead to deadlock for a cluster of one or two (if we do not retry here),
1505
+ // since there is no other way for the leader to repair the dirty op because no
1506
+ // other replica has it.
1507
+ //
1508
+ // Retry the write through `on_repair()` which will work out which is which.
1509
+ // We do expect that the op would have been run through `on_prepare()` already.
1510
+ assert(prepare.message.header.op <= self.op);
1511
+ self.on_repair(prepare.message);
1512
+
1513
+ return;
1514
+ }
1515
+
1516
+ self.prepare_timeout.backoff(&self.prng);
1517
+
1518
+ assert(waiting_len <= self.replica_count);
1519
+ for (waiting[0..waiting_len]) |replica| {
1520
+ assert(replica < self.replica_count);
1521
+
1522
+ log.debug("{}: on_prepare_timeout: waiting for replica {}", .{
1523
+ self.replica,
1524
+ replica,
1525
+ });
1526
+ }
1527
+
1528
+ // Cycle through the list to reach live replicas and get around partitions:
1529
+ // We do not assert `prepare_timeout.attempts > 0` since the counter may wrap back to 0.
1530
+ const replica = waiting[self.prepare_timeout.attempts % waiting_len];
1531
+ assert(replica != self.replica);
1532
+
1533
+ log.debug("{}: on_prepare_timeout: replicating to replica {}", .{ self.replica, replica });
1534
+ self.send_message_to_replica(replica, prepare.message);
1535
+ }
1536
+
1537
+ fn on_commit_timeout(self: *Self) void {
1538
+ self.commit_timeout.reset();
1539
+
1540
+ assert(self.status == .normal);
1541
+ assert(self.leader());
1542
+ assert(self.commit_min == self.commit_max);
1543
+
1544
+ // TODO Snapshots: Use snapshot checksum if commit is no longer in journal.
1545
+ const latest_committed_entry = self.journal.entry_for_op_exact(self.commit_max).?;
1546
+
1547
+ self.send_header_to_other_replicas(.{
1548
+ .command = .commit,
1549
+ .context = latest_committed_entry.checksum,
1550
+ .cluster = self.cluster,
1551
+ .replica = self.replica,
1552
+ .view = self.view,
1553
+ .commit = self.commit_max,
1554
+ });
1555
+ }
1556
+
1557
    /// Fired on a follower in normal status when this timeout expires — presumably when
    /// the leader has not been heard from in time (confirm where this timeout is reset).
    /// Starts a view change by moving to the next view.
    fn on_normal_status_timeout(self: *Self) void {
        assert(self.status == .normal);
        assert(self.follower());
        self.transition_to_view_change_status(self.view + 1);
    }
1562
+
1563
    /// Fired when a view change fails to complete in time: move on to the next view.
    fn on_view_change_status_timeout(self: *Self) void {
        assert(self.status == .view_change);
        self.transition_to_view_change_status(self.view + 1);
    }
1567
+
1568
+ fn on_view_change_message_timeout(self: *Self) void {
1569
+ self.view_change_message_timeout.reset();
1570
+ assert(self.status == .view_change);
1571
+
1572
+ // Keep sending `start_view_change` messages:
1573
+ // We may have a `start_view_change_quorum` but other replicas may not.
1574
+ // However, the leader may stop sending once it has a `do_view_change_quorum`.
1575
+ if (!self.do_view_change_quorum) self.send_start_view_change();
1576
+
1577
+ // It is critical that a `do_view_change` message implies a `start_view_change_quorum`:
1578
+ if (self.start_view_change_quorum) {
1579
+ // The leader need not retry to send a `do_view_change` message to itself:
1580
+ // We assume the MessageBus will not drop messages sent by a replica to itself.
1581
+ if (self.leader_index(self.view) != self.replica) self.send_do_view_change();
1582
+ }
1583
+ }
1584
+
1585
    /// Periodic timeout driving the repair protocol while in normal or view-change status.
    fn on_repair_timeout(self: *Self) void {
        assert(self.status == .normal or self.status == .view_change);
        self.repair();
    }
1589
+
1590
    /// Records `message` in the per-replica quorum set `messages`, deduplicating by the
    /// sender's replica index, and returns the unique-message count EXACTLY ONCE: on the
    /// call where the count first reaches `threshold`. Returns null for duplicates, while
    /// still below threshold, and after quorum has already been reached — so the caller's
    /// state transition can run at most once per quorum.
    /// Takes a reference to `message` when it is recorded; the caller releases on teardown.
    fn add_message_and_receive_quorum_exactly_once(
        self: *Self,
        messages: *QuorumMessages,
        message: *Message,
        threshold: u32,
    ) ?usize {
        assert(threshold >= 1);
        assert(threshold <= self.replica_count);

        assert(messages.len == config.replicas_max);
        assert(message.header.cluster == self.cluster);
        assert(message.header.replica < self.replica_count);
        assert(message.header.view == self.view);
        // Per-command sanity checks on the threshold and our own status:
        switch (message.header.command) {
            .prepare_ok => {
                // A cluster of one or two requires all replicas to acknowledge a prepare:
                if (self.replica_count <= 2) assert(threshold == self.replica_count);

                assert(self.status == .normal);
                assert(self.leader());
            },
            .start_view_change => {
                assert(self.replica_count > 1);
                if (self.replica_count == 2) assert(threshold == 1);

                assert(self.status == .view_change);
            },
            .do_view_change => {
                assert(self.replica_count > 1);
                if (self.replica_count == 2) assert(threshold == 2);

                assert(self.status == .view_change);
                assert(self.leader_index(self.view) == self.replica);
            },
            .nack_prepare => {
                assert(self.replica_count > 1);
                if (self.replica_count == 2) assert(threshold >= 1);

                assert(self.status == .view_change);
                assert(self.leader_index(self.view) == self.replica);
            },
            else => unreachable,
        }

        const command: []const u8 = @tagName(message.header.command);

        // Do not allow duplicate messages to trigger multiple passes through a state transition:
        if (messages[message.header.replica]) |m| {
            // Assert that this is a duplicate message and not a different message:
            assert(m.header.command == message.header.command);
            assert(m.header.replica == message.header.replica);
            assert(m.header.view == message.header.view);
            assert(m.header.op == message.header.op);
            assert(m.header.commit == message.header.commit);
            assert(m.header.checksum_body == message.header.checksum_body);
            assert(m.header.checksum == message.header.checksum);
            log.debug("{}: on_{s}: ignoring (duplicate message)", .{ self.replica, command });
            return null;
        }

        // Record the first receipt of this message:
        assert(messages[message.header.replica] == null);
        messages[message.header.replica] = message.ref();

        // Count the number of unique messages now received:
        const count = self.count_quorum(messages, message.header.command, message.header.context);
        log.debug("{}: on_{s}: {} message(s)", .{ self.replica, command, count });

        // Wait until we have exactly `threshold` messages for quorum:
        if (count < threshold) {
            log.debug("{}: on_{s}: waiting for quorum", .{ self.replica, command });
            return null;
        }

        // This is not the first time we have had quorum, the state transition has already happened:
        if (count > threshold) {
            log.debug("{}: on_{s}: ignoring (quorum received already)", .{ self.replica, command });
            return null;
        }

        assert(count == threshold);
        return count;
    }
1672
+
1673
    /// Appends the prepare for the current op (`self.op`) to the journal.
    /// The write completes asynchronously via `write_prepare()` with the `.append` trigger.
    fn append(self: *Self, message: *Message) void {
        assert(self.status == .normal);
        assert(message.header.command == .prepare);
        assert(message.header.view == self.view);
        assert(message.header.op == self.op);

        log.debug("{}: append: appending to journal", .{self.replica});
        self.write_prepare(message, .append);
    }
1682
+
1683
+ /// Returns whether `b` succeeds `a` by having a newer view or same view and newer op.
1684
+ fn ascending_viewstamps(
1685
+ self: *Self,
1686
+ a: *const Header,
1687
+ b: *const Header,
1688
+ ) bool {
1689
+ assert(a.command == .prepare);
1690
+ assert(b.command == .prepare);
1691
+
1692
+ if (a.view < b.view) {
1693
+ // We do not assert b.op >= a.op, ops may be reordered during a view change.
1694
+ return true;
1695
+ } else if (a.view > b.view) {
1696
+ // We do not assert b.op <= a.op, ops may be reordered during a view change.
1697
+ return false;
1698
+ } else if (a.op < b.op) {
1699
+ assert(a.view == b.view);
1700
+ return true;
1701
+ } else if (a.op > b.op) {
1702
+ assert(a.view == b.view);
1703
+ return false;
1704
+ } else {
1705
+ unreachable;
1706
+ }
1707
+ }
1708
+
1709
+ /// Choose a different replica each time if possible (excluding ourself).
1710
+ fn choose_any_other_replica(self: *Self) ?u8 {
1711
+ if (self.replica_count == 1) return null;
1712
+
1713
+ var count: usize = 0;
1714
+ while (count < self.replica_count) : (count += 1) {
1715
+ self.choose_any_other_replica_ticks += 1;
1716
+ const replica = @mod(
1717
+ self.replica + self.choose_any_other_replica_ticks,
1718
+ self.replica_count,
1719
+ );
1720
+ if (replica == self.replica) continue;
1721
+ return @intCast(u8, replica);
1722
+ }
1723
+ unreachable;
1724
+ }
1725
+
1726
    /// Commit ops up to commit number `commit` (inclusive).
    /// A function which calls `commit_ops()` to set `commit_max` must first call `view_jump()`.
    /// Otherwise, we may fork the log.
    fn commit_ops(self: *Self, commit: u64) void {
        // TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
        // Be careful of concurrency when doing this, as successive view changes can happen quickly.
        assert(self.status == .normal or self.status == .view_change);
        assert(self.commit_min <= self.commit_max);
        assert(self.commit_min <= self.op);
        // NOTE(review): the next two asserts are tautologies (always true). Presumably they
        // document that `commit_max` and `commit` MAY exceed `self.op` — confirm the intent.
        assert(self.commit_max <= self.op or self.commit_max > self.op);
        assert(commit <= self.op or commit > self.op);

        // We have already committed this far:
        if (commit <= self.commit_min) return;

        // We must update `commit_max` even if we are already committing, otherwise we will lose
        // information that we should know, and `set_latest_op_and_k()` will catch us out:
        if (commit > self.commit_max) {
            log.debug("{}: commit_ops: advancing commit_max={}..{}", .{
                self.replica,
                self.commit_max,
                commit,
            });
            self.commit_max = commit;
        }

        // Guard against multiple concurrent invocations of commit_ops():
        if (self.committing) {
            log.debug("{}: commit_ops: already committing...", .{self.replica});
            return;
        }

        // We check the hash chain before we read each op, rather than once upfront, because
        // it's possible for `commit_max` to change while we read asynchronously, after we
        // validate the hash chain.
        //
        // We therefore cannot keep committing until we reach `commit_max`. We need to verify
        // the hash chain before each read. Once verified (before the read) we can commit in the
        // callback after the read, but if we see a change we need to stop committing any
        // further ops, because `commit_max` may have been bumped and may refer to a different
        // op.

        assert(!self.committing);
        self.committing = true;

        self.commit_ops_read();
    }
1773
+
1774
    /// Verifies the hash chain and then reads the next op to commit (`commit_min + 1`)
    /// from the journal. The read completes asynchronously in `commit_ops_commit()`.
    /// Requires `self.committing` to already be set (by `commit_ops()` or the callback),
    /// and clears it on every path that does not issue a read.
    fn commit_ops_read(self: *Self) void {
        assert(self.committing);
        assert(self.status == .normal or self.status == .view_change);
        assert(self.commit_min <= self.commit_max);
        assert(self.commit_min <= self.op);

        if (!self.valid_hash_chain("commit_ops_read")) {
            // We cannot safely commit without a verified hash chain; stop committing for now.
            self.committing = false;
            return;
        }
        assert(self.op >= self.commit_max);

        // We may receive commit numbers for ops we do not yet have (`commit_max > self.op`):
        // Even a naive state transfer may fail to correct for this.
        if (self.commit_min < self.commit_max and self.commit_min < self.op) {
            const op = self.commit_min + 1;
            const checksum = self.journal.entry_for_op_exact(op).?.checksum;
            self.journal.read_prepare(commit_ops_commit, op, checksum, null);
        } else {
            self.committing = false;
            // This is an optimization to expedite the view change before the `repair_timeout`:
            if (self.status == .view_change and self.repairs_allowed()) self.repair();
        }
    }
1798
+
1799
    /// Journal read callback for `commit_ops_read()`: commits the op that was read,
    /// provided the replica's state has not changed underneath the asynchronous read,
    /// and then loops back into `commit_ops_read()` for the next op.
    fn commit_ops_commit(self: *Self, prepare: ?*Message, destination_replica: ?u8) void {
        assert(destination_replica == null);

        assert(self.committing);
        // Clear the guard up front: every early return below must leave us not committing.
        self.committing = false;

        if (prepare == null) {
            log.debug("{}: commit_ops_commit: prepare == null", .{self.replica});
            return;
        }

        if (self.status == .view_change) {
            if (self.leader_index(self.view) != self.replica) {
                log.debug("{}: commit_ops_commit: no longer leader", .{self.replica});
                return;
            }

            // Only the leader may commit during a view change before starting the new view.
            // Fall through if this is indeed the case.
        } else if (self.status != .normal) {
            log.debug("{}: commit_ops_commit: no longer in normal status", .{self.replica});
            return;
        }

        const op = self.commit_min + 1;

        // The state may have advanced while the read was in flight — recheck op and checksum:
        if (prepare.?.header.op != op) {
            log.debug("{}: commit_ops_commit: op changed", .{self.replica});
            return;
        }

        if (prepare.?.header.checksum != self.journal.entry_for_op_exact(op).?.checksum) {
            log.debug("{}: commit_ops_commit: checksum changed", .{self.replica});
            return;
        }

        // TODO We can optimize this to commit into the client table reply if it exists.
        const reply = self.message_bus.get_message() orelse {
            log.alert("{}: commit_ops_commit: waiting for message", .{self.replica});
            return;
        };
        defer self.message_bus.unref(reply);

        self.commit_op(prepare.?, reply);

        assert(self.commit_min == op);
        assert(self.commit_min <= self.commit_max);
        assert(self.commit_min <= self.op);

        // Continue with the next op; the next read again completes asynchronously:
        self.committing = true;
        self.commit_ops_read();
    }
1851
+
1852
    /// Executes a single prepare against the state machine, advances `commit_min`,
    /// records the reply in the client table, and (if leader) replies to the client.
    /// `reply` is a caller-owned scratch message into which the reply is built.
    fn commit_op(self: *Self, prepare: *const Message, reply: *Message) void {
        // TODO Can we add more checks around allowing commit_op() during a view change?
        assert(self.status == .normal or self.status == .view_change);
        assert(prepare.header.command == .prepare);
        assert(prepare.header.operation != .init);
        assert(prepare.header.op == self.commit_min + 1);
        assert(prepare.header.op <= self.op);

        // If we are a follower committing through `commit_ops()` then a view change may have
        // happened since we last checked in `commit_ops_read()`. However, this would relate to
        // subsequent ops, since by now we have already verified the hash chain for this commit.

        // The prepare must chain directly off the previously committed entry:
        assert(self.journal.entry_for_op_exact(self.commit_min).?.checksum ==
            prepare.header.parent);

        log.debug("{}: commit_op: executing view={} {} op={} checksum={} ({s})", .{
            self.replica,
            self.view,
            self.leader_index(self.view) == self.replica,
            prepare.header.op,
            prepare.header.checksum,
            @tagName(prepare.header.operation.cast(StateMachine)),
        });

        // Execute the operation; the state machine writes the reply body in place:
        const reply_body_size = @intCast(u32, self.state_machine.commit(
            prepare.header.client,
            prepare.header.operation.cast(StateMachine),
            prepare.buffer[@sizeOf(Header)..prepare.header.size],
            reply.buffer[@sizeOf(Header)..],
        ));

        self.commit_min += 1;
        assert(self.commit_min == prepare.header.op);
        if (self.commit_min > self.commit_max) self.commit_max = self.commit_min;

        // Invoke the state-change hook, if one is registered:
        if (self.on_change_state) |hook| hook(self);

        reply.header.* = .{
            .command = .reply,
            .operation = prepare.header.operation,
            .parent = prepare.header.context, // The prepare's context has `request.checksum`.
            .client = prepare.header.client,
            .request = prepare.header.request,
            .cluster = prepare.header.cluster,
            .replica = prepare.header.replica,
            .view = prepare.header.view,
            .op = prepare.header.op,
            .commit = prepare.header.op,
            .size = @sizeOf(Header) + reply_body_size,
        };
        assert(reply.header.offset == 0);
        assert(reply.header.epoch == 0);

        reply.header.set_checksum_body(reply.buffer[@sizeOf(Header)..reply.header.size]);
        reply.header.set_checksum();

        // A `.register` operation creates the client's session; anything else updates it:
        if (reply.header.operation == .register) {
            self.create_client_table_entry(reply);
        } else {
            self.update_client_table_entry(reply);
        }

        // Only the leader replies to the client:
        if (self.leader_index(self.view) == self.replica) {
            log.debug("{}: commit_op: replying to client: {}", .{ self.replica, reply.header });
            self.message_bus.send_message_to_client(reply.header.client, reply);
        }
    }
1919
+
1920
    /// Commits, frees and pops as many prepares at the head of the pipeline as have quorum.
    /// Can be called only when the pipeline has at least one prepare.
    /// Stops the prepare timeout and resets the timeouts counter if the pipeline becomes empty.
    fn commit_pipeline(self: *Self) void {
        assert(self.status == .normal);
        assert(self.leader());
        assert(self.pipeline.count > 0);

        while (self.pipeline.head_ptr()) |prepare| {
            assert(self.pipeline.count > 0);
            assert(self.commit_min == self.commit_max);
            assert(self.commit_max + self.pipeline.count == self.op);
            assert(self.commit_max + 1 == prepare.message.header.op);

            if (!prepare.ok_quorum_received) {
                // Eventually handled by on_prepare_timeout().
                log.debug("{}: commit_pipeline: waiting for quorum", .{self.replica});
                return;
            }

            // Cross-check that the quorum flag is consistent with the recorded ok messages:
            const count = self.count_quorum(
                &prepare.ok_from_all_replicas,
                .prepare_ok,
                prepare.message.header.checksum,
            );
            assert(count >= self.quorum_replication);

            // TODO We can optimize this to commit into the client table reply if it exists.
            const reply = self.message_bus.get_message() orelse {
                // Eventually handled by on_prepare_timeout().
                log.alert("{}: commit_pipeline: waiting for message", .{self.replica});
                return;
            };
            defer self.message_bus.unref(reply);

            self.commit_op(prepare.message, reply);

            assert(self.commit_min == self.commit_max);
            assert(self.commit_max == prepare.message.header.op);

            // Release our references before popping the head entry:
            self.unref_prepare_message_and_quorum_messages(prepare);
            assert(self.pipeline.pop() != null);
        }

        assert(self.prepare_timeout.ticking);

        if (self.pipeline.count == 0) self.prepare_timeout.stop();
    }
1968
+
1969
    /// Counts the unique messages recorded in `messages` that match `command` and
    /// `context`, asserting the protocol invariants of each recorded message.
    /// Returns a count in the range [0, replica_count].
    fn count_quorum(
        self: *Self,
        messages: *QuorumMessages,
        command: Command,
        context: u128,
    ) usize {
        assert(messages.len == config.replicas_max);
        var count: usize = 0;
        for (messages) |received, replica| {
            if (received) |m| {
                assert(replica < self.replica_count);
                assert(m.header.cluster == self.cluster);
                assert(m.header.command == command);
                assert(m.header.context == context);
                assert(m.header.replica == replica);
                switch (command) {
                    .prepare_ok => {
                        if (self.status == .normal) {
                            assert(self.leader());
                            assert(m.header.view == self.view);
                        } else {
                            // prepare_ok messages may be carried over from an older view:
                            assert(self.status == .view_change);
                            assert(m.header.view < self.view);
                        }
                    },
                    .start_view_change => {
                        assert(m.header.replica != self.replica);
                        assert(m.header.view == self.view);
                    },
                    .do_view_change => assert(m.header.view == self.view),
                    .nack_prepare => {
                        // TODO See if we can restrict this branch further.
                        assert(m.header.replica != self.replica);
                        assert(m.header.op == self.nack_prepare_op.?);
                    },
                    else => unreachable,
                }
                count += 1;
            }
        }
        assert(count <= self.replica_count);
        return count;
    }
2012
+
2013
    /// Creates an entry in the client table when registering a new client session.
    /// Asserts that the new session does not yet exist.
    /// Evicts another entry deterministically, if necessary, to make space for the insert.
    /// Takes a reference to `reply`, which is retained in the table until eviction.
    fn create_client_table_entry(self: *Self, reply: *Message) void {
        assert(reply.header.command == .reply);
        assert(reply.header.operation == .register);
        assert(reply.header.client > 0);
        assert(reply.header.context == 0);
        assert(reply.header.op == reply.header.commit);
        assert(reply.header.size == @sizeOf(Header));

        const session = reply.header.commit; // The commit number becomes the session number.
        const request = reply.header.request;

        assert(session > 0); // We reserved the `0` commit number for the cluster `.init` operation.
        assert(request == 0);

        // For correctness, it's critical that all replicas evict deterministically:
        // We cannot depend on `HashMap.capacity()` since `HashMap.ensureCapacity()` may change
        // across different versions of the Zig std lib. We therefore rely on `config.clients_max`,
        // which must be the same across all replicas, and must not change after initing a cluster.
        // We also do not depend on `HashMap.valueIterator()` being deterministic here. However, we
        // do require that all entries have different commit numbers and are at least iterated.
        // This ensures that we will always pick the entry with the oldest commit number.
        // We also double-check that a client has only one entry in the hash map (or it's buggy).
        const clients = self.client_table.count();
        assert(clients <= config.clients_max);
        if (clients == config.clients_max) {
            // Find the entry with the lowest commit number (the least recently active client):
            var evictee: ?*Message = null;
            var iterated: usize = 0;
            var iterator = self.client_table.valueIterator();
            while (iterator.next()) |entry| : (iterated += 1) {
                assert(entry.reply.header.command == .reply);
                assert(entry.reply.header.context == 0);
                assert(entry.reply.header.op == entry.reply.header.commit);
                assert(entry.reply.header.commit >= entry.session);

                if (evictee) |evictee_reply| {
                    assert(entry.reply.header.client != evictee_reply.header.client);
                    assert(entry.reply.header.commit != evictee_reply.header.commit);

                    if (entry.reply.header.commit < evictee_reply.header.commit) {
                        evictee = entry.reply;
                    }
                } else {
                    evictee = entry.reply;
                }
            }
            assert(iterated == clients);
            log.alert("{}: create_client_table_entry: clients={}/{} evicting client={}", .{
                self.replica,
                clients,
                config.clients_max,
                evictee.?.header.client,
            });
            assert(self.client_table.remove(evictee.?.header.client));
            assert(!self.client_table.contains(evictee.?.header.client));
            // Release the table's reference to the evicted reply:
            self.message_bus.unref(evictee.?);
        }

        log.debug("{}: create_client_table_entry: client={} session={} request={}", .{
            self.replica,
            reply.header.client,
            session,
            request,
        });

        // Any duplicate .register requests should have received the same session number if the
        // client table entry already existed, or been dropped if a session was being committed:
        self.client_table.putAssumeCapacityNoClobber(reply.header.client, .{
            .session = session,
            .reply = reply.ref(),
        });
        assert(self.client_table.count() <= config.clients_max);
    }
2088
+
2089
    /// The caller owns the returned message, if any, which has exactly 1 reference.
    /// Builds a `do_view_change` or `start_view` message whose body carries the latest
    /// journal headers. Returns null if no message buffer is currently available.
    fn create_view_change_message(self: *Self, command: Command) ?*Message {
        assert(command == .do_view_change or command == .start_view);

        // We may send a start_view message in normal status to resolve a follower's view jump:
        assert(self.status == .normal or self.status == .view_change);

        const message = self.message_bus.get_message() orelse return null;
        defer self.message_bus.unref(message);

        message.header.* = .{
            .command = command,
            .cluster = self.cluster,
            .replica = self.replica,
            .view = self.view,
            // The latest normal view (as specified in the 2012 paper) is different to the view
            // number contained in the prepare headers we include in the body. The former shows
            // how recent a view change the replica participated in, which may be much higher.
            // We use the `offset` field to send this in addition to the current view number:
            .offset = if (command == .do_view_change) self.view_normal else 0,
            .op = self.op,
            .commit = self.commit_max,
        };

        // CRITICAL: The number of prepare headers to include in the body:
        // We must provide enough headers to cover all uncommitted headers so that the new
        // leader (if we are in a view change) can decide whether to discard uncommitted headers
        // that cannot be repaired because they are gaps, and this must be relative to the
        // cluster as a whole (not relative to the difference between our op and commit number)
        // as otherwise we would break correctness.
        const count_max = config.pipelining_max;
        assert(count_max > 0);

        // Clamp the header budget to `1 + count_max`, with a floor of 2 headers — presumably
        // guarding against a buffer too small to hold even two headers; confirm intent.
        const size_max = @sizeOf(Header) * std.math.min(
            std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
            1 + count_max,
        );
        assert(size_max > @sizeOf(Header));

        const count = self.journal.copy_latest_headers_between(
            0,
            self.op,
            std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..size_max]),
        );

        // We expect that self.op always exists.
        assert(count > 0);

        message.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);

        message.header.set_checksum_body(message.body());
        message.header.set_checksum();

        return message.ref();
    }
2144
+
2145
    /// The caller owns the returned message, if any, which has exactly 1 reference.
    /// Wraps a header-only `Header` (no body) in a pooled message, computing checksums.
    /// Returns null if no header-only message buffer is currently available.
    fn create_message_from_header(self: *Self, header: Header) ?*Message {
        assert(header.replica == self.replica);
        assert(header.view == self.view or header.command == .request_start_view);
        assert(header.size == @sizeOf(Header));

        const message = self.message_bus.pool.get_header_only_message() orelse return null;
        defer self.message_bus.unref(message);

        message.header.* = header;
        message.header.set_checksum_body(message.body());
        message.header.set_checksum();

        // ref() balances the deferred unref so the caller receives exactly one reference:
        return message.ref();
    }
2160
+
2161
    /// Discards uncommitted headers during a view change before the new leader starts the view.
    /// This is required to maximize availability in the presence of storage faults.
    /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
    ///
    /// It's possible for the new leader to have done an op jump in a previous view, and so
    /// introduced a header gap for an op, which was then discarded by another leader during a
    /// newer view change, before surviving into this view as a gap because our latest op was
    /// set as the latest op for the quorum.
    ///
    /// In this case, it may be impossible for the new leader to repair the missing header since
    /// the rest of the cluster may have already discarded it. We therefore iterate over our
    /// uncommitted header gaps and compare them with the quorum of do_view_change messages
    /// received from other replicas, before starting the new view, to discard any that may be
    /// impossible to repair.
    fn discard_uncommitted_headers(self: *Self) void {
        assert(self.status == .view_change);
        assert(self.leader_index(self.view) == self.replica);
        assert(self.do_view_change_quorum);
        assert(!self.repair_timeout.ticking);
        assert(self.op >= self.commit_max);
        assert(self.replica_count > 1);

        // The number of remote nacks needed before a gap op may be safely discarded:
        const threshold = self.replica_count - self.quorum_replication;
        if (threshold == 0) {
            // With full replication quorum there is nothing we may safely discard:
            assert(self.replica_count == 2);
            return;
        }

        // Scan backwards over uncommitted ops, looking for header gaps:
        var op = self.op;
        while (op > self.commit_max) : (op -= 1) {
            if (self.journal.entry_for_op_exact(op) != null) continue;

            log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });

            // A replica whose latest op is below this gap implicitly nacks it:
            var nacks: usize = 0;
            for (self.do_view_change_from_all_replicas) |received, replica| {
                if (received) |m| {
                    assert(m.header.command == .do_view_change);
                    assert(m.header.cluster == self.cluster);
                    assert(m.header.replica == replica);
                    assert(m.header.view == self.view);

                    if (replica != self.replica) {
                        if (m.header.op < op) nacks += 1;

                        log.debug("{}: discard_uncommitted_headers: replica={} op={}", .{
                            self.replica,
                            m.header.replica,
                            m.header.op,
                        });
                    }
                }
            }

            log.debug("{}: discard_uncommitted_headers: op={} nacks={} threshold={}", .{
                self.replica,
                op,
                nacks,
                threshold,
            });

            if (nacks >= threshold) {
                // Enough of the cluster has moved on: discard from this op upwards.
                self.journal.remove_entries_from(op);
                self.op = op - 1;

                assert(self.journal.entry_for_op(op) == null);
                assert(!self.journal.dirty.bit(op));
                assert(!self.journal.faulty.bit(op));
            }
        }
    }
2232
+
2233
    /// Discards uncommitted ops during a view change from after and including `op`.
    /// This is required to maximize availability in the presence of storage faults.
    /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
    /// `checksum` must identify the journal entry at `op`, which must currently be dirty.
    fn discard_uncommitted_ops_from(self: *Self, op: u64, checksum: u128) void {
        assert(self.status == .view_change);
        assert(self.leader_index(self.view) == self.replica);
        assert(self.repairs_allowed());

        assert(self.valid_hash_chain("discard_uncommitted_ops_from"));

        assert(op > self.commit_max);
        assert(op <= self.op);
        assert(self.journal.entry_for_op_exact_with_checksum(op, checksum) != null);
        assert(self.journal.dirty.bit(op));

        log.debug("{}: discard_uncommitted_ops_from: ops={}..{} view={}", .{
            self.replica,
            op,
            self.op,
            self.view,
        });

        self.journal.remove_entries_from(op);
        self.op = op - 1;

        assert(self.journal.entry_for_op(op) == null);
        assert(!self.journal.dirty.bit(op));
        assert(!self.journal.faulty.bit(op));

        // We require that `self.op` always exists. Rewinding `self.op` could change that.
        // However, we do this only as the leader within a view change, with all headers intact.
        assert(self.journal.entry_for_op_exact(self.op) != null);
    }
2266
+
2267
    /// Returns whether the replica is a follower for the current view.
    /// This may be used only when the replica status is normal.
    fn follower(self: *Self) bool {
        return !self.leader();
    }
2272
+
2273
    /// Delivers at most one self-addressed message queued by send_message_to_replica(),
    /// outside of on_message() to avoid recursion, and asserts that delivering it does not
    /// enqueue another.
    fn flush_loopback_queue(self: *Self) void {
        // There are three cases where a replica will send a message to itself:
        // However, of these three cases, only two cases will call send_message_to_replica().
        //
        // 1. In on_request(), the leader sends a synchronous prepare to itself, but this is
        //    done by calling on_prepare() directly, and subsequent prepare timeout retries will
        //    never resend to self.
        // 2. In on_prepare(), after writing to storage, the leader sends a (typically)
        //    asynchronous prepare_ok to itself.
        // 3. In on_start_view_change(), after receiving a quorum of start_view_change
        //    messages, the new leader sends a synchronous do_view_change to itself.
        if (self.loopback_queue) |message| {
            defer self.message_bus.unref(message);

            // The queue holds at most one message (no chain):
            assert(message.next == null);
            self.loopback_queue = null;
            assert(message.header.replica == self.replica);
            self.on_message(message);
            // We do not call flush_loopback_queue() within on_message() to avoid recursion.
        }
        // We expect that delivering a prepare_ok or do_view_change message to ourselves will
        // not result in any further messages being added synchronously to the loopback queue.
        assert(self.loopback_queue == null);
    }
2297
+
2298
+ fn ignore_prepare_ok(self: *Self, message: *const Message) bool {
2299
+ if (self.status != .normal) {
2300
+ log.debug("{}: on_prepare_ok: ignoring ({})", .{ self.replica, self.status });
2301
+ return true;
2302
+ }
2303
+
2304
+ if (message.header.view < self.view) {
2305
+ log.debug("{}: on_prepare_ok: ignoring (older view)", .{self.replica});
2306
+ return true;
2307
+ }
2308
+
2309
+ if (message.header.view > self.view) {
2310
+ // Another replica is treating us as the leader for a view we do not know about.
2311
+ // This may be caused by a fault in the network topology.
2312
+ log.warn("{}: on_prepare_ok: ignoring (newer view)", .{self.replica});
2313
+ return true;
2314
+ }
2315
+
2316
+ if (self.follower()) {
2317
+ // This may be caused by a fault in the network topology.
2318
+ log.warn("{}: on_prepare_ok: ignoring (follower)", .{self.replica});
2319
+ return true;
2320
+ }
2321
+
2322
+ return false;
2323
+ }
2324
+
2325
    /// Returns true when a repair-protocol message (request_start_view, request_headers,
    /// request_prepare, headers, nack_prepare) must be dropped: wrong status, wrong view,
    /// sent by ourselves, or addressed to the leader while we are a follower.
    fn ignore_repair_message(self: *Self, message: *const Message) bool {
        assert(message.header.command == .request_start_view or
            message.header.command == .request_headers or
            message.header.command == .request_prepare or
            message.header.command == .headers or
            message.header.command == .nack_prepare);

        const command: []const u8 = @tagName(message.header.command);

        if (self.status != .normal and self.status != .view_change) {
            log.debug("{}: on_{s}: ignoring ({})", .{ self.replica, command, self.status });
            return true;
        }

        if (message.header.view < self.view) {
            log.debug("{}: on_{s}: ignoring (older view)", .{ self.replica, command });
            return true;
        }

        if (message.header.view > self.view) {
            log.debug("{}: on_{s}: ignoring (newer view)", .{ self.replica, command });
            return true;
        }

        // Additional restrictions apply while a view change is in progress:
        if (self.ignore_repair_message_during_view_change(message)) return true;

        if (message.header.replica == self.replica) {
            log.warn("{}: on_{s}: ignoring (self)", .{ self.replica, command });
            return true;
        }

        if (self.leader_index(self.view) != self.replica) {
            switch (message.header.command) {
                // Only the leader may receive these messages:
                .request_start_view, .nack_prepare => {
                    log.warn("{}: on_{s}: ignoring (follower)", .{ self.replica, command });
                    return true;
                },
                // Only the leader may answer a request for a prepare without a context:
                .request_prepare => if (message.header.context == 0) {
                    log.warn("{}: on_{s}: ignoring (no context)", .{ self.replica, command });
                    return true;
                },
                else => {},
            }
        }

        if (message.header.command == .nack_prepare and self.status == .normal) {
            log.debug("{}: on_{s}: ignoring (view started)", .{ self.replica, command });
            return true;
        }

        // Only allow repairs for same view as defense-in-depth:
        assert(message.header.view == self.view);
        return false;
    }
2381
+
2382
/// Returns whether a repair message must be ignored because of view-change constraints.
/// Returns false immediately when the replica is not in a view change.
fn ignore_repair_message_during_view_change(self: *Self, message: *const Message) bool {
    if (self.status != .view_change) return false;

    const command: []const u8 = @tagName(message.header.command);

    switch (message.header.command) {
        // The new view has not started yet, so we cannot answer with a start_view:
        .request_start_view => {
            log.debug("{}: on_{s}: ignoring (view change)", .{ self.replica, command });
            return true;
        },
        // During a view change, only the new leader may request headers/prepares from us:
        .request_headers, .request_prepare => {
            if (self.leader_index(self.view) != message.header.replica) {
                log.debug("{}: on_{s}: ignoring (view change, requested by follower)", .{
                    self.replica,
                    command,
                });
                return true;
            }
        },
        // During a view change, only the new leader consumes headers/nacks, and only
        // after the do_view_change quorum has been received:
        .headers, .nack_prepare => {
            if (self.leader_index(self.view) != self.replica) {
                log.debug("{}: on_{s}: ignoring (view change, received by follower)", .{
                    self.replica,
                    command,
                });
                return true;
            } else if (!self.do_view_change_quorum) {
                log.debug("{}: on_{s}: ignoring (view change, waiting for quorum)", .{
                    self.replica,
                    command,
                });
                return true;
            }
        },
        // The caller (ignore_repair_message) asserts the allowed command set.
        else => unreachable,
    }

    return false;
}
2421
+
2422
/// Returns whether a client request should be dropped (or handled out-of-band,
/// e.g. forwarded to the leader or answered with a cached reply) instead of
/// being prepared by this replica.
fn ignore_request_message(self: *Self, message: *Message) bool {
    assert(message.header.command == .request);

    if (self.status != .normal) {
        // Format the status enum with "{}" (not "{s}", which is for strings),
        // consistent with the other status log lines in this file:
        log.debug("{}: on_request: ignoring ({})", .{ self.replica, self.status });
        return true;
    }

    // Order matters: the follower check establishes that we are the leader,
    // which the duplicate and preparing checks assert on.
    if (self.ignore_request_message_follower(message)) return true;
    if (self.ignore_request_message_duplicate(message)) return true;
    if (self.ignore_request_message_preparing(message)) return true;
    return false;
}
2435
+
2436
/// Returns whether the request is stale, or a duplicate of the latest committed request.
/// Resends the reply to the latest request if the request has been committed.
/// Asserts leadership and normal status (established by the caller's follower check).
fn ignore_request_message_duplicate(self: *Self, message: *const Message) bool {
    assert(self.status == .normal);
    assert(self.leader());

    assert(message.header.command == .request);
    assert(message.header.client > 0);
    assert(message.header.view <= self.view); // See ignore_request_message_follower().
    // A .register request has no session context or request number assigned yet:
    assert(message.header.context == 0 or message.header.operation != .register);
    assert(message.header.request == 0 or message.header.operation != .register);

    if (self.client_table.getPtr(message.header.client)) |entry| {
        assert(entry.reply.header.command == .reply);
        assert(entry.reply.header.client == message.header.client);

        if (message.header.operation == .register) {
            // Fall through below to check if we should resend the .register session reply.
        } else if (entry.session > message.header.context) {
            // The client must not reuse the ephemeral client ID when registering a new session.
            log.alert("{}: on_request: ignoring older session (client bug)", .{self.replica});
            return true;
        } else if (entry.session < message.header.context) {
            // This cannot be because of a partition since we check the client's view number.
            log.alert("{}: on_request: ignoring newer session (client bug)", .{self.replica});
            return true;
        }

        if (entry.reply.header.request > message.header.request) {
            log.debug("{}: on_request: ignoring older request", .{self.replica});
            return true;
        } else if (entry.reply.header.request == message.header.request) {
            if (message.header.checksum == entry.reply.header.parent) {
                assert(entry.reply.header.operation == message.header.operation);

                log.debug("{}: on_request: replying to duplicate request", .{self.replica});
                self.message_bus.send_message_to_client(message.header.client, entry.reply);
                return true;
            } else {
                // Same request number but a different request body checksum:
                // the client issued two distinct requests under one number.
                log.alert("{}: on_request: request collision (client bug)", .{self.replica});
                return true;
            }
        } else if (entry.reply.header.request + 1 == message.header.request) {
            if (message.header.parent == entry.reply.header.checksum) {
                // The client has proved that they received our last reply.
                log.debug("{}: on_request: new request", .{self.replica});
                return false;
            } else {
                // The client may have only one request inflight at a time.
                log.alert("{}: on_request: ignoring new request (client bug)", .{self.replica});
                return true;
            }
        } else {
            log.alert("{}: on_request: ignoring newer request (client bug)", .{self.replica});
            return true;
        }
    } else if (message.header.operation == .register) {
        log.debug("{}: on_request: new session", .{self.replica});
        return false;
    } else if (self.pipeline_prepare_for_client(message.header.client)) |_| {
        // The client registered with the previous leader, which committed and replied back
        // to the client before the view change, after which the register operation was
        // reloaded into the pipeline to be driven to completion by the new leader, which
        // now receives a request from the client that appears to have no session.
        // However, the session is about to be registered, so we must wait for it to commit.
        log.debug("{}: on_request: waiting for session to commit", .{self.replica});
        return true;
    } else {
        // We must have all commits to know whether a session has been evicted. For example,
        // there is the risk of sending an eviction message (even as the leader) if we are
        // partitioned and don't yet know about a session. We solve this by having clients
        // include the view number and rejecting messages from clients with newer views.
        log.err("{}: on_request: no session", .{self.replica});
        self.send_eviction_message_to_client(message.header.client);
        return true;
    }
}
2513
+
2514
/// Returns whether the replica is eligible to process this request as the leader.
/// Takes the client's perspective into account if the client is aware of a newer view.
/// Forwards requests to the leader if the client has an older view.
/// Note: returning true here does not always mean the request is dropped — it may
/// have been forwarded to the leader instead (see below).
fn ignore_request_message_follower(self: *Self, message: *Message) bool {
    assert(self.status == .normal);
    assert(message.header.command == .request);

    // The client is aware of a newer view:
    // Even if we think we are the leader, we may be partitioned from the rest of the cluster.
    // We therefore drop the message rather than flood our partition with traffic.
    if (message.header.view > self.view) {
        log.debug("{}: on_request: ignoring (newer view)", .{self.replica});
        return true;
    } else if (self.leader()) {
        // We are the leader for a view at least as new as the client's: process it.
        return false;
    }

    if (message.header.operation == .register) {
        // We do not forward `.register` requests for the sake of `Header.peer_type()`.
        // This enables the MessageBus to identify client connections on the first message.
        log.debug("{}: on_request: ignoring (follower, register)", .{self.replica});
    } else if (message.header.view < self.view) {
        // The client may not know who the leader is, or may be retrying after a leader failure.
        // We forward to the new leader ahead of any client retry timeout to reduce latency.
        // Since the client is already connected to all replicas, the client may yet receive the
        // reply from the new leader directly.
        log.debug("{}: on_request: forwarding (follower)", .{self.replica});
        self.send_message_to_replica(self.leader_index(self.view), message);
    } else {
        assert(message.header.view == self.view);
        // The client has the correct view, but has retried against a follower.
        // This may mean that the leader is down and that we are about to do a view change.
        // There is also not much we can do as the client already knows who the leader is.
        // We do not forward as this would amplify traffic on the network.

        // TODO This may also indicate a client-leader partition. If we see enough of these,
        // should we trigger a view change to select a leader that clients can reach?
        // This is a question of weighing the probability of a partition vs routing error.
        log.debug("{}: on_request: ignoring (follower, same view)", .{self.replica});
    }

    assert(self.follower());
    return true;
}
2558
+
2559
/// Returns whether the leader should hold off on this request: either a prepare for the
/// same client is already inflight in the pipeline, or the pipeline has no spare capacity.
fn ignore_request_message_preparing(self: *Self, message: *const Message) bool {
    assert(self.status == .normal);
    assert(self.leader());

    assert(message.header.command == .request);
    assert(message.header.client > 0);
    assert(message.header.view <= self.view); // See ignore_request_message_follower().

    if (self.pipeline_prepare_for_client(message.header.client)) |prepare| {
        assert(prepare.message.header.command == .prepare);
        assert(prepare.message.header.client == message.header.client);
        assert(prepare.message.header.op > self.commit_max);

        // The prepare's context records the checksum of the request it was created from,
        // so equality here means this exact request is already being prepared:
        if (message.header.checksum == prepare.message.header.context) {
            log.debug("{}: on_request: ignoring (already preparing)", .{self.replica});
            return true;
        } else {
            // Same client, different request while one is still inflight:
            log.alert("{}: on_request: ignoring (client forked)", .{self.replica});
            return true;
        }
    }

    if (self.pipeline.full()) {
        log.debug("{}: on_request: ignoring (pipeline full)", .{self.replica});
        return true;
    }

    return false;
}
2588
+
2589
/// Returns whether a view-change message (start_view_change, do_view_change, start_view)
/// should be dropped: recovering replicas never participate, older views are stale,
/// and topology faults (self-delivery, do_view_change to a non-leader) are rejected.
fn ignore_view_change_message(self: *Self, message: *const Message) bool {
    assert(message.header.command == .start_view_change or
        message.header.command == .do_view_change or
        message.header.command == .start_view);
    assert(message.header.view > 0); // The initial view is already zero.

    const command: []const u8 = @tagName(message.header.command);

    // 4.3 Recovery
    // While a replica's status is recovering it does not participate in either the request
    // processing protocol or the view change protocol.
    // This is critical for correctness (to avoid data loss):
    if (self.status == .recovering) {
        log.debug("{}: on_{s}: ignoring (recovering)", .{ self.replica, command });
        return true;
    }

    if (message.header.view < self.view) {
        log.debug("{}: on_{s}: ignoring (older view)", .{ self.replica, command });
        return true;
    }

    if (message.header.view == self.view and self.status == .normal) {
        log.debug("{}: on_{s}: ignoring (view started)", .{ self.replica, command });
        return true;
    }

    // These may be caused by faults in the network topology.
    switch (message.header.command) {
        .start_view_change, .start_view => {
            if (message.header.replica == self.replica) {
                log.warn("{}: on_{s}: ignoring (self)", .{ self.replica, command });
                return true;
            }
        },
        .do_view_change => {
            // do_view_change is addressed to the leader of the message's view:
            if (self.leader_index(message.header.view) != self.replica) {
                log.warn("{}: on_{s}: ignoring (follower)", .{ self.replica, command });
                return true;
            }
        },
        // The command set is asserted at the top of this function.
        else => unreachable,
    }

    return false;
}
2635
+
2636
/// Returns whether a prepare message targets an op we may already have, i.e. whether
/// it should be treated as a repair rather than as a new prepare to be appended.
fn is_repair(self: *Self, message: *const Message) bool {
    assert(message.header.command == .prepare);

    switch (self.status) {
        .normal => {
            // Anything from an older view, or at/below our latest op, is a repair:
            if (message.header.view < self.view) return true;
            if (message.header.view == self.view and message.header.op <= self.op) return true;
        },
        .view_change => {
            if (message.header.view < self.view) return true;
            // The view has already started or is newer.
        },
        else => {},
    }

    return false;
}
2649
+
2650
/// Returns whether the replica is the leader for the current view.
/// May only be called while the replica status is normal.
fn leader(self: *Self) bool {
    assert(self.status == .normal);
    const leader_replica = self.leader_index(self.view);
    return leader_replica == self.replica;
}
2656
+
2657
/// Returns the index into the configuration of the leader for a given view.
/// Leadership rotates round-robin through the configuration as the view increments.
fn leader_index(self: *Self, view: u32) u8 {
    const index = @mod(view, self.replica_count);
    return @intCast(u8, index);
}
2661
+
2662
/// Advances `op` to where we need to be before `header` can be processed as a prepare:
/// sets `self.op = header.op - 1` so the incoming prepare extends the log by exactly one.
/// May only be called by a follower in normal status, for the current view.
fn jump_to_newer_op_in_normal_status(self: *Self, header: *const Header) void {
    assert(self.status == .normal);
    assert(self.follower());
    assert(header.view == self.view);
    assert(header.op > self.op + 1);
    // We may have learned of a higher `commit_max` through a commit message before jumping to a
    // newer op that is less than `commit_max` but greater than `commit_min`:
    assert(header.op > self.commit_min);

    log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
        self.replica,
        self.op,
        header.op - 1,
        self.journal.entry_for_op_exact(self.op).?.checksum,
        header.parent,
    });

    self.op = header.op - 1;
    assert(self.op >= self.commit_min);
    assert(self.op + 1 == header.op);
}
2684
+
2685
/// Reinterprets a message body (everything after the message's own header, up to
/// `header.size`) as a slice of journal Headers. The returned slice aliases the
/// message buffer — it is valid only while the message is referenced.
/// NOTE(review): `self` is unused here; kept for method-call syntax.
fn message_body_as_headers(self: *Self, message: *const Message) []Header {
    // TODO Assert message commands that we expect this to be called for.
    assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
    return std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..message.header.size]);
}
2690
+
2691
/// Panics if immediate neighbors in the same view would have a broken hash chain.
/// Assumes gaps and does not require that a preceeds b.
/// Within one view the chain must be perfect: same view + consecutive ops + mismatched
/// parent checksum indicates corruption, which we refuse to continue from.
fn panic_if_hash_chain_would_break_in_the_same_view(
    self: *Self,
    a: *const Header,
    b: *const Header,
) void {
    assert(a.command == .prepare);
    assert(b.command == .prepare);
    assert(a.cluster == b.cluster);
    if (a.view == b.view and a.op + 1 == b.op and a.checksum != b.parent) {
        // Both checksums are valid, so this is a real chain break, not bit rot:
        assert(a.valid_checksum());
        assert(b.valid_checksum());
        log.emerg("{}: panic_if_hash_chain_would_break: a: {}", .{ self.replica, a });
        log.emerg("{}: panic_if_hash_chain_would_break: b: {}", .{ self.replica, b });
        @panic("hash chain would break");
    }
}
2709
+
2710
/// Searches the pipeline for a prepare for a given client.
/// Returns the first matching prepare, or null if the client has nothing inflight.
/// While iterating, verifies that the pipeline forms a perfect op/parent-checksum chain
/// starting immediately after `commit_max`.
fn pipeline_prepare_for_client(self: *Self, client: u128) ?*Prepare {
    assert(self.status == .normal);
    assert(self.leader());
    assert(self.commit_min == self.commit_max);

    // The chain starts at the op after the last commit:
    var op = self.commit_max + 1;
    var parent = self.journal.entry_for_op_exact(self.commit_max).?.checksum;
    var iterator = self.pipeline.iterator();
    while (iterator.next_ptr()) |prepare| {
        assert(prepare.message.header.command == .prepare);
        assert(prepare.message.header.op == op);
        assert(prepare.message.header.parent == parent);

        // A client may have multiple requests in the pipeline if these were committed by
        // the previous leader and were reloaded into the pipeline after a view change.
        if (prepare.message.header.client == client) return prepare;

        parent = prepare.message.header.checksum;
        op += 1;
    }

    // After a full pass, the chain must account for every op between commit_max and self.op:
    assert(self.pipeline.count <= config.pipelining_max);
    assert(self.commit_max + self.pipeline.count == op - 1);
    assert(self.commit_max + self.pipeline.count == self.op);

    return null;
}
2738
+
2739
/// Searches the pipeline for a prepare for a given client and checksum.
/// Passing the prepare_ok message prevents these u128s from being accidentally swapped.
/// Asserts that the returned prepare, if any, exactly matches the prepare_ok.
fn pipeline_prepare_for_prepare_ok(self: *Self, ok: *const Message) ?*Prepare {
    assert(ok.header.command == .prepare_ok);

    assert(self.status == .normal);
    assert(self.leader());

    const prepare = self.pipeline_prepare_for_client(ok.header.client) orelse {
        log.debug("{}: pipeline_prepare_for_prepare_ok: not preparing", .{self.replica});
        return null;
    };

    // The ok's context must carry the checksum of the exact prepare being acked:
    if (ok.header.context != prepare.message.header.checksum) {
        // This can be normal, for example, if an old prepare_ok is replayed.
        log.debug("{}: pipeline_prepare_for_prepare_ok: preparing a different client op", .{
            self.replica,
        });
        return null;
    }

    // The checksum match above implies the headers agree field-for-field:
    assert(prepare.message.header.parent == ok.header.parent);
    assert(prepare.message.header.client == ok.header.client);
    assert(prepare.message.header.request == ok.header.request);
    assert(prepare.message.header.cluster == ok.header.cluster);
    assert(prepare.message.header.epoch == ok.header.epoch);
    // A prepare may be committed in the same view or in a newer view:
    assert(prepare.message.header.view <= ok.header.view);
    assert(prepare.message.header.op == ok.header.op);
    assert(prepare.message.header.commit == ok.header.commit);
    assert(prepare.message.header.offset == ok.header.offset);
    assert(prepare.message.header.operation == ok.header.operation);

    return prepare;
}
2775
+
2776
/// Starting from the latest journal entry, backfill any missing or disconnected headers.
/// A header is disconnected if it breaks the hash chain with its newer neighbor to the right.
/// Since we work backwards from the latest entry, we should always be able to fix the chain.
/// Once headers are connected, backfill any dirty or faulty prepares.
///
/// Repair priority (each step returns before trying the next):
///   1. advance our op to commit_max, 2. fix header breaks, 3. repair dirty prepares,
///   4. commit, 5. (new leader during view change only) repair the pipeline / start the view.
fn repair(self: *Self) void {
    // The repair timeout acts as a rate limit: only repair when it is ticking.
    if (!self.repair_timeout.ticking) {
        log.debug("{}: repair: ignoring (optimistic, not ticking)", .{self.replica});
        return;
    }

    self.repair_timeout.reset();

    assert(self.status == .normal or self.status == .view_change);
    assert(self.repairs_allowed());
    assert(self.commit_min <= self.op);
    assert(self.commit_min <= self.commit_max);

    // We expect these always to exist:
    assert(self.journal.entry_for_op_exact(self.commit_min) != null);
    assert(self.journal.entry_for_op_exact(self.op) != null);

    // Request outstanding committed prepares to advance our op number:
    // This handles the case of an idle cluster, where a follower will not otherwise advance.
    // This is not required for correctness, but for durability.
    if (self.op < self.commit_max) {
        // If the leader repairs during a view change, it will have already advanced
        // `self.op` to the latest op according to the quorum of `do_view_change` messages
        // received, so we must therefore be a follower in normal status:
        assert(self.status == .normal);
        assert(self.follower());
        log.debug("{}: repair: op={} < commit_max={}", .{
            self.replica,
            self.op,
            self.commit_max,
        });
        // We need to advance our op number and therefore have to `request_prepare`,
        // since only `on_prepare()` can do this, not `repair_header()` in `on_headers()`.
        self.send_header_to_replica(self.leader_index(self.view), .{
            .command = .request_prepare,
            // We cannot yet know the checksum of the prepare so we set the context to 0:
            // Context is optional when requesting from the leader but required otherwise.
            .context = 0,
            .cluster = self.cluster,
            .replica = self.replica,
            .view = self.view,
            .op = self.commit_max,
        });
        return;
    }

    // Request any missing or disconnected headers:
    // TODO Snapshots: Ensure that self.commit_min op always exists in the journal.
    var broken = self.journal.find_latest_headers_break_between(self.commit_min, self.op);
    if (broken) |range| {
        log.debug("{}: repair: break: view={} op_min={} op_max={} (commit={}..{} op={})", .{
            self.replica,
            self.view,
            range.op_min,
            range.op_max,
            self.commit_min,
            self.commit_max,
            self.op,
        });
        assert(range.op_min > self.commit_min);
        assert(range.op_max < self.op);
        // A range of `op_min=0` or `op_max=0` should be impossible as a header break:
        // This is the init op that is prepared when the cluster is initialized.
        assert(range.op_min > 0);
        assert(range.op_max > 0);
        if (self.choose_any_other_replica()) |replica| {
            self.send_header_to_replica(replica, .{
                .command = .request_headers,
                .cluster = self.cluster,
                .replica = self.replica,
                .view = self.view,
                // The requested range is carried in the commit (op_min) and op (op_max) fields:
                .commit = range.op_min,
                .op = range.op_max,
            });
        }
        return;
    }

    // Assert that all headers are now present and connected with a perfect hash chain:
    assert(self.op >= self.commit_max);
    assert(self.valid_hash_chain_between(self.commit_min, self.op));

    // Request and repair any dirty or faulty prepares:
    if (self.journal.dirty.len > 0) return self.repair_prepares();

    // Commit ops, which may in turn discover faulty prepares and drive more repairs:
    if (self.commit_min < self.commit_max) return self.commit_ops(self.commit_max);

    if (self.status == .view_change and self.leader_index(self.view) == self.replica) {
        if (self.repair_pipeline_op() != null) return self.repair_pipeline();

        // Start the view as the new leader:
        self.start_view_as_the_new_leader();
    }
}
2875
+
2876
/// Decide whether or not to insert or update a header:
///
/// A repair may never advance or replace `self.op` (critical for correctness):
///
/// Repairs must always backfill in behind `self.op` but may never advance `self.op`.
/// Otherwise, a split-brain leader may reapply an op that was removed through a view
/// change, which could be committed by a higher `commit_max` number in a commit message.
///
/// See this commit message for an example:
/// https://github.com/coilhq/tigerbeetle/commit/6119c7f759f924d09c088422d5c60ac6334d03de
///
/// Our guiding principles around repairs in general:
///
/// * The latest op makes sense of everything else and must not be replaced with a different
///   op or advanced except by the leader in the current view.
///
/// * Do not jump to a view in normal status without receiving a start_view message.
///
/// * Do not commit until the hash chain between `self.commit_min` and `self.op` is fully
///   connected, to ensure that all the ops in this range are correct.
///
/// * Ensure that `self.commit_max` is never advanced for a newer view without first
///   receiving a start_view message, otherwise `self.commit_max` may refer to different ops.
///
/// * Ensure that `self.op` is never advanced by a repair since repairs may occur in a view
///   change where the view has not yet started.
///
/// * Do not assume that an existing op with a older viewstamp can be replaced by an op with
///   a newer viewstamp, but only compare ops in the same view or with reference to the chain.
///   See Figure 3.7 on page 41 in Diego Ongaro's Raft thesis for an example of where an op
///   with an older view number may be committed instead of an op with a newer view number:
///   http://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf.
///
/// Returns true if the header was set as dirty in the journal, false if rejected.
fn repair_header(self: *Self, header: *const Header) bool {
    assert(header.valid_checksum());
    assert(header.invalid() == null);
    assert(header.command == .prepare);

    switch (self.status) {
        .normal => assert(header.view <= self.view),
        .view_change => assert(header.view <= self.view),
        else => unreachable,
    }

    if (header.op > self.op) {
        log.debug("{}: repair_header: false (advances self.op)", .{self.replica});
        return false;
    } else if (header.op == self.op) {
        if (self.journal.entry_for_op_exact_with_checksum(self.op, header.checksum)) |_| {
            // Fall through below to check if self.op is uncommitted AND reordered,
            // which we would see by the presence of an earlier op with higher view number,
            // that breaks the chain with self.op. In this case, we must skip the repair to
            // avoid overwriting any overlapping op.
        } else {
            log.debug("{}: repair_header: false (changes self.op={})", .{
                self.replica,
                self.op,
            });
            return false;
        }
    }

    if (self.journal.entry(header)) |existing| {
        // Do not replace any existing op lightly as doing so may impair durability and even
        // violate correctness by undoing a prepare already acknowledged to the leader:
        if (existing.checksum == header.checksum) {
            if (!self.journal.dirty.bit(header.op)) {
                // Identical and already clean: nothing to repair.
                log.debug("{}: repair_header: false (checksum clean)", .{self.replica});
                return false;
            }

            log.debug("{}: repair_header: exists, checksum dirty", .{self.replica});
        } else if (existing.view == header.view) {
            // The journal must have wrapped:
            // We expect that the same view and op will have the same checksum.
            assert(existing.op != header.op);

            if (existing.op > header.op) {
                log.debug("{}: repair_header: false (view has newer op)", .{self.replica});
                return false;
            }

            log.debug("{}: repair_header: exists, view has older op", .{self.replica});
        } else {
            assert(existing.view != header.view);
            // The op may be the same (replacement) or different (journal wrap) — both occur:
            assert(existing.op == header.op or existing.op != header.op);

            if (!self.repair_header_would_connect_hash_chain(header)) {
                // We cannot replace this op until we are sure that doing so would not
                // violate any prior commitments made to the leader.
                log.debug("{}: repair_header: false (exists)", .{self.replica});
                return false;
            }

            log.debug("{}: repair_header: exists, connects hash chain", .{self.replica});
        }
    } else {
        log.debug("{}: repair_header: gap", .{self.replica});
    }

    // Caveat: Do not repair an existing op or gap if doing so would break the hash chain:
    if (self.repair_header_would_break_hash_chain_with_next_entry(header)) {
        log.debug("{}: repair_header: false (breaks hash chain)", .{self.replica});
        return false;
    }

    // Caveat: Do not repair an existing op or gap if doing so would overlap another:
    if (self.repair_header_would_overlap_another(header)) {
        if (!self.repair_header_would_connect_hash_chain(header)) {
            log.debug("{}: repair_header: false (overlap)", .{self.replica});
            return false;
        }
        // We may have to overlap previous entries in order to connect the hash chain:
        log.debug("{}: repair_header: overlap, connects hash chain", .{self.replica});
    }

    // TODO Snapshots: Skip if this header is already snapshotted.

    // Either we are strictly behind self.op, or we are re-marking self.op itself as dirty:
    assert(header.op < self.op or
        self.journal.entry_for_op_exact(self.op).?.checksum == header.checksum);

    self.journal.set_entry_as_dirty(header);
    return true;
}
3000
+
3001
/// If we repair this header, then would this break the hash chain only to our immediate right?
/// This offers a weak guarantee compared to `repair_header_would_connect_hash_chain()` below.
/// However, this is useful for allowing repairs when the hash chain is sparse.
fn repair_header_would_break_hash_chain_with_next_entry(
    self: *Self,
    header: *const Header,
) bool {
    // Side effect: panic if either neighbor would break the chain within the same view.
    if (self.journal.previous_entry(header)) |previous| {
        self.panic_if_hash_chain_would_break_in_the_same_view(previous, header);
    }

    if (self.journal.next_entry(header)) |next| {
        self.panic_if_hash_chain_would_break_in_the_same_view(header, next);

        if (header.checksum == next.parent) {
            assert(header.view <= next.view);
            assert(header.op + 1 == next.op);
            // We don't break with `next` but this is no guarantee that `next` does not break.
            return false;
        } else {
            // If the journal has wrapped, then err in favor of a break regardless of op order:
            return true;
        }
    }

    // We are not completely sure since there is no entry to the immediate right:
    return false;
}
3029
+
3030
/// If we repair this header, then would this connect the hash chain through to the latest op?
/// This offers a strong guarantee that may be used to replace or overlap an existing op.
///
/// Here is an example of what could go wrong if we did not check for complete connection:
///
/// 1. We do a prepare that's going to be committed.
/// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the left.
/// 3. We do another stale prepare that replaces the first op because it connects to the second.
///
/// This would violate our quorum replication commitment to the leader.
/// The mistake in this example was not that we ignored the break to the left, which we must
/// do to repair reordered ops, but that we did not check for connection to the right.
fn repair_header_would_connect_hash_chain(self: *Self, header: *const Header) bool {
    var entry = header;

    // Walk rightwards from `header` to `self.op`, requiring every parent-checksum link:
    while (entry.op < self.op) {
        if (self.journal.next_entry(entry)) |next| {
            if (entry.checksum == next.parent) {
                assert(entry.view <= next.view);
                assert(entry.op + 1 == next.op);
                entry = next;
            } else {
                // Broken link: the chain does not connect through to self.op.
                return false;
            }
        } else {
            // Gap: a missing entry also means no connection.
            return false;
        }
    }

    // Fully connected: the walk must have terminated exactly at self.op.
    assert(entry.op == self.op);
    assert(entry.checksum == self.journal.entry_for_op_exact(self.op).?.checksum);
    return true;
}
3063
+
3064
+ /// If we repair this header, then would this overlap and overwrite part of another batch?
3065
+ /// Journal entries have variable-sized batches that may overlap if entries are disconnected.
3066
+ fn repair_header_would_overlap_another(self: *Self, header: *const Header) bool {
3067
+ // TODO Snapshots: Handle journal wrap around.
3068
+ {
3069
+ // Look behind this entry for any preceeding entry that this would overlap:
3070
+ var op: u64 = header.op;
3071
+ while (op > 0) {
3072
+ op -= 1;
3073
+ if (self.journal.entry_for_op(op)) |neighbor| {
3074
+ if (self.journal.next_offset(neighbor) > header.offset) return true;
3075
+ break;
3076
+ }
3077
+ }
3078
+ }
3079
+ {
3080
+ // Look beyond this entry for any succeeding entry that this would overlap:
3081
+ var op: u64 = header.op + 1;
3082
+ while (op <= self.op) : (op += 1) {
3083
+ if (self.journal.entry_for_op(op)) |neighbor| {
3084
+ if (self.journal.next_offset(header) > neighbor.offset) return true;
3085
+ break;
3086
+ }
3087
+ }
3088
+ }
3089
+ return false;
3090
+ }
3091
+
3092
+ /// Reads prepares into the pipeline (before we start the view as the new leader).
3093
+ fn repair_pipeline(self: *Self) void {
3094
+ assert(self.status == .view_change);
3095
+ assert(self.leader_index(self.view) == self.replica);
3096
+ assert(self.commit_max < self.op);
3097
+
3098
+ if (self.repairing_pipeline) {
3099
+ log.debug("{}: repair_pipeline: already repairing...", .{self.replica});
3100
+ return;
3101
+ }
3102
+
3103
+ log.debug("{}: repair_pipeline: repairing", .{self.replica});
3104
+
3105
+ assert(!self.repairing_pipeline);
3106
+ self.repairing_pipeline = true;
3107
+
3108
+ self.repair_pipeline_read();
3109
+ }
3110
+
3111
+ /// Returns the next `op` number that needs to be read into the pipeline.
3112
+ fn repair_pipeline_op(self: *Self) ?u64 {
3113
+ assert(self.status == .view_change);
3114
+ assert(self.leader_index(self.view) == self.replica);
3115
+
3116
+ const op = self.commit_max + self.pipeline.count + 1;
3117
+ if (op <= self.op) return op;
3118
+
3119
+ assert(self.commit_max + self.pipeline.count == self.op);
3120
+ return null;
3121
+ }
3122
+
3123
+ fn repair_pipeline_read(self: *Self) void {
3124
+ assert(self.repairing_pipeline);
3125
+ assert(self.status == .view_change);
3126
+ assert(self.leader_index(self.view) == self.replica);
3127
+
3128
+ if (self.repair_pipeline_op()) |op| {
3129
+ assert(op > self.commit_max);
3130
+ assert(op <= self.op);
3131
+ assert(self.commit_max + self.pipeline.count + 1 == op);
3132
+
3133
+ const checksum = self.journal.entry_for_op_exact(op).?.checksum;
3134
+
3135
+ log.debug("{}: repair_pipeline_read: op={} checksum={}", .{
3136
+ self.replica,
3137
+ op,
3138
+ checksum,
3139
+ });
3140
+
3141
+ self.journal.read_prepare(repair_pipeline_push, op, checksum, null);
3142
+ } else {
3143
+ log.debug("{}: repair_pipeline_read: repaired", .{self.replica});
3144
+ self.repairing_pipeline = false;
3145
+ self.repair();
3146
+ }
3147
+ }
3148
+
3149
    /// Journal read callback: pushes a repaired prepare onto the pipeline and kicks off the
    /// read of the next pipeline op.
    /// Our state may have changed arbitrarily while the read was in flight, so every
    /// view-change precondition is re-checked before the prepare is accepted.
    fn repair_pipeline_push(
        self: *Self,
        prepare: ?*Message,
        destination_replica: ?u8,
    ) void {
        assert(destination_replica == null);

        // Clear the in-progress flag immediately; it is set again below only if we continue.
        assert(self.repairing_pipeline);
        self.repairing_pipeline = false;

        if (prepare == null) {
            log.debug("{}: repair_pipeline_push: prepare == null", .{self.replica});
            return;
        }

        // Our state may have advanced significantly while we were reading from disk.
        if (self.status != .view_change) {
            log.debug("{}: repair_pipeline_push: no longer in view change status", .{
                self.replica,
            });
            return;
        }

        if (self.leader_index(self.view) != self.replica) {
            log.debug("{}: repair_pipeline_push: no longer leader", .{self.replica});
            return;
        }

        // We may even be several views ahead and may now have a completely different pipeline.
        const op = self.repair_pipeline_op() orelse {
            log.debug("{}: repair_pipeline_push: pipeline changed", .{self.replica});
            return;
        };

        assert(op > self.commit_max);
        assert(op <= self.op);
        assert(self.commit_max + self.pipeline.count + 1 == op);

        // The read we issued may no longer match the op we now need:
        if (prepare.?.header.op != op) {
            log.debug("{}: repair_pipeline_push: op changed", .{self.replica});
            return;
        }

        // Same op number but a different prepare — the journal entry was replaced meanwhile:
        if (prepare.?.header.checksum != self.journal.entry_for_op_exact(op).?.checksum) {
            log.debug("{}: repair_pipeline_push: checksum changed", .{self.replica});
            return;
        }

        assert(self.status == .view_change);
        assert(self.leader_index(self.view) == self.replica);

        log.debug("{}: repair_pipeline_push: op={} checksum={}", .{
            self.replica,
            prepare.?.header.op,
            prepare.?.header.checksum,
        });

        // Take a reference for the pipeline's copy of this message:
        self.pipeline.push(.{ .message = prepare.?.ref() }) catch unreachable;
        assert(self.pipeline.count >= 1);

        // Continue reading the next pipeline op (if any):
        self.repairing_pipeline = true;
        self.repair_pipeline_read();
    }
3212
+
3213
    /// Requests prepares for dirty journal entries, iterating from the latest op downwards,
    /// within the budget of currently available journal write IOPs.
    /// Stops early once an uncommitted op starts a `nack_prepare` round during a view change.
    fn repair_prepares(self: *Self) void {
        assert(self.status == .normal or self.status == .view_change);
        assert(self.repairs_allowed());
        assert(self.journal.dirty.len > 0);

        // Request enough prepares to utilize our max IO depth:
        var budget = self.journal.writes.available();
        if (budget == 0) {
            log.debug("{}: repair_prepares: waiting for IOP", .{self.replica});
            return;
        }

        // Iterate in reverse chronological order so that uncommitted ops are handled first:
        var op = self.op + 1;
        while (op > 0) {
            op -= 1;

            if (self.journal.dirty.bit(op)) {
                // If this is an uncommitted op, and we are the leader in `view_change` status,
                // then we will `request_prepare` from the cluster, set `nack_prepare_op`,
                // and stop repairing any further prepares:
                // This will also rebroadcast any `request_prepare` every `repair_timeout` tick.
                if (self.repair_prepare(op)) {
                    if (self.nack_prepare_op) |nack_prepare_op| {
                        assert(nack_prepare_op == op);
                        assert(self.status == .view_change);
                        assert(self.leader_index(self.view) == self.replica);
                        assert(op > self.commit_max);
                        return;
                    }

                    // Otherwise, we continue to request prepares until our budget is used:
                    budget -= 1;
                    if (budget == 0) {
                        log.debug("{}: repair_prepares: request budget used", .{self.replica});
                        return;
                    }
                }
            } else {
                // A clean entry must never be marked faulty:
                assert(!self.journal.faulty.bit(op));
            }
        }
    }
3255
+
3256
    /// During a view change, for uncommitted ops, which are few, we optimize for latency:
    ///
    /// * request a `prepare` or `nack_prepare` from all followers in parallel,
    /// * repair as soon as we get a `prepare`, or
    /// * discard as soon as we get a majority of `nack_prepare` messages for the same checksum.
    ///
    /// For committed ops, which represent the bulk of ops, we optimize for throughput:
    ///
    /// * have multiple requests in flight to prime the repair queue,
    /// * rotate these requests across the cluster round-robin,
    /// * to spread the load across connected peers,
    /// * to take advantage of each peer's outgoing bandwidth, and
    /// * to parallelize disk seeks and disk read bandwidth.
    ///
    /// This is effectively "many-to-one" repair, where a single replica recovers using the
    /// resources of many replicas, for faster recovery.
    ///
    /// Returns true if a repair request was sent, false if the prepare is already being
    /// written or if the uncommitted op was rolled back (cluster-of-two special case).
    fn repair_prepare(self: *Self, op: u64) bool {
        assert(self.status == .normal or self.status == .view_change);
        assert(self.repairs_allowed());
        assert(self.journal.dirty.bit(op));

        const checksum = self.journal.entry_for_op_exact(op).?.checksum;

        // We may be appending to or repairing the journal concurrently.
        // We do not want to re-request any of these prepares unnecessarily.
        if (self.journal.writing(op, checksum)) {
            log.debug("{}: repair_prepare: already writing op={} checksum={}", .{
                self.replica,
                op,
                checksum,
            });
            return false;
        }

        const request_prepare = Header{
            .command = .request_prepare,
            // If we request a prepare from a follower, as below, it is critical to pass a checksum:
            // Otherwise we could receive different prepares for the same op number.
            .context = checksum,
            .cluster = self.cluster,
            .replica = self.replica,
            .view = self.view,
            .op = op,
        };

        if (self.status == .view_change and op > self.commit_max) {
            // Only the leader is allowed to do repairs in a view change:
            assert(self.leader_index(self.view) == self.replica);

            const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
            log.debug(
                "{}: repair_prepare: op={} checksum={} (uncommitted, {s}, view_change)",
                .{
                    self.replica,
                    op,
                    checksum,
                    reason,
                },
            );

            if (self.replica_count == 2 and !self.journal.faulty.bit(op)) {
                // This is required to avoid a liveness issue for a cluster-of-two where a new
                // leader learns of an op during a view change but where the op is faulty on
                // the old leader. We must immediately roll back the op since it could not have
                // been committed by the old leader if we know we do not have it, and because
                // the old leader cannot send a nack_prepare for its faulty copy.
                // For this to be correct, the recovery protocol must set all headers as faulty,
                // not only as dirty.
                self.discard_uncommitted_ops_from(op, checksum);
                return false;
            }

            // Initialize the `nack_prepare` quorum counter for this uncommitted op:
            // It is also possible that we may start repairing a lower uncommitted op, having
            // initialized `nack_prepare_op` before we learn of a higher uncommitted dirty op,
            // in which case we also want to reset the quorum counter.
            if (self.nack_prepare_op) |nack_prepare_op| {
                assert(nack_prepare_op <= op);
                if (nack_prepare_op != op) {
                    self.nack_prepare_op = op;
                    self.reset_quorum_messages(
                        &self.nack_prepare_from_other_replicas,
                        .nack_prepare,
                    );
                }
            } else {
                self.nack_prepare_op = op;
                self.reset_quorum_messages(
                    &self.nack_prepare_from_other_replicas,
                    .nack_prepare,
                );
            }

            assert(self.nack_prepare_op.? == op);
            assert(request_prepare.context == checksum);
            // Ask all other replicas in parallel (latency optimization for uncommitted ops):
            self.send_header_to_other_replicas(request_prepare);
        } else {
            const nature = if (op > self.commit_max) "uncommitted" else "committed";
            const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
            log.debug("{}: repair_prepare: op={} checksum={} ({s}, {s})", .{
                self.replica,
                op,
                checksum,
                nature,
                reason,
            });

            // We expect that `repair_prepare()` is called in reverse chronological order:
            // Any uncommitted ops should have already been dealt with.
            // We never roll back committed ops, and thus never regard `nack_prepare` responses.
            // Alternatively, we may not be the leader, in which case we do distinguish anyway.
            assert(self.nack_prepare_op == null);
            assert(request_prepare.context == checksum);
            // Ask a single (rotating) replica (throughput optimization for committed ops):
            if (self.choose_any_other_replica()) |replica| {
                self.send_header_to_replica(replica, request_prepare);
            }
        }

        return true;
    }
3376
+
3377
+ fn repairs_allowed(self: *Self) bool {
3378
+ switch (self.status) {
3379
+ .view_change => {
3380
+ if (self.do_view_change_quorum) {
3381
+ assert(self.leader_index(self.view) == self.replica);
3382
+ return true;
3383
+ } else {
3384
+ return false;
3385
+ }
3386
+ },
3387
+ .normal => return true,
3388
+ else => return false,
3389
+ }
3390
+ }
3391
+
3392
+ /// Replicates to the next replica in the configuration (until we get back to the leader):
3393
+ /// Replication starts and ends with the leader, we never forward back to the leader.
3394
+ /// Does not flood the network with prepares that have already committed.
3395
+ /// TODO Use recent heartbeat data for next replica to leapfrog if faulty (optimization).
3396
+ fn replicate(self: *Self, message: *Message) void {
3397
+ assert(self.status == .normal);
3398
+ assert(message.header.command == .prepare);
3399
+ assert(message.header.view == self.view);
3400
+ assert(message.header.op == self.op);
3401
+
3402
+ if (message.header.op <= self.commit_max) {
3403
+ log.debug("{}: replicate: not replicating (committed)", .{self.replica});
3404
+ return;
3405
+ }
3406
+
3407
+ const next = @mod(self.replica + 1, @intCast(u8, self.replica_count));
3408
+ if (next == self.leader_index(message.header.view)) {
3409
+ log.debug("{}: replicate: not replicating (completed)", .{self.replica});
3410
+ return;
3411
+ }
3412
+
3413
+ log.debug("{}: replicate: replicating to replica {}", .{ self.replica, next });
3414
+ self.send_message_to_replica(next, message);
3415
+ }
3416
+
3417
+ /// Empties the prepare pipeline, unreffing all prepare and prepare_ok messages.
3418
+ /// Stops the prepare timeout and resets the timeouts counter.
3419
+ fn reset_pipeline(self: *Self) void {
3420
+ while (self.pipeline.pop()) |prepare| {
3421
+ self.unref_prepare_message_and_quorum_messages(&prepare);
3422
+ }
3423
+
3424
+ self.prepare_timeout.stop();
3425
+
3426
+ assert(self.pipeline.count == 0);
3427
+ assert(self.prepare_timeout.ticking == false);
3428
+
3429
+ // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
3430
+ // Otherwise, we would be making `repair_pipeline()` reentrant.
3431
+ }
3432
+
3433
+ fn reset_quorum_messages(self: *Self, messages: *QuorumMessages, command: Command) void {
3434
+ assert(messages.len == config.replicas_max);
3435
+ var view: ?u32 = null;
3436
+ var count: usize = 0;
3437
+ for (messages) |*received, replica| {
3438
+ if (received.*) |message| {
3439
+ assert(replica < self.replica_count);
3440
+ assert(message.header.command == command);
3441
+ assert(message.header.replica == replica);
3442
+ // We may have transitioned into a newer view:
3443
+ // However, all messages in the quorum should have the same view.
3444
+ assert(message.header.view <= self.view);
3445
+ if (view) |v| {
3446
+ assert(message.header.view == v);
3447
+ } else {
3448
+ view = message.header.view;
3449
+ }
3450
+
3451
+ self.message_bus.unref(message);
3452
+ count += 1;
3453
+ }
3454
+ received.* = null;
3455
+ }
3456
+ assert(count <= self.replica_count);
3457
+ log.debug("{}: reset {} {s} message(s)", .{ self.replica, count, @tagName(command) });
3458
+ }
3459
+
3460
+ fn reset_quorum_do_view_change(self: *Self) void {
3461
+ self.reset_quorum_messages(&self.do_view_change_from_all_replicas, .do_view_change);
3462
+ self.do_view_change_quorum = false;
3463
+ }
3464
+
3465
+ fn reset_quorum_nack_prepare(self: *Self) void {
3466
+ self.reset_quorum_messages(&self.nack_prepare_from_other_replicas, .nack_prepare);
3467
+ self.nack_prepare_op = null;
3468
+ }
3469
+
3470
+ fn reset_quorum_start_view_change(self: *Self) void {
3471
+ self.reset_quorum_messages(&self.start_view_change_from_other_replicas, .start_view_change);
3472
+ self.start_view_change_quorum = false;
3473
+ }
3474
+
3475
    /// Sends a prepare_ok for `header` to the leader of the *current* view, but only if we are
    /// in normal status, the op is not already known to be committed, and our journal copy of
    /// the prepare is clean.
    fn send_prepare_ok(self: *Self, header: *const Header) void {
        assert(header.command == .prepare);
        assert(header.cluster == self.cluster);
        assert(header.replica == self.leader_index(header.view));
        assert(header.view <= self.view);
        assert(header.op <= self.op or header.view < self.view);

        if (self.status != .normal) {
            log.debug("{}: send_prepare_ok: not sending ({})", .{ self.replica, self.status });
            return;
        }

        if (header.op > self.op) {
            assert(header.view < self.view);
            // An op may be reordered concurrently through a view change while being journalled:
            log.debug("{}: send_prepare_ok: not sending (reordered)", .{self.replica});
            return;
        }

        assert(self.status == .normal);
        // After a view change, replicas send prepare_oks for uncommitted ops with older views:
        // However, we only send to the leader of the current view (see below where we send).
        assert(header.view <= self.view);
        assert(header.op <= self.op);

        if (header.op <= self.commit_max) {
            log.debug("{}: send_prepare_ok: not sending (committed)", .{self.replica});
            return;
        }

        if (self.journal.has_clean(header)) {
            log.debug("{}: send_prepare_ok: op={} checksum={}", .{
                self.replica,
                header.op,
                header.checksum,
            });

            // It is crucial that replicas stop accepting prepare messages from earlier views
            // once they start the view change protocol. Without this constraint, the system
            // could get into a state in which there are two active primaries: the old one,
            // which hasn't failed but is merely slow or not well connected to the network, and
            // the new one. If a replica sent a prepare_ok message to the old primary after
            // sending its log to the new one, the old primary might commit an operation that
            // the new primary doesn't learn about in the do_view_change messages.

            // We therefore only ever send to the leader of the current view, never to the
            // leader of the prepare header's view:
            self.send_header_to_replica(self.leader_index(self.view), .{
                .command = .prepare_ok,
                .parent = header.parent,
                .client = header.client,
                .context = header.checksum,
                .request = header.request,
                .cluster = self.cluster,
                .replica = self.replica,
                .epoch = header.epoch,
                .view = self.view,
                .op = header.op,
                .commit = header.commit,
                .offset = header.offset,
                .operation = header.operation,
            });
        } else {
            log.debug("{}: send_prepare_ok: not sending (dirty)", .{self.replica});
            return;
        }
    }
3542
+
3543
+ fn send_prepare_oks_after_view_change(self: *Self) void {
3544
+ assert(self.status == .normal);
3545
+
3546
+ var op = self.commit_max + 1;
3547
+ while (op <= self.op) : (op += 1) {
3548
+ // We may have breaks or stale headers in our uncommitted chain here. However:
3549
+ // * being able to send what we have will allow the pipeline to commit earlier, and
3550
+ // * the leader will drop any prepare_ok for a prepare not in the pipeline.
3551
+ // This is safe only because the leader can verify against the prepare checksum.
3552
+ if (self.journal.entry_for_op_exact(op)) |header| {
3553
+ self.send_prepare_ok(header);
3554
+ defer self.flush_loopback_queue();
3555
+ }
3556
+ }
3557
+ }
3558
+
3559
+ fn send_start_view_change(self: *Self) void {
3560
+ assert(self.status == .view_change);
3561
+ assert(!self.do_view_change_quorum);
3562
+ // Send only to other replicas (and not to ourself) to avoid a quorum off-by-one error:
3563
+ // This could happen if the replica mistakenly counts its own message in the quorum.
3564
+ self.send_header_to_other_replicas(.{
3565
+ .command = .start_view_change,
3566
+ .cluster = self.cluster,
3567
+ .replica = self.replica,
3568
+ .view = self.view,
3569
+ });
3570
+ }
3571
+
3572
+ fn send_do_view_change(self: *Self) void {
3573
+ assert(self.status == .view_change);
3574
+ assert(self.start_view_change_quorum);
3575
+ assert(!self.do_view_change_quorum);
3576
+ const count_start_view_change = self.count_quorum(
3577
+ &self.start_view_change_from_other_replicas,
3578
+ .start_view_change,
3579
+ 0,
3580
+ );
3581
+ assert(count_start_view_change >= self.quorum_view_change - 1);
3582
+
3583
+ const message = self.create_view_change_message(.do_view_change) orelse {
3584
+ log.alert("{}: send_do_view_change: waiting for message", .{self.replica});
3585
+ return;
3586
+ };
3587
+ defer self.message_bus.unref(message);
3588
+
3589
+ assert(message.references == 1);
3590
+ assert(message.header.command == .do_view_change);
3591
+ assert(message.header.view == self.view);
3592
+ assert(message.header.op == self.op);
3593
+ assert(message.header.commit == self.commit_max);
3594
+ // TODO Assert that latest header in message body matches self.op.
3595
+
3596
+ self.send_message_to_replica(self.leader_index(self.view), message);
3597
+ }
3598
+
3599
+ fn send_eviction_message_to_client(self: *Self, client: u128) void {
3600
+ assert(self.status == .normal);
3601
+ assert(self.leader());
3602
+
3603
+ log.alert("{}: too many sessions, sending eviction message to client={}", .{
3604
+ self.replica,
3605
+ client,
3606
+ });
3607
+
3608
+ self.send_header_to_client(client, .{
3609
+ .command = .eviction,
3610
+ .cluster = self.cluster,
3611
+ .replica = self.replica,
3612
+ .view = self.view,
3613
+ .client = client,
3614
+ });
3615
+ }
3616
+
3617
    /// Wraps `header` in a header-only message and sends it to `client` via the message bus.
    /// Drops the message (with an alert) if no header-only message is available.
    fn send_header_to_client(self: *Self, client: u128, header: Header) void {
        const message = self.create_message_from_header(header) orelse {
            log.alert("{}: no header-only message available, dropping message to client {}", .{
                self.replica,
                client,
            });
            return;
        };
        // Balance the reference taken by message creation once the send has been handed off:
        defer self.message_bus.unref(message);

        self.message_bus.send_message_to_client(client, message);
    }
3629
+
3630
+ fn send_header_to_other_replicas(self: *Self, header: Header) void {
3631
+ const message = self.create_message_from_header(header) orelse {
3632
+ log.alert("{}: no header-only message available, dropping message to replicas", .{
3633
+ self.replica,
3634
+ });
3635
+ return;
3636
+ };
3637
+ defer self.message_bus.unref(message);
3638
+
3639
+ var replica: u8 = 0;
3640
+ while (replica < self.replica_count) : (replica += 1) {
3641
+ if (replica != self.replica) {
3642
+ self.send_message_to_replica(replica, message);
3643
+ }
3644
+ }
3645
+ }
3646
+
3647
    /// Wraps `header` in a header-only message and sends it to `replica`.
    /// Drops the message (with an alert) if no header-only message is available.
    fn send_header_to_replica(self: *Self, replica: u8, header: Header) void {
        const message = self.create_message_from_header(header) orelse {
            log.alert("{}: no header-only message available, dropping message to replica {}", .{
                self.replica,
                replica,
            });
            return;
        };
        // Balance the reference taken by message creation once the send has been handed off:
        defer self.message_bus.unref(message);

        self.send_message_to_replica(replica, message);
    }
3659
+
3660
+ fn send_message_to_other_replicas(self: *Self, message: *Message) void {
3661
+ var replica: u8 = 0;
3662
+ while (replica < self.replica_count) : (replica += 1) {
3663
+ if (replica != self.replica) {
3664
+ self.send_message_to_replica(replica, message);
3665
+ }
3666
+ }
3667
+ }
3668
+
3669
    /// Sends `message` to `replica`, panicking on an invalid header, and asserting the protocol
    /// invariants that must hold for each command type before it leaves this replica.
    /// A message addressed to ourself is placed on the loopback queue instead of being sent
    /// through the message bus.
    fn send_message_to_replica(self: *Self, replica: u8, message: *Message) void {
        log.debug("{}: sending {s} to replica {}: {}", .{
            self.replica,
            @tagName(message.header.command),
            replica,
            message.header,
        });

        // An invalid header indicates a bug in message construction — fail loudly:
        if (message.header.invalid()) |reason| {
            log.emerg("{}: send_message_to_replica: invalid ({s})", .{ self.replica, reason });
            @panic("send_message_to_replica: invalid message");
        }

        assert(message.header.cluster == self.cluster);

        // TODO According to message.header.command, assert on the destination replica.
        switch (message.header.command) {
            .request => {
                // Do not assert message.header.replica because we forward .request messages.
                assert(self.status == .normal);
                assert(message.header.view <= self.view);
            },
            .prepare => {
                // Do not assert message.header.replica because we forward .prepare messages.
                switch (self.status) {
                    .normal => assert(message.header.view <= self.view),
                    .view_change => assert(message.header.view < self.view),
                    else => unreachable,
                }
            },
            .prepare_ok => {
                assert(self.status == .normal);
                assert(message.header.view == self.view);
                // We must only ever send a prepare_ok to the latest leader of the active view:
                // We must never straddle views by sending to a leader in an older view.
                // Otherwise, we would be enabling a partitioned leader to commit.
                assert(replica == self.leader_index(self.view));
                assert(message.header.replica == self.replica);
            },
            .start_view_change => {
                assert(self.status == .view_change);
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
            },
            .do_view_change => {
                assert(self.status == .view_change);
                assert(self.start_view_change_quorum);
                assert(!self.do_view_change_quorum);
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
                assert(message.header.op == self.op);
                assert(replica == self.leader_index(self.view));
            },
            .start_view => switch (self.status) {
                .normal => {
                    // A follower may ask the leader to resend the start_view message.
                    assert(!self.start_view_change_quorum);
                    assert(!self.do_view_change_quorum);
                    assert(message.header.view == self.view);
                    assert(message.header.replica == self.replica);
                },
                .view_change => {
                    assert(self.start_view_change_quorum);
                    assert(self.do_view_change_quorum);
                    assert(message.header.view == self.view);
                    assert(message.header.replica == self.replica);
                },
                else => unreachable,
            },
            .headers => {
                assert(self.status == .normal or self.status == .view_change);
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
            },
            .ping, .pong => {
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
            },
            .commit => {
                assert(self.status == .normal);
                assert(self.leader());
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
            },
            .request_headers => {
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
            },
            .request_prepare => {
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
            },
            .nack_prepare => {
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
                assert(replica == self.leader_index(self.view));
            },
            else => {
                log.notice("{}: send_message_to_replica: TODO {s}", .{
                    self.replica,
                    @tagName(message.header.command),
                });
            },
        }

        // A message to ourself is queued, not sent, so that handling is always asynchronous:
        if (replica == self.replica) {
            assert(self.loopback_queue == null);
            self.loopback_queue = message.ref();
        } else {
            self.message_bus.send_message_to_replica(replica, message);
        }
    }
3781
+
3782
+ /// Finds the header with the highest op number in a slice of headers from a replica.
3783
+ /// Searches only by op number to find the highest `self.op for the replica.
3784
+ fn set_latest_op(self: *Self, headers: []Header, latest: *Header) void {
3785
+ switch (latest.command) {
3786
+ .reserved, .prepare => assert(latest.valid_checksum()),
3787
+ else => unreachable,
3788
+ }
3789
+
3790
+ for (headers) |header| {
3791
+ assert(header.valid_checksum());
3792
+ assert(header.invalid() == null);
3793
+ assert(header.command == .prepare);
3794
+
3795
+ if (latest.command == .reserved or header.op > latest.op) {
3796
+ // We are simply trying to find the latest `self.op` in the replica's log.
3797
+ // We therefore do not compare views here.
3798
+ latest.* = header;
3799
+ }
3800
+ }
3801
+ }
3802
+
3803
    /// During a view change, advances `self.op` to `latest.op` and raises `self.commit_max` to
    /// at least `k`, marking `latest` as dirty in the journal (unless we already have exactly
    /// that entry) and truncating any journal entries beyond the new `self.op`.
    /// `method` is the calling method's name, used for logging only.
    fn set_latest_op_and_k(
        self: *Self,
        latest: *const Header,
        k: u64,
        method: []const u8,
    ) void {
        assert(self.status == .view_change);

        assert(latest.valid_checksum());
        assert(latest.invalid() == null);
        assert(latest.command == .prepare);
        assert(latest.cluster == self.cluster);

        // The view may have started already, so we can have a prepare in the same view:
        assert(latest.view <= self.view);

        log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={} offset={}", .{
            self.replica,
            method,
            self.view,
            self.op,
            latest.op,
            self.commit_max,
            k,
            latest.checksum,
            latest.offset,
        });

        // Uncommitted ops may not survive a view change so we must assert `latest.op` against
        // `commit_max` and not `self.op`. However, committed ops (`commit_max`) must survive:
        assert(latest.op >= self.commit_max);
        assert(latest.op >= latest.commit);
        assert(latest.op >= k);
        // We expect that `commit_max` (and `commit_min`) may be greater than `latest.commit`
        // because `latest.commit` is the commit number at the time the `latest.op` prepared.
        // We expect that `commit_max` (and `commit_min`) may also be greater even than `k`
        // because we may be the old leader joining towards the end of the view change and we
        // may have committed the `latest.op` already. However, this is bounded by pipelining.
        // The intersection property only requires that all "possibly" committed operations must
        // survive into the new view so that they can then be committed by the new leader. This
        // guarantees that if the old leader "possibly" committed the operation, then the new
        // leader will also commit the operation.
        if (k < self.commit_max and self.commit_min == self.commit_max) {
            // This case is logged but not rejected (see the bound discussion above):
            log.debug("{}: {s}: k={} < commit_max={} and commit_min == commit_max", .{
                self.replica,
                method,
                k,
                self.commit_max,
            });
        }
        assert(k >= latest.commit);
        assert(k >= self.commit_max - std.math.min(config.pipelining_max, self.commit_max));

        assert(self.commit_min <= self.commit_max);
        // NOTE(review): the next assert is a tautology (always true); presumably it documents
        // that `self.op` may be on either side of `commit_max` before the update below — confirm.
        assert(self.op >= self.commit_max or self.op < self.commit_max);

        self.op = latest.op;
        // Crucially, we must never rewind `commit_max` (and then `commit_min`) because
        // `commit_min` represents what we have already applied to our state machine:
        self.commit_max = std.math.max(self.commit_max, k);

        assert(self.commit_min <= self.commit_max);
        assert(self.op >= self.commit_max);

        // Do not set the latest op as dirty if we already have it exactly:
        // Otherwise, this would trigger a repair and delay the view change, or worse, it would
        // prevent us from assisting another replica to recover when we do in fact have the op.
        if (self.journal.entry_for_op_exact_with_checksum(latest.op, latest.checksum)) |_| {
            log.debug("{}: {s}: latest op exists exactly", .{ self.replica, method });
        } else {
            self.journal.set_entry_as_dirty(latest);
        }

        assert(self.op == latest.op);
        self.journal.remove_entries_from(self.op + 1);
        assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
    }
3880
+
3881
/// Completes a view change on the new leader: once a do_view_change quorum has been
/// received and the log and pipeline are fully repaired, announce the new view with a
/// start_view message and transition to normal status.
fn start_view_as_the_new_leader(self: *Self) void {
    assert(self.status == .view_change);
    assert(self.leader_index(self.view) == self.replica);
    assert(self.do_view_change_quorum);

    assert(!self.committing);
    assert(!self.repairing_pipeline);

    assert(self.commit_min == self.commit_max);
    assert(self.repair_pipeline_op() == null);
    assert(self.commit_max + self.pipeline.count == self.op);
    assert(self.valid_hash_chain_between(self.commit_min, self.op));

    // Verify that the uncommitted pipeline forms a contiguous hash chain starting
    // immediately after commit_max, before we expose it in the new view:
    var pipeline_op = self.commit_max + 1;
    var pipeline_parent = self.journal.entry_for_op_exact(self.commit_max).?.checksum;
    var iterator = self.pipeline.iterator();
    while (iterator.next_ptr()) |prepare| {
        assert(prepare.message.header.command == .prepare);
        assert(prepare.message.header.op == pipeline_op);
        assert(prepare.message.header.parent == pipeline_parent);

        pipeline_parent = prepare.message.header.checksum;
        pipeline_op += 1;
    }
    assert(self.pipeline.count <= config.pipelining_max);
    assert(self.commit_max + self.pipeline.count == pipeline_op - 1);

    assert(self.journal.dirty.len == 0);
    assert(self.journal.faulty.len == 0);
    assert(self.nack_prepare_op == null);

    // If no message buffer is available right now, remain in view_change status rather
    // than transitioning without being able to announce the new view; we retry later.
    const start_view = self.create_view_change_message(.start_view) orelse {
        log.alert("{}: start_view_as_the_new_leader: waiting for message", .{self.replica});
        return;
    };
    defer self.message_bus.unref(start_view);

    self.transition_to_normal_status(self.view);
    // Detect if the transition to normal status above accidentally resets the pipeline:
    assert(self.commit_max + self.pipeline.count == self.op);

    assert(self.status == .normal);
    assert(self.leader());

    // The start_view message must describe exactly our current state in the new view:
    assert(start_view.references == 1);
    assert(start_view.header.command == .start_view);
    assert(start_view.header.view == self.view);
    assert(start_view.header.op == self.op);
    assert(start_view.header.commit == self.commit_max);

    // Send prepare_ok messages to ourself to contribute to the pipeline.
    self.send_prepare_oks_after_view_change();

    self.send_message_to_other_replicas(start_view);
}
3936
+
3937
/// Enters normal status for `new_view`, starting/stopping the timeouts appropriate for
/// our role (leader or follower) and resetting all view-change quorum state.
/// `new_view` may equal the current view (e.g. after a state transfer triggered by an
/// op jump), so the assert below is `>=` rather than `>`.
fn transition_to_normal_status(self: *Self, new_view: u32) void {
    log.debug("{}: transition_to_normal_status: view={}", .{ self.replica, new_view });
    // In the VRR paper it's possible to transition from normal to normal for the same view.
    // For example, this could happen after a state transfer triggered by an op jump.
    assert(new_view >= self.view);
    self.view = new_view;
    self.view_normal = new_view;
    self.status = .normal;

    if (self.leader()) {
        log.debug("{}: transition_to_normal_status: leader", .{self.replica});

        // The leader pings, drives commits, and repairs; it does not watch for a stalled
        // leader (normal_status_timeout) nor run view-change timeouts.
        self.ping_timeout.start();
        self.commit_timeout.start();
        self.normal_status_timeout.stop();
        self.view_change_status_timeout.stop();
        self.view_change_message_timeout.stop();
        self.repair_timeout.start();

        // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
        if (self.pipeline.count > 0) {
            assert(!self.prepare_timeout.ticking);
            self.prepare_timeout.start();
        }
    } else {
        log.debug("{}: transition_to_normal_status: follower", .{self.replica});

        // A follower watches the leader (normal_status_timeout) instead of driving
        // commits, and has no use for a pipeline of its own:
        self.ping_timeout.start();
        self.commit_timeout.stop();
        self.normal_status_timeout.start();
        self.view_change_status_timeout.stop();
        self.view_change_message_timeout.stop();
        self.repair_timeout.start();

        self.reset_pipeline();
    }

    // Discard any view-change quorum state from the view we are leaving:
    self.reset_quorum_start_view_change();
    self.reset_quorum_do_view_change();
    self.reset_quorum_nack_prepare();

    assert(self.start_view_change_quorum == false);
    assert(self.do_view_change_quorum == false);
    assert(self.nack_prepare_op == null);
}
3982
+
3983
/// A replica i that notices the need for a view change advances its view, sets its status to
/// view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
/// where v identifies the new view. A replica notices the need for a view change either based
/// on its own timer, or because it receives a start_view_change or do_view_change message for
/// a view with a larger number than its own view.
fn transition_to_view_change_status(self: *Self, new_view: u32) void {
    log.debug("{}: transition_to_view_change_status: view={}..{}", .{
        self.replica,
        self.view,
        new_view,
    });
    // Unlike transition_to_normal_status(), a view change must strictly advance the view:
    assert(new_view > self.view);
    self.view = new_view;
    self.status = .view_change;

    // During a view change only the view-change timeouts run; normal-status activity
    // (pings, commits, repair) is suspended:
    self.ping_timeout.stop();
    self.commit_timeout.stop();
    self.normal_status_timeout.stop();
    self.view_change_status_timeout.start();
    self.view_change_message_timeout.start();
    self.repair_timeout.stop();

    // Do not reset quorum counters only on entering a view, assuming that the view will be
    // followed only by a single subsequent view change to the next view, because multiple
    // successive view changes can fail, e.g. after a view change timeout.
    // We must therefore reset our counters here to avoid counting messages from an older view,
    // which would violate the quorum intersection property essential for correctness.
    self.reset_pipeline();
    self.reset_quorum_start_view_change();
    self.reset_quorum_do_view_change();
    self.reset_quorum_nack_prepare();

    assert(self.start_view_change_quorum == false);
    assert(self.do_view_change_quorum == false);
    assert(self.nack_prepare_op == null);

    self.send_start_view_change();
}
4021
+
4022
/// Releases every message reference held by a pipeline prepare: the prepare message
/// itself, plus any prepare_ok messages collected toward its quorum.
fn unref_prepare_message_and_quorum_messages(
    self: *Self,
    prepare: *const Prepare,
) void {
    self.message_bus.unref(prepare.message);
    var replica: usize = 0;
    while (replica < prepare.ok_from_all_replicas.len) : (replica += 1) {
        if (prepare.ok_from_all_replicas[replica]) |prepare_ok| {
            assert(replica < self.replica_count);
            self.message_bus.unref(prepare_ok);
        }
    }
}
4034
+
4035
/// Records the latest committed reply for a client in the client table, replacing (and
/// unreffing) the entry's previous reply. If the client has no entry, the session was
/// evicted while this request was being prepared and the reply is simply not cached.
fn update_client_table_entry(self: *Self, reply: *Message) void {
    assert(reply.header.command == .reply);
    // `register` replies create entries elsewhere; this path only updates existing ones.
    assert(reply.header.operation != .register);
    assert(reply.header.client > 0);
    assert(reply.header.context == 0);
    assert(reply.header.op == reply.header.commit);
    assert(reply.header.commit > 0);
    assert(reply.header.request > 0);

    if (self.client_table.getPtr(reply.header.client)) |entry| {
        // Invariants of the existing cached reply:
        assert(entry.reply.header.command == .reply);
        assert(entry.reply.header.context == 0);
        assert(entry.reply.header.op == entry.reply.header.commit);
        assert(entry.reply.header.commit >= entry.session);

        // The new reply must be for the same client, for exactly the next request,
        // and must be strictly newer than the cached one:
        assert(entry.reply.header.client == reply.header.client);
        assert(entry.reply.header.request + 1 == reply.header.request);
        assert(entry.reply.header.op < reply.header.op);
        assert(entry.reply.header.commit < reply.header.commit);

        // TODO Use this reply's prepare to cross-check against the entry's prepare, if we
        // still have access to the prepare in the journal (it may have been snapshotted).

        log.debug("{}: update_client_table_entry: client={} session={} request={}", .{
            self.replica,
            reply.header.client,
            entry.session,
            reply.header.request,
        });

        // Swap the cached reply, releasing the old reference and taking a new one:
        self.message_bus.unref(entry.reply);
        entry.reply = reply.ref();
    } else {
        // If no entry exists, then the session must have been evicted while being prepared.
        // We can still send the reply, the next request will receive an eviction message.
    }
}
4072
+
4073
/// Whether it is safe to commit or send prepare_ok messages.
/// Returns true if the hash chain is valid and up to date for the current view.
/// This is a stronger guarantee than `valid_hash_chain_between()` below.
fn valid_hash_chain(self: *Self, method: []const u8) bool {
    // If we know we could validate the hash chain even further, then wait until we can:
    // This is partial defense-in-depth in case `self.op` is ever advanced by a reordered op.
    if (self.op < self.commit_max) {
        log.debug("{}: {s}: waiting for repair (op={} < commit={})", .{
            self.replica,
            method,
            self.op,
            self.commit_max,
        });
        return false;
    }

    // We must validate the hash chain as far as possible, since `self.op` may disclose a fork:
    if (self.valid_hash_chain_between(self.commit_min, self.op)) return true;

    log.debug("{}: {s}: waiting for repair (hash chain)", .{ self.replica, method });
    return false;
}
4097
+
4098
/// Returns true if all operations are present, correctly ordered and connected by hash chain,
/// between `op_min` and `op_max` (both inclusive).
fn valid_hash_chain_between(self: *Self, op_min: u64, op_max: u64) bool {
    assert(op_min <= op_max);

    // If we use anything less than self.op then we may commit ops for a forked hash chain that
    // have since been reordered by a new leader.
    assert(op_max == self.op);

    // Walk the chain backwards from op_max, verifying at each step that the parent's
    // checksum matches the child's `parent` field:
    var child = self.journal.entry_for_op_exact(op_max).?;
    var op = op_max;
    while (op > op_min) {
        op -= 1;

        const parent = self.journal.entry_for_op_exact(op) orelse {
            log.debug("{}: valid_hash_chain_between: missing op={}", .{ self.replica, op });
            return false;
        };
        assert(parent.op + 1 == child.op);

        if (parent.checksum != child.parent) {
            log.debug("{}: valid_hash_chain_between: break: A: {}", .{ self.replica, parent });
            log.debug("{}: valid_hash_chain_between: break: B: {}", .{ self.replica, child });
            return false;
        }
        assert(self.ascending_viewstamps(parent, child));
        child = parent;
    }
    assert(child.op == op_min);
    return true;
}
4130
+
4131
/// Reacts to a message that implies a newer (or differently-staged) view than our own:
/// either requests the leader's start_view message (to jump into a normal view) or
/// transitions directly into a view change (to jump into a newer view change).
/// Messages for older views, or that imply no change, are ignored.
fn view_jump(self: *Self, header: *const Header) void {
    // The message's command tells us which status the sender's view implies:
    const to: Status = switch (header.command) {
        .prepare, .commit => .normal,
        .start_view_change, .do_view_change, .start_view => .view_change,
        else => unreachable,
    };

    if (self.status != .normal and self.status != .view_change) return;

    if (header.view < self.view) return;

    // Compare status transitions and decide whether to view jump or ignore:
    switch (self.status) {
        .normal => switch (to) {
            // If the transition is to `.normal`, then ignore if for the same view:
            .normal => if (header.view == self.view) return,
            // If the transition is to `.view_change`, then ignore if the view has started:
            .view_change => if (header.view == self.view) return,
            else => unreachable,
        },
        .view_change => switch (to) {
            // This is an interesting special case:
            // If the transition is to `.normal` in the same view, then we missed the
            // `start_view` message and we must also consider this a view jump:
            // If we don't handle this below then our `view_change_status_timeout` will fire
            // and we will disrupt the cluster with another view change for a newer view.
            .normal => {},
            // If the transition is to `.view_change`, then ignore if for the same view:
            .view_change => if (header.view == self.view) return,
            else => unreachable,
        },
        else => unreachable,
    }

    switch (to) {
        .normal => {
            if (header.view == self.view) {
                // The special case above: we are stuck in view_change for a view that
                // has already started elsewhere.
                assert(self.status == .view_change);

                log.debug("{}: view_jump: waiting to exit view change", .{self.replica});
            } else {
                assert(header.view > self.view);
                assert(self.status == .view_change or self.status == .normal);

                log.debug("{}: view_jump: waiting to jump to newer view", .{self.replica});
            }

            // We cannot jump into a normal view directly; we must first obtain the
            // leader's start_view message so that our log matches the new view.
            // TODO Debounce and decouple this from `on_message()` by moving into `tick()`:
            log.debug("{}: view_jump: requesting start_view message", .{self.replica});
            self.send_header_to_replica(self.leader_index(header.view), .{
                .command = .request_start_view,
                .cluster = self.cluster,
                .replica = self.replica,
                .view = header.view,
            });
        },
        .view_change => {
            assert(header.view > self.view);
            assert(self.status == .view_change or self.status == .normal);

            if (header.view == self.view + 1) {
                log.debug("{}: view_jump: jumping to view change", .{self.replica});
            } else {
                log.debug("{}: view_jump: jumping to next view change", .{self.replica});
            }
            self.transition_to_view_change_status(header.view);
        },
        else => unreachable,
    }
}
4201
+
4202
/// Asks the journal to persist a prepare, unless the journal's header for this op has
/// since changed, or an identical write is already in flight.
/// `trigger` distinguishes a fresh append from a repair (see write_prepare_on_write()).
fn write_prepare(self: *Self, message: *Message, trigger: Journal.Write.Trigger) void {
    assert(message.references > 0);
    assert(message.header.command == .prepare);
    assert(message.header.view <= self.view);
    assert(message.header.op <= self.op);

    const op = message.header.op;
    const checksum = message.header.checksum;

    if (!self.journal.has(message.header)) {
        // The journal entry for this op was replaced after this write was requested;
        // persisting the stale prepare would be wasted (and misleading) work.
        log.debug("{}: write_prepare: ignoring op={} checksum={} (header changed)", .{
            self.replica,
            op,
            checksum,
        });
    } else if (self.journal.writing(op, checksum)) {
        // The exact same prepare is already being written; avoid a duplicate write.
        log.debug("{}: write_prepare: ignoring op={} checksum={} (already writing)", .{
            self.replica,
            op,
            checksum,
        });
    } else {
        self.journal.write_prepare(write_prepare_on_write, message, trigger);
    }
}
4228
+
4229
/// Journal callback invoked when a prepare write completes (or is abandoned).
/// On success, acknowledges the prepare with a prepare_ok; for repairs, immediately
/// continues repairing rather than waiting for the next repair timeout.
fn write_prepare_on_write(
    self: *Self,
    wrote: ?*Message,
    trigger: Journal.Write.Trigger,
) void {
    // A null `wrote` indicates that the journal did not complete the write.
    if (wrote) |message| {
        self.send_prepare_ok(message.header);
        defer self.flush_loopback_queue();

        // If this was a repair, continue immediately to repair the next prepare:
        // This is an optimization to eliminate waiting until the next repair timeout.
        if (trigger == .repair) self.repair();
    }
}
4247
+ };
4248
+ }