tigerbeetle-node 0.11.8 → 0.11.10
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/dist/.client.node.sha256 +1 -1
- package/package.json +4 -3
- package/scripts/build_lib.sh +41 -0
- package/src/node.zig +1 -1
- package/src/tigerbeetle/scripts/validate_docs.sh +7 -1
- package/src/tigerbeetle/src/benchmark.zig +3 -3
- package/src/tigerbeetle/src/config.zig +31 -16
- package/src/tigerbeetle/src/constants.zig +48 -9
- package/src/tigerbeetle/src/ewah.zig +5 -5
- package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
- package/src/tigerbeetle/src/lsm/binary_search.zig +1 -1
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +1 -1
- package/src/tigerbeetle/src/lsm/compaction.zig +34 -21
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +84 -104
- package/src/tigerbeetle/src/lsm/grid.zig +19 -13
- package/src/tigerbeetle/src/lsm/manifest_log.zig +8 -10
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +18 -13
- package/src/tigerbeetle/src/lsm/merge_iterator.zig +1 -1
- package/src/tigerbeetle/src/lsm/segmented_array.zig +17 -17
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +1 -1
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +1 -1
- package/src/tigerbeetle/src/lsm/table.zig +8 -20
- package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
- package/src/tigerbeetle/src/lsm/table_iterator.zig +3 -3
- package/src/tigerbeetle/src/lsm/table_mutable.zig +14 -2
- package/src/tigerbeetle/src/lsm/test.zig +5 -4
- package/src/tigerbeetle/src/lsm/tree.zig +1 -2
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +85 -115
- package/src/tigerbeetle/src/message_bus.zig +4 -4
- package/src/tigerbeetle/src/message_pool.zig +7 -10
- package/src/tigerbeetle/src/ring_buffer.zig +22 -12
- package/src/tigerbeetle/src/simulator.zig +366 -239
- package/src/tigerbeetle/src/state_machine/auditor.zig +5 -5
- package/src/tigerbeetle/src/state_machine/workload.zig +3 -3
- package/src/tigerbeetle/src/state_machine.zig +190 -178
- package/src/tigerbeetle/src/{util.zig → stdx.zig} +2 -0
- package/src/tigerbeetle/src/storage.zig +13 -6
- package/src/tigerbeetle/src/{test → testing/cluster}/message_bus.zig +3 -3
- package/src/tigerbeetle/src/{test → testing/cluster}/network.zig +46 -22
- package/src/tigerbeetle/src/testing/cluster/state_checker.zig +169 -0
- package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +202 -0
- package/src/tigerbeetle/src/testing/cluster.zig +443 -0
- package/src/tigerbeetle/src/{test → testing}/fuzz.zig +0 -0
- package/src/tigerbeetle/src/testing/hash_log.zig +66 -0
- package/src/tigerbeetle/src/{test → testing}/id.zig +0 -0
- package/src/tigerbeetle/src/testing/packet_simulator.zig +365 -0
- package/src/tigerbeetle/src/{test → testing}/priority_queue.zig +1 -1
- package/src/tigerbeetle/src/testing/reply_sequence.zig +139 -0
- package/src/tigerbeetle/src/{test → testing}/state_machine.zig +3 -1
- package/src/tigerbeetle/src/testing/storage.zig +757 -0
- package/src/tigerbeetle/src/{test → testing}/table.zig +21 -0
- package/src/tigerbeetle/src/{test → testing}/time.zig +0 -0
- package/src/tigerbeetle/src/tigerbeetle.zig +2 -0
- package/src/tigerbeetle/src/tracer.zig +3 -3
- package/src/tigerbeetle/src/unit_tests.zig +4 -4
- package/src/tigerbeetle/src/vopr.zig +2 -2
- package/src/tigerbeetle/src/vsr/client.zig +5 -2
- package/src/tigerbeetle/src/vsr/clock.zig +93 -53
- package/src/tigerbeetle/src/vsr/journal.zig +109 -98
- package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +2 -2
- package/src/tigerbeetle/src/vsr/replica.zig +1983 -1430
- package/src/tigerbeetle/src/vsr/replica_format.zig +13 -13
- package/src/tigerbeetle/src/vsr/superblock.zig +240 -142
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -7
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +1 -1
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +49 -14
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +38 -19
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +48 -48
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +51 -51
- package/src/tigerbeetle/src/vsr.zig +99 -33
- package/src/tigerbeetle/src/demo.zig +0 -132
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +0 -35
- package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +0 -7
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +0 -37
- package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +0 -61
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +0 -37
- package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +0 -24
- package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +0 -7
- package/src/tigerbeetle/src/test/cluster.zig +0 -352
- package/src/tigerbeetle/src/test/conductor.zig +0 -366
- package/src/tigerbeetle/src/test/packet_simulator.zig +0 -398
- package/src/tigerbeetle/src/test/state_checker.zig +0 -169
- package/src/tigerbeetle/src/test/storage.zig +0 -864
- package/src/tigerbeetle/src/test/storage_checker.zig +0 -204
--- package/src/tigerbeetle/src/vsr/replica.zig
+++ package/src/tigerbeetle/src/vsr/replica.zig
@@ -3,6 +3,7 @@ const Allocator = std.mem.Allocator;
 const assert = std.debug.assert;

 const constants = @import("../constants.zig");
+const stdx = @import("../stdx.zig");

 const StaticAllocator = @import("../static_allocator.zig");
 const GridType = @import("../lsm/grid.zig").GridType;
@@ -24,26 +25,8 @@ const tracer = @import("../tracer.zig");
 pub const Status = enum {
     normal,
     view_change,
-    // Recovery (for replica_count > 1):
-    //
-    // 1. Open the replica:
-    //    a. At replica start: `status=recovering`.
-    //    b. Recover the WAL. Mark questionable entries as faulty.
-    //    c. If the WAL has no entries (besides the initial commit), skip to step 3 with view 0.
-    // 2. Run VSR recovery protocol:
-    //    a. Send a `recovery` message to every replica (except self).
-    //    b. Wait for f+1 `recovery_response` messages from replicas in `normal` status.
-    //       Each `recovery_response` includes the current view number.
-    //       Each `recovery_response` must include a nonce matching the `recovery` message.
-    //    c. Wait for a `recovery_response` from the primary of the highest known view.
-    // 3. Transition to `status=normal` with the discovered view number:
-    //    * Set `op` to the highest op in the primary's recovery response.
-    //    * Repair faulty messages.
-    //    * Commit through to the discovered `commit_max`.
-    //    * Set `state_machine.prepare_timeout` to the current op's timestamp.
-    //
-    // TODO Document state transfer in this progression.
     recovering,
+    recovering_head,
 };

 const Nonce = u128;
@@ -59,27 +42,17 @@ const Prepare = struct {
     ok_quorum_received: bool = false,
 };

+const Request = struct {
+    message: *Message, // header.command == .request
+    realtime: i64,
+};
+
 const QuorumMessages = [constants.replicas_max]?*Message;
 const quorum_messages_null = [_]?*Message{null} ** constants.replicas_max;

 const QuorumCounter = std.StaticBitSet(constants.replicas_max);
 const quorum_counter_null = QuorumCounter.initEmpty();

-// CRITICAL: The number of prepare headers to include in the body:
-// We must provide enough headers to cover all uncommitted headers so that the new
-// primary (if we are in a view change) can decide whether to discard uncommitted headers
-// that cannot be repaired because they are gaps, and this must be relative to the
-// cluster as a whole (not relative to the difference between our op and commit number)
-// as otherwise we would break correctness.
-const view_change_headers_count = constants.pipeline_max;
-
-comptime {
-    assert(view_change_headers_count > 0);
-    assert(view_change_headers_count >= constants.pipeline_max);
-    assert(view_change_headers_count <=
-        @divFloor(constants.message_size_max - @sizeOf(Header), @sizeOf(Header)));
-}
-
 pub fn ReplicaType(
     comptime StateMachine: type,
     comptime MessageBus: type,
@@ -129,22 +102,44 @@ pub fn ReplicaType(
         /// For executing service up-calls after an operation has been committed:
         state_machine: StateMachine,

-
+        /// Durably store VSR state, the "root" of the LSM tree, and other replica metadata.
         superblock: SuperBlock,
+
+        /// Context for SuperBlock.open() and .checkpoint().
         superblock_context: SuperBlock.Context = undefined,
+        /// Context for SuperBlock.view_change().
+        superblock_context_view_change: SuperBlock.Context = undefined,
+
         grid: Grid,
         opened: bool,

-        /// The current view
+        /// The current view.
+        /// Initialized from the superblock's VSRState.
+        ///
+        /// Invariants:
+        /// * `replica.view = replica.log_view` when status=normal
+        /// * `replica.view ≥ replica.log_view`
+        /// * `replica.view ≥ replica.view_durable`
+        /// * `replica.view = 0` when replica_count=1.
         view: u32,

-        /// The latest view
-
+        /// The latest view where
+        /// - the replica was a primary and acquired a DVC quorum, or
+        /// - the replica was a backup and processed a SV message.
+        /// i.e. the latest view in which this replica changed its head message.
+        ///
+        /// Initialized from the superblock's VSRState.
+        ///
+        /// Invariants (see `view` for others):
+        /// * `replica.log_view ≥ replica.log_view_durable`
+        /// * `replica.log_view = 0` when replica_count=1.
         log_view: u32,

         /// The current status, either normal, view_change, or recovering:
         status: Status = .recovering,

         /// The op number assigned to the most recently prepared operation.
+        /// This op is sometimes referred to as the replica's "head" or "head op".
         ///
         /// Invariants (not applicable during status=recovering):
         /// * `replica.op` exists in the Journal.
@@ -159,10 +154,6 @@ pub fn ReplicaType(
         // Also verify that a corresponding header exists in the WAL.
         op: u64,

-        /// The op of the highest checkpointed message.
-        // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
-        op_checkpoint: u64,
-
         /// The op number of the latest committed and executed operation (according to the replica):
         /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
         ///
@@ -190,15 +181,20 @@ pub fn ReplicaType(
         /// * checkpointing
         committing: bool = false,

-        /// Whether we are reading a prepare from storage
-
-
-        /// The
-        ///
-
-
-
-
+        /// Whether we are reading a prepare from storage to construct the pipeline.
+        pipeline_repairing: bool = false,
+
+        /// The pipeline is a queue for a replica which is the primary and in status=normal.
+        /// At all other times the pipeline is a cache.
+        pipeline: union(enum) {
+            /// The primary's pipeline of inflight prepares waiting to commit in FIFO order,
+            /// with a tail of pending requests which have not begun to prepare.
+            /// This allows us to pipeline without the complexity of out-of-order commits.
+            queue: PipelineQueue,
+            /// Prepares in the cache may be committed or uncommitted, and may not belong to the
+            /// current view.
+            cache: PipelineCache,
+        },

         /// In some cases, a replica may send a message to itself. We do not submit these messages
         /// to the message bus but rather queue them here for guaranteed immediate delivery, which
@@ -214,9 +210,6 @@ pub fn ReplicaType(
         /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
         nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,

-        /// Unique recovery_response messages from OTHER replicas (excluding ourself).
-        recovery_response_from_other_replicas: QuorumMessages = quorum_messages_null,
-
         /// Whether a replica has received a quorum of start_view_change messages for the view change:
         start_view_change_quorum: bool = false,

@@ -254,9 +247,6 @@ pub fn ReplicaType(
         /// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
         repair_timeout: Timeout,

-        /// The number of ticks before attempting to send another set of `recovery` messages.
-        recovery_timeout: Timeout,
-
         /// The nonce of the `recovery` messages.
         recovery_nonce: Nonce,

@@ -268,6 +258,7 @@ pub fn ReplicaType(
         /// Seeded with the replica's index number.
         prng: std.rand.DefaultPrng,

+        context: ?*anyopaque = null,
         /// Simulator hooks.
         on_change_state: ?fn (replica: *const Self) void = null,
         /// Called immediately after a compaction.
@@ -355,33 +346,83 @@ pub fn ReplicaType(
             // Open the (Forest inside) StateMachine:
             self.opened = false;
             self.state_machine.open(state_machine_open_callback);
-            while (!self.opened)
-                // self.grid.tick();
-                self.superblock.storage.tick();
-            }
+            while (!self.opened) self.superblock.storage.tick();

             self.opened = false;
             self.journal.recover(journal_recover_callback);
             while (!self.opened) self.superblock.storage.tick();

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            const vsr_headers = self.superblock.working.vsr_headers();
+            var op_head: u64 = vsr_headers.slice[0].op;
+            for (self.journal.headers) |*header| {
+                if (header.command == .prepare and header.op > op_head) {
+                    assert(self.log_view >= header.view);
+                    assert(self.log_view == self.view);
+
+                    op_head = header.op;
+                }
+            }
+
+            self.op = op_head;
+            for (vsr_headers.slice) |*header| {
+                const slot = .{ .index = header.op % constants.journal_slot_count };
+                if (self.journal.has(header)) {
+                    // Header is already in the WAL.
+                    assert(!self.journal.dirty.bit(slot));
+                    assert(!self.journal.faulty.bit(slot));
+                } else if (self.journal.header_for_op(header.op)) |journal_header| {
+                    assert(!self.journal.dirty.bit(slot));
+                    assert(!self.journal.faulty.bit(slot));
+
+                    if (header.op < journal_header.op) {
+                        // Don't overwrite a newer op.
+                        // (This must be a SV message because a DVC would not have a newer op).
+                        assert(self.log_view == self.view);
+                    } else {
+                        self.journal.set_header_as_dirty(header);
+                    }
+                } else {
+                    assert(self.journal.dirty.bit(slot) == self.journal.faulty.bit(slot));
+
+                    self.journal.headers[slot.index] = header.*;
+                    self.journal.dirty.set(slot);
+                    // Don't touch faulty — if it is set, we don't want to unset it. The WAL slot
+                    // may contain a corrupt version is this op, and we don't want to incorrectly
+                    // nack it. (This is why we do not call replace_header()/set_header_as_dirty()
+                    // here.)
+                }
+            }
+
+            const header_head = self.journal.header_with_op(self.op).?;
+            assert(header_head.view <= self.superblock.working.vsr_state.log_view);
+
+            if (self.replica_count == 1) {
+                if (self.journal.faulty.count > 0) {
+                    @panic("journal is corrupt");
+                }
+                assert(self.op_head_certain());
+
+                if (self.commit_min < self.op) {
+                    self.commit_journal(self.op);
+                } else {
+                    self.transition_to_normal_from_recovering_status();
+                }
             } else {
-
+                // Even if op_head_certain() returns false, a DVC always has a certain head op.
+                if (self.log_view < self.view or self.op_head_certain()) {
+                    if (self.log_view == self.view) {
+                        if (self.primary_index(self.view) == self.replica) {
+                            self.transition_to_view_change_status(self.view + 1);
+                        } else {
+                            self.transition_to_normal_from_recovering_status();
+                        }
+                    } else {
+                        assert(self.view > self.log_view);
+                        self.transition_to_view_change_status(self.view);
+                    }
+                } else {
+                    self.transition_to_recovering_head();
+                }
             }
         }

@@ -504,11 +545,11 @@ pub fn ReplicaType(
                 .grid = self.grid,
                 .opened = self.opened,
                 .view = self.superblock.working.vsr_state.view,
-                .
+                .log_view = self.superblock.working.vsr_state.log_view,
                 .op = 0,
-                .op_checkpoint = self.superblock.working.vsr_state.commit_min,
                 .commit_min = self.superblock.working.vsr_state.commit_min,
                 .commit_max = self.superblock.working.vsr_state.commit_max,
+                .pipeline = .{ .cache = .{} },
                 .ping_timeout = Timeout{
                     .name = "ping_timeout",
                     .id = replica_index,
@@ -544,11 +585,6 @@ pub fn ReplicaType(
                     .id = replica_index,
                     .after = 50,
                 },
-                .recovery_timeout = Timeout{
-                    .name = "recovery_timeout",
-                    .id = replica_index,
-                    .after = 200,
-                },
                 .recovery_nonce = recovery_nonce,
                 .prng = std.rand.DefaultPrng.init(replica_index),
             };
@@ -586,7 +622,11 @@ pub fn ReplicaType(
             self.grid.deinit(allocator);
             defer self.message_bus.deinit(allocator);

-
+            // TODO(Zig) 0.10: inline-switch.
+            switch (self.pipeline) {
+                .queue => |*pipeline| pipeline.deinit(self.message_bus.pool),
+                .cache => |*pipeline| pipeline.deinit(self.message_bus.pool),
+            }

             if (self.loopback_queue) |loopback_message| {
                 assert(loopback_message.next == null);
@@ -606,10 +646,6 @@ pub fn ReplicaType(
             for (self.do_view_change_from_all_replicas) |message| {
                 if (message) |m| self.message_bus.unref(m);
             }
-
-            for (self.recovery_response_from_other_replicas) |message| {
-                if (message) |m| self.message_bus.unref(m);
-            }
         }

         /// The client table records for each client the latest session and the latest committed reply.
@@ -629,36 +665,8 @@ pub fn ReplicaType(

             // TODO Replica owns Time; should it tick() here instead of Clock?
             self.clock.tick();
-            // self.grid.tick();
             self.message_bus.tick();

-            if (self.status == .recovering) {
-                if (self.recovery_timeout.ticking) {
-                    // Continue running the VSR recovery protocol.
-                    self.recovery_timeout.tick();
-                    if (self.recovery_timeout.fired()) self.on_recovery_timeout();
-                } else if (self.replica_count == 1) {
-                    // A cluster-of-one does not run the VSR recovery protocol.
-                    if (self.committing) return;
-                    assert(self.journal.faulty.count == 0);
-                    assert(self.op == 0);
-                    // TODO Assert that this path isn't taken more than once.
-                    self.op = self.journal.op_maximum();
-                    assert(self.op >= self.commit_min);
-                    assert(self.op >= self.op_checkpoint);
-                    assert(self.op <= self.op_checkpoint_trigger());
-                    assert(self.journal.header_with_op(self.op) != null);
-                    self.commit_journal(self.op);
-                    // The recovering→normal transition is deferred until all ops are committed.
-                } else {
-                    // The journal just finished recovery.
-                    // Now try to learn the current view via the VSR recovery protocol.
-                    self.recovery_timeout.start();
-                    self.recover();
-                }
-                return;
-            }
-
             self.ping_timeout.tick();
             self.prepare_timeout.tick();
             self.commit_timeout.tick();
@@ -725,8 +733,6 @@ pub fn ReplicaType(
                 .start_view_change => self.on_start_view_change(message),
                 .do_view_change => self.on_do_view_change(message),
                 .start_view => self.on_start_view(message),
-                .recovery => self.on_recovery(message),
-                .recovery_response => self.on_recovery_response(message),
                 .request_start_view => self.on_request_start_view(message),
                 .request_prepare => self.on_request_prepare(message),
                 .request_headers => self.on_request_headers(message),
@@ -807,18 +813,22 @@ pub fn ReplicaType(
             self.clock.learn(message.header.replica, m0, t1, m2);
         }

-        ///
-        ///
-        ///
-        ///
-        /// it
+        /// When there is free space in the pipeline's prepare queue:
+        /// The primary advances op-number, adds the request to the end of the log, and updates the
+        /// information for this client in the client-table to contain the new request number, s.
+        /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
+        /// current view-number, m is the message it received from the client, n is the op-number
+        /// it assigned to the request, and k is the commit-number.
+        /// Otherwise, when there is room in the pipeline's request queue:
+        /// The request is queued, and will be dequeued & prepared when the pipeline head commits.
+        /// Otherwise, drop the request.
         fn on_request(self: *Self, message: *Message) void {
             if (self.ignore_request_message(message)) return;

             assert(self.status == .normal);
             assert(self.primary());
             assert(self.commit_min == self.commit_max);
-            assert(self.commit_max + self.pipeline.count == self.op);
+            assert(self.commit_max + self.pipeline.queue.prepare_queue.count == self.op);

             assert(message.header.command == .request);
             assert(message.header.view <= self.view); // The client's view may be behind ours.
@@ -828,59 +838,16 @@ pub fn ReplicaType(
                 return;
             };

-
-
-
-
-            // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
-            // may be our first prepare as a recently elected primary:
-                std.math.max(
-                    self.state_machine.prepare_timestamp,
-                    self.state_machine.commit_timestamp,
-                ) + 1,
-                @intCast(u64, realtime),
-            );
-            assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);
-
-            const prepare_timestamp = self.state_machine.prepare(
-                message.header.operation.cast(StateMachine),
-                message.body(),
-            );
-
-            const latest_entry = self.journal.header_with_op(self.op).?;
-            message.header.parent = latest_entry.checksum;
-            message.header.context = message.header.checksum;
-            message.header.view = self.view;
-            message.header.op = self.op + 1;
-            message.header.commit = self.commit_max;
-            message.header.timestamp = prepare_timestamp;
-            message.header.replica = self.replica;
-            message.header.command = .prepare;
-
-            message.header.set_checksum_body(message.body());
-            message.header.set_checksum();
-
-            log.debug("{}: on_request: prepare {}", .{ self.replica, message.header.checksum });
-
-            self.pipeline.push_assume_capacity(.{ .message = message.ref() });
-            assert(self.pipeline.count >= 1);
+            const request = .{
+                .message = message.ref(),
+                .realtime = realtime,
+            };

-            if (self.pipeline.
-
-                assert(!self.prepare_timeout.ticking);
-                self.prepare_timeout.start();
+            if (self.pipeline.queue.prepare_queue.full()) {
+                self.pipeline.queue.push_request(request);
             } else {
-
-                assert(self.prepare_timeout.ticking);
-                const previous = self.pipeline.get_ptr(self.pipeline.count - 2).?;
-                assert(previous.message.header.checksum == message.header.parent);
+                self.primary_pipeline_prepare(request);
             }
-
-            self.on_prepare(message);
-
-            // We expect `on_prepare()` to increment `self.op` to match the primary's latest prepare:
-            // This is critical to ensure that pipelined prepares do not receive the same op number.
-            assert(self.op == message.header.op);
         }

         /// Replication is simple, with a single code path for the primary and backups.
@@ -937,7 +904,7 @@ pub fn ReplicaType(
             log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
                 self.replica,
                 message.header.op,
-                self.op_checkpoint,
+                self.op_checkpoint(),
             });
             // When we are the primary, `on_request` enforces this invariant.
             assert(self.backup());
@@ -948,7 +915,7 @@ pub fn ReplicaType(
             assert(message.header.view == self.view);
             assert(self.primary() or self.backup());
             assert(message.header.replica == self.primary_index(message.header.view));
-            assert(message.header.op > self.op_checkpoint);
+            assert(message.header.op > self.op_checkpoint());
             assert(message.header.op > self.op);
             assert(message.header.op > self.commit_min);
             assert(message.header.op <= self.op_checkpoint_trigger());
@@ -998,11 +965,20 @@ pub fn ReplicaType(
             assert(message.header.view == self.view);
             assert(self.primary());

-            const prepare = self.
+            const prepare = self.pipeline.queue.prepare_by_prepare_ok(message) orelse {
+                // This can be normal, for example, if an old prepare_ok is replayed.
+                log.debug("{}: on_prepare_ok: not preparing ok={} checksum={}", .{
+                    self.replica,
+                    message.header.op,
+                    message.header.context,
+                });
+                return;
+            };

             assert(prepare.message.header.checksum == message.header.context);
             assert(prepare.message.header.op >= self.commit_max + 1);
-            assert(prepare.message.header.op <= self.commit_max +
+            assert(prepare.message.header.op <= self.commit_max +
+                self.pipeline.queue.prepare_queue.count);
             assert(prepare.message.header.op <= self.op);

             // Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
@@ -1010,7 +986,7 @@ pub fn ReplicaType(
             // TODO: When Block recover & state transfer are implemented, this can be removed.
             const threshold =
                 if (prepare.message.header.op == self.op_checkpoint_trigger() or
-                prepare.message.header.op == self.op_checkpoint + constants.lsm_batch_multiple + 1)
+                prepare.message.header.op == self.op_checkpoint() + constants.lsm_batch_multiple + 1)
                 self.replica_count
             else
                 self.quorum_replication;
@@ -1199,28 +1175,6 @@ pub fn ReplicaType(
         /// informs the other replicas of the completion of the view change by sending
         /// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
         /// op number, and k is the commit number.
-        ///
-        /// For each DVC in the quorum:
-        ///
-        /// * The headers must all belong to the same hash chain. (Gaps are allowed).
-        ///   (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
-        ///   loaded into the new primary with `replace_header()`, not `repair_header()`).
-        ///
-        /// Across all DVCs in the quorum:
-        ///
-        /// * The headers of every DVC with the same view_normal must agree. In other words:
-        ///   dvc₁.headers[i].op == dvc₂.headers[j].op implies
-        ///   dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
-        ///   (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
-        ///   loaded into the new primary with `replace_header()`, not `repair_header()`).
-        ///
-        /// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
-        /// (e.g. the write is still queued). The header is either:
-        ///
-        /// * committed — so another replica in the quorum must have a copy, according to the quorum
-        ///   intersection property. Or,
-        /// * uncommitted — if the header is chosen, but cannot be recovered from any replica, then
-        ///   it will be discarded by the nack protocol.
         fn on_do_view_change(self: *Self, message: *Message) void {
             if (self.ignore_view_change_message(message)) return;

@@ -1255,6 +1209,7 @@ pub fn ReplicaType(

             assert(count == threshold);
             assert(self.do_view_change_from_all_replicas[self.replica] != null);
+            DVCQuorum.verify(self.do_view_change_from_all_replicas);
             log.debug("{}: on_do_view_change: view={} quorum received", .{
                 self.replica,
                 self.view,
@@ -1265,6 +1220,13 @@ pub fn ReplicaType(
             self.do_view_change_quorum = true;

             self.primary_set_log_from_do_view_change_messages();
+            // We aren't status=normal yet, but our headers from our prior log_view may have been
+            // replaced. If we participate in another DVC (before reaching status=normal, which
+            // would update our log_view), we must disambiguate our (new) headers from the
+            // headers of any other replica with the same log_view so that the next primary can
+            // identify an unambiguous set of canonical headers.
+            self.log_view = self.view;
+
             assert(self.op >= self.commit_max);
             assert(self.state_machine.prepare_timestamp >=
                 self.journal.header_with_op(self.op).?.timestamp);
@@ -1295,7 +1257,9 @@ pub fn ReplicaType(
                 unreachable;
             }

-            assert(self.status == .view_change or
+            assert(self.status == .view_change or
+                self.status == .normal or
+                self.status == .recovering_head);
             assert(message.header.view >= self.view);
             assert(message.header.replica != self.replica);
             assert(message.header.replica == self.primary_index(message.header.view));
@@ -1307,13 +1271,23 @@ pub fn ReplicaType(
             assert(message.header.op == op_highest(message_body_as_headers(message)));

             self.set_op_and_commit_max(message.header.op, message.header.commit, "on_start_view");
-
+            for (message_body_as_headers_chain_consecutive(message)) |*header| {
+                self.replace_header(header);
+            }

             assert(self.op == message.header.op);

-
-
-
+            switch (self.status) {
+                .normal => {},
+                .view_change => {
+                    self.transition_to_normal_from_view_change_status(message.header.view);
+                    self.send_prepare_oks_after_view_change();
+                },
+                .recovering_head => {
+                    self.transition_to_normal_from_recovering_status();
+                    self.send_prepare_oks_after_view_change();
+                },
+                .recovering => unreachable,
             }

             assert(self.status == .normal);
@@ -1329,6 +1303,7 @@ pub fn ReplicaType(
             if (self.ignore_repair_message(message)) return;

             assert(self.status == .normal);
+            assert(self.view == self.log_view);
             assert(message.header.view == self.view);
             assert(message.header.replica != self.replica);
             assert(self.primary());
@@ -1345,391 +1320,90 @@ pub fn ReplicaType(
             self.send_message_to_replica(message.header.replica, start_view);
         }

-
-
-
-
-
-
-
-
-
-
-
-        }
-
-            const response = self.message_bus.get_message();
-            defer self.message_bus.unref(response);
+        /// If the requested prepare has been guaranteed by this replica:
+        /// * Read the prepare from storage, and forward it to the replica that requested it.
+        /// * Otherwise send no reply — it isn't safe to nack.
+        /// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
+        ///
+        /// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
+        /// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
+        /// prepare. If a guaranteed prepare is found to by faulty, the replica must repair it
+        /// to restore durability.
+        fn on_request_prepare(self: *Self, message: *const Message) void {
+            if (self.ignore_repair_message(message)) return;

-
-
-
-
-                self.commit_max,
-                message.header.context,
-            });
+            assert(self.replica_count > 1);
+            assert(self.status == .normal or self.status == .view_change);
+            assert(message.header.view == self.view);
+            assert(message.header.replica != self.replica);

-
-
-
-
-
-
-                .op = self.op,
-                .commit = self.commit_max,
+            const op = message.header.op;
+            const slot = self.journal.slot_for_op(op);
+            const checksum: ?u128 = switch (message.header.timestamp) {
+                0 => null,
+                1 => message.header.context,
+                else => unreachable,
             };

-            //
-
-            //
-            //   replica_count                 3
-            //   do_view_change.headers.len    3 (= pipeline_max)
-            //   recovery_response.headers.len 2 (!)
-            //   replica 0 log 3, 4a, 5a, 6a, 7a, 8a (status=normal, primary)
-            //   replica 1 log 3, 4a, 5a, --, --, -- (status=normal, backup)
-            //   replica 2 log 3, 4b, 5b, --, --, -- (status=recovering)
-            //
-            // 1. Replica 2 receives a recovery_response quorum.
-            // 2. Replica 2 sets `replica.op` to 8a.
-            // 3. Replica 2 sets its headers from the primary's recovery_response (8a, 7a)
-            //    (via `replace_header()`).
-            // 4. Replica 2 transitions to status=normal.
-            // 5. Replica 0 fails (before replica 2 has a chance to repair its hash chain.)
-            // 6. Replica 1 initiates a view change.
-            // 7. Replica 1 collects a DVC quorum:
-            //    replica 1: 3, 4a, 5a (view_normal=latest)
-            //    replica 2: 5b, 7a, 8a (view_normal=latest)
-            //    Replicas 1 and 2 share the highest view_normal, so both sets of headers are canonical.
-            // 8. Replica 1 loads the canonical headers (via `replace_header()`) from both DVCs.
-            //    Messages 8a and 7a will be dropped via `do_view_change_op_max()` (due to the
-            //    gap at op 6). But there is a conflict at op=5. For correctness, replica 1 must
-            //    pick 5a — 5a may be committed by replica 0.
-            //    Without replica 0's assistance, replica 1 has no way to pick between 5a/5b.
-            //
-            // Including at least as many headers in the recovery response as the DVC maintains the
-            // invariant: DVCs with the same view_normal must never disagree on the identity of a
-            // message.
-            //
-            // (DVCs can still safely include gaps — but they must be of the form [4a,__,6a],
-            // not [4a,__,6b]).
-            const count = self.copy_latest_headers_and_set_size(
-                0,
-                self.op,
-                view_change_headers_count,
-                response,
-            );
-            assert(count > 0); // We expect that self.op always exists.
-            assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
-
-            response.header.set_checksum_body(response.body());
-            response.header.set_checksum();
-
-            assert(self.status == .normal);
-            // The checksum for a recovery message is deterministic, and cannot be used as a nonce:
-            assert(response.header.context != message.header.checksum);
-
-            self.send_message_to_replica(message.header.replica, response);
-        }
-
-        fn on_recovery_response(self: *Self, message: *Message) void {
-            assert(self.replica_count > 1);
+            // Only the primary may respond to `request_prepare` messages without a checksum.
+            assert(checksum != null or self.primary_index(self.view) == self.replica);

-
-
+            // Try to serve the message directly from the pipeline.
+            // This saves us from going to disk. And we don't need to worry that the WAL's copy
+            // of an uncommitted prepare is lost/corrupted.
+            if (self.pipeline_prepare_by_op_and_checksum(op, checksum)) |prepare| {
+                log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
                     self.replica,
-
+                    op,
+                    checksum,
                 });
+                self.send_message_to_replica(message.header.replica, prepare);
                 return;
             }

-            if (
-
-
-
-
-
-
-                return;
-            }
-
-            var responses: *QuorumMessages = &self.recovery_response_from_other_replicas;
-            if (responses[message.header.replica]) |existing| {
-                assert(message.header.replica == existing.header.replica);
-
-                if (message.header.checksum == existing.header.checksum) {
-                    // The response was replayed by the network; ignore it.
-                    log.debug("{}: on_recovery_response: ignoring (duplicate message)", .{
+            if (self.journal.prepare_inhabited[slot.index]) {
+                const prepare_checksum = self.journal.prepare_checksums[slot.index];
+                // Consult `journal.prepare_checksums` (rather than `journal.headers`):
+                // the former may have the prepare we want — even if journal recovery marked the
+                // slot as faulty and left the in-memory header as reserved.
+                if (checksum == null or checksum.? == prepare_checksum) {
+                    log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
                         self.replica,
+                        op,
+                        checksum,
                     });
-                    return;
-                }

-
-
-
-
-
-
-
-
-
-
-
-
-                        existing.header.op,
-                        message.header.op,
-                        existing.header.commit,
-                        message.header.commit,
-                    },
-                );
+                    // Improve availability by calling `read_prepare_with_op_and_checksum` instead
+                    // of `read_prepare` — even if `journal.headers` contains the target message.
+                    // The latter skips the read when the target prepare is present but dirty (e.g.
+                    // it was recovered with decision=fix).
+                    // TODO Do not reissue the read if we are already reading in order to send to
+                    // this particular destination replica.
+                    self.journal.read_prepare_with_op_and_checksum(
+                        on_request_prepare_read,
+                        op,
+                        prepare_checksum,
+                        message.header.replica,
+                    );

-
-
-                    message.header.op < existing.header.op) or
-                    (message.header.view == existing.header.view and
-                    message.header.op == existing.header.op and
-                    message.header.commit < existing.header.commit))
-                {
-                    // The second message is older than the first one (reordered packets).
-                    log.debug("{}: on_recovery_response: ignoring (older)", .{self.replica});
+                    // We have guaranteed the prepare (not safe to nack).
+                    // Our copy may or may not be valid, but we will try to read & forward it.
                     return;
                 }
+            }

-
-
-
-
-
-
-
-
-
-
-
-
-                    .{
-                        self.replica,
-                        message.header.replica,
-                        message.header.view,
-                        message.header.op,
-                        message.header.commit,
-                    },
-                );
-            }
-
-            assert(responses[message.header.replica] == null);
-            responses[message.header.replica] = message.ref();
-
-            // Wait until we have:
-            // * at least `f + 1` messages for quorum (not including ourself), and
-            // * a response from the primary of the highest discovered view.
-            const count = self.count_quorum(responses, .recovery_response, self.recovery_nonce);
-            assert(count <= self.replica_count - 1);
-
-            const threshold = self.quorum_view_change;
-            if (count < threshold) {
-                log.debug("{}: on_recovery_response: waiting for quorum ({}/{})", .{
-                    self.replica,
-                    count,
-                    threshold,
-                });
-                return;
-            }
-
-            const view = blk: { // The latest known view.
-                var view: u32 = 0;
-                for (self.recovery_response_from_other_replicas) |received, replica| {
-                    if (received) |response| {
-                        assert(replica != self.replica);
-                        assert(response.header.replica == replica);
-                        assert(response.header.context == self.recovery_nonce);
-
-                        view = std.math.max(view, response.header.view);
-                    }
-                }
-                break :blk view;
-            };
-
-            const primary_response = responses[self.primary_index(view)];
-            if (primary_response == null) {
-                log.debug(
-                    "{}: on_recovery_response: ignoring (awaiting response from primary of view={})",
-                    .{
-                        self.replica,
-                        view,
-                    },
-                );
-                return;
-            }
-
-            if (primary_response.?.header.view != view) {
-                // The primary (according to the view quorum) isn't the primary (according to itself).
-                // The `recovery_timeout` will retry shortly with another round.
-                log.debug(
-                    "{}: on_recovery_response: ignoring (primary view={} != quorum view={})",
-                    .{
-                        self.replica,
-                        primary_response.?.header.view,
-                        view,
-                    },
-                );
-                return;
-            }
-
-            // This recovering→normal status transition occurs exactly once.
-            // All further `recovery_response` messages are ignored.
-
-            // TODO When the view is recovered from the superblock (instead of via the VSR recovery
-            // protocol), if the view number indicates that this replica is a primary, it must
-            // transition to status=view_change instead of status=normal.
-
-            const primary_headers = message_body_as_headers(primary_response.?);
-            assert(primary_headers.len > 0);
-
-            const commit = primary_response.?.header.commit;
-            {
-                const op = op_highest(primary_headers);
-                assert(op == primary_response.?.header.op);
-
-                self.set_op_and_commit_max(op, commit, "on_recovery_response");
-
-                // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
-                // problems. We don't want to jump this far ahead to repair, but we still need to
-                // use the hash chain to figure out which headers to request. Maybe include our
-                // `op_checkpoint` in the recovery (request) message so that the response can give
-                // more useful (i.e. older) headers.
-                self.replace_headers(primary_headers);
-
-                if (self.op < constants.journal_slot_count) {
-                    if (self.journal.header_with_op(0)) |header| {
-                        assert(header.command == .prepare);
-                        assert(header.operation == .root);
-                    } else {
-                        // This is the first wrap of the log, and the root prepare is corrupt.
-                        // Repair the root repair. This is necessary to maintain the invariant that
-                        // the op=commit_min exists in-memory.
-                        //
-                        // op=0 wouldn't have been repaired by replace_headers above, because it is
-                        // already "checkpointed".
-                        const header = Header.root_prepare(self.cluster);
-                        self.journal.set_header_as_dirty(&header);
-                        log.debug("{}: on_recovery_response: repair root op", .{self.replica});
-                    }
-                }
-
-                assert(self.op == op);
-                assert(self.journal.header_with_op(self.op) != null);
-            }
-
-            assert(self.status == .recovering);
-            self.transition_to_normal_from_recovering_status(view);
-            assert(self.status == .normal);
-            assert(self.backup());
-
-            log.info("{}: on_recovery_response: recovery done responses={} view={} headers={}..{}" ++
-                " commit={} dirty={} faulty={}", .{
-                self.replica,
-                count,
-                view,
-                primary_headers[primary_headers.len - 1].op,
-                primary_headers[0].op,
-                commit,
-                self.journal.dirty.count,
-                self.journal.faulty.count,
-            });
-
-            self.state_machine.prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
-            // `state_machine.commit_timestamp` is updated as messages are committed.
-
-            self.reset_quorum_recovery_response();
-            self.commit_journal(commit);
-            self.repair();
-        }
-
-        /// If the requested prepare has been guaranteed by this replica:
-        /// * Read the prepare from storage, and forward it to the replica that requested it.
-        /// * Otherwise send no reply — it isn't safe to nack.
-        /// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
-        ///
-        /// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
-        /// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
-        /// prepare. If a guaranteed prepare is found to by faulty, the replica must repair it
-        /// to restore durability.
-        fn on_request_prepare(self: *Self, message: *const Message) void {
-            if (self.ignore_repair_message(message)) return;
-
-            assert(self.replica_count > 1);
-            assert(self.status == .normal or self.status == .view_change);
-            assert(message.header.view == self.view);
-            assert(message.header.replica != self.replica);
-
-            const op = message.header.op;
-            const slot = self.journal.slot_for_op(op);
-            const checksum: ?u128 = switch (message.header.timestamp) {
-                0 => null,
-                1 => message.header.context,
-                else => unreachable,
-            };
-
-            // Only the primary may respond to `request_prepare` messages without a checksum.
-            assert(checksum != null or self.primary_index(self.view) == self.replica);
-
-            // Try to serve the message directly from the pipeline.
-            // This saves us from going to disk. And we don't need to worry that the WAL's copy
-            // of an uncommitted prepare is lost/corrupted.
-            if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
-                log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
-                    self.replica,
-                    op,
-                    checksum,
-                });
-                self.send_message_to_replica(message.header.replica, prepare.message);
-                return;
-            }
-
-            if (self.journal.prepare_inhabited[slot.index]) {
-                const prepare_checksum = self.journal.prepare_checksums[slot.index];
-                // Consult `journal.prepare_checksums` (rather than `journal.headers`):
-                // the former may have the prepare we want — even if journal recovery marked the
-                // slot as faulty and left the in-memory header as reserved.
-                if (checksum == null or checksum.? == prepare_checksum) {
-                    log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
-                        self.replica,
-                        op,
-                        checksum,
-                    });
-
-                    // Improve availability by calling `read_prepare_with_op_and_checksum` instead
-                    // of `read_prepare` — even if `journal.headers` contains the target message.
-                    // The latter skips the read when the target prepare is present but dirty (e.g.
-                    // it was recovered with decision=fix).
-                    // TODO Do not reissue the read if we are already reading in order to send to
-                    // this particular destination replica.
-                    self.journal.read_prepare_with_op_and_checksum(
-                        on_request_prepare_read,
-                        op,
-                        prepare_checksum,
-                        message.header.replica,
-                    );
-
-                    // We have guaranteed the prepare (not safe to nack).
-                    // Our copy may or may not be valid, but we will try to read & forward it.
-                    return;
-                }
-            }
-
-            {
-                // We may have guaranteed the prepare but our copy is faulty (not safe to nack).
-                if (self.journal.faulty.bit(slot)) return;
-                if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
-                    if (self.journal.dirty.bit(slot)) {
-                        // We know of the prepare but have yet to write it (safe to nack).
-                        // Continue through below...
-                    } else {
-                        // We have guaranteed the prepare and our copy is clean (not safe to nack).
-                        return;
-                    }
-                }
+            {
+                // We may have guaranteed the prepare but our copy is faulty (not safe to nack).
+                if (self.journal.faulty.bit(slot)) return;
+                if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
+                    if (self.journal.dirty.bit(slot)) {
+                        // We know of the prepare but have yet to write it (safe to nack).
+                        // Continue through below...
+                    } else {
+                        // We have guaranteed the prepare and our copy is clean (not safe to nack).
+                        return;
+                    }
+                }
             }

             // Protocol-Aware Recovery's CTRL protocol only runs during the view change, when the
@@ -1970,8 +1644,9 @@ pub fn ReplicaType(
             assert(self.status == .normal);
             assert(self.primary());

-            const prepare = self.pipeline.head_ptr().?;
+            const prepare = self.pipeline.queue.prepare_queue.head_ptr().?;
             assert(prepare.message.header.command == .prepare);
+            assert(prepare.message.header.op == self.commit_min + 1);

             if (prepare.ok_quorum_received) {
                 self.prepare_timeout.reset();
@@ -2017,10 +1692,10 @@ pub fn ReplicaType(
             // We may be slow and waiting for the write to complete.
             //
             // We may even have maxed out our IO depth and been unable to initiate the write,
-            // which can happen if `constants.
-            // This can lead to deadlock for a cluster of
-            // since there is no other way for the primary
-            // other replica has it.
+            // which can happen if `constants.pipeline_prepare_queue_max` exceeds
+            // `constants.journal_iops_write_max`. This can lead to deadlock for a cluster of
+            // one or two (if we do not retry here), since there is no other way for the primary
+            // to repair the dirty op because no other replica has it.
             //
             // Retry the write through `on_repair()` which will work out which is which.
             // We do expect that the op would have been run through `on_prepare()` already.
@@ -2107,13 +1782,6 @@ pub fn ReplicaType(
             self.repair();
         }

-        fn on_recovery_timeout(self: *Self) void {
-            assert(self.status == .recovering);
-            assert(self.replica_count > 1);
-            self.recovery_timeout.reset();
-            self.recover();
-        }
-
         fn reference_message_and_receive_quorum_exactly_once(
             self: *Self,
             messages: *QuorumMessages,
@@ -2301,7 +1969,7 @@ pub fn ReplicaType(
|
|
|
2301
1969
|
assert(message.header.view == self.view);
|
|
2302
1970
|
assert(message.header.op == self.op);
|
|
2303
1971
|
|
|
2304
|
-
if (self.replica_count == 1 and self.pipeline.count > 1) {
|
|
1972
|
+
if (self.replica_count == 1 and self.pipeline.queue.prepare_queue.count > 1) {
|
|
2305
1973
|
// In a cluster-of-one, the prepares must always be written to the WAL sequentially
|
|
2306
1974
|
// (never concurrently). This ensures that there will be no gaps in the WAL during
|
|
2307
1975
|
// crash recovery.
|
|
@@ -2364,10 +2032,9 @@ pub fn ReplicaType(
|
|
|
2364
2032
|
/// A function which calls `commit_journal()` to set `commit_max` must first call
|
|
2365
2033
|
/// `view_jump()`. Otherwise, we may fork the log.
|
|
2366
2034
|
fn commit_journal(self: *Self, commit: u64) void {
|
|
2367
|
-
// TODO Restrict `view_change` status only to the primary purely as defense-in-depth.
|
|
2368
|
-
// Be careful of concurrency when doing this, as successive view changes can happen quickly.
|
|
2369
2035
|
assert(self.status == .normal or self.status == .view_change or
|
|
2370
2036
|
(self.status == .recovering and self.replica_count == 1));
|
|
2037
|
+
assert(!(self.status == .normal and self.primary()));
|
|
2371
2038
|
assert(self.commit_min <= self.commit_max);
|
|
2372
2039
|
assert(self.commit_min <= self.op);
|
|
2373
2040
|
assert(self.commit_max <= self.op or self.commit_max > self.op);
|
|
@@ -2392,6 +2059,7 @@ pub fn ReplicaType(
|
|
|
2392
2059
|
log.debug("{}: commit_journal: already committing...", .{self.replica});
|
|
2393
2060
|
return;
|
|
2394
2061
|
}
|
|
2062
|
+
assert(!(self.status == .normal and self.primary()));
|
|
2395
2063
|
|
|
2396
2064
|
// We check the hash chain before we read each op, rather than once upfront, because
|
|
2397
2065
|
// it's possible for `commit_max` to change while we read asynchronously, after we
|
|
@@ -2413,6 +2081,8 @@ pub fn ReplicaType(
|
|
|
2413
2081
|
assert(self.committing);
|
|
2414
2082
|
assert(self.status == .normal or self.status == .view_change or
|
|
2415
2083
|
(self.status == .recovering and self.replica_count == 1));
|
|
2084
|
+
assert(!(self.status == .normal and self.primary()));
|
|
2085
|
+
assert(self.pipeline == .cache);
|
|
2416
2086
|
assert(self.commit_min <= self.commit_max);
|
|
2417
2087
|
assert(self.commit_min <= self.op);
|
|
2418
2088
|
|
|
@@ -2427,8 +2097,23 @@ pub fn ReplicaType(
  // Even a naive state transfer may fail to correct for this.
  if (self.commit_min < self.commit_max and self.commit_min < self.op) {
  const op = self.commit_min + 1;
- const
-
+ const header = self.journal.header_with_op(op).?;
+
+ if (self.pipeline.cache.prepare_by_op_and_checksum(op, header.checksum)) |prepare| {
+ log.debug("{}: commit_journal_next: cached prepare op={} checksum={}", .{
+ self.replica,
+ op,
+ header.checksum,
+ });
+ self.commit_journal_next_callback(prepare, null);
+ } else {
+ self.journal.read_prepare(
+ commit_journal_next_callback,
+ op,
+ header.checksum,
+ null,
+ );
+ }
  } else {
  self.commit_ops_done();
  // This is an optimization to expedite the view change before the `repair_timeout`:
@@ -2438,7 +2123,7 @@ pub fn ReplicaType(
  assert(self.replica_count == 1);
  assert(self.commit_min == self.commit_max);
  assert(self.commit_min == self.op);
- self.transition_to_normal_from_recovering_status(
+ self.transition_to_normal_from_recovering_status();
  } else {
  // We expect that a cluster-of-one only calls commit_journal() in recovering status.
  assert(self.replica_count > 1);
@@ -2457,14 +2142,6 @@ pub fn ReplicaType(
  return;
  }

- const slot = self.journal.slot_with_op_and_checksum(
- prepare.?.header.op,
- prepare.?.header.checksum,
- ).?;
- assert(self.journal.prepare_inhabited[slot.index]);
- assert(self.journal.prepare_checksums[slot.index] == prepare.?.header.checksum);
- assert(self.journal.has(prepare.?.header));
-
  switch (self.status) {
  .normal => {},
  .view_change => {
@@ -2484,6 +2161,7 @@ pub fn ReplicaType(
  assert(self.replica_count == 1);
  assert(self.primary_index(self.view) == self.replica);
  },
+ .recovering_head => unreachable,
  }

  const op = self.commit_min + 1;
@@ -2497,7 +2175,15 @@ pub fn ReplicaType(
  assert(self.commit_min <= self.commit_max);
  assert(self.commit_min <= self.op);

- self.
+ if (self.status == .normal and self.primary()) {
+ if (self.pipeline.queue.prepare_queue.empty()) {
+ self.commit_ops_done();
+ } else {
+ self.commit_pipeline_next();
+ }
+ } else {
+ self.commit_journal_next();
+ }
  }

  /// Begin the commit path that is common between `commit_pipeline` and `commit_journal`:
@@ -2551,8 +2237,14 @@ pub fn ReplicaType(
  assert(self.commit_min <= self.commit_max);

  if (self.status == .normal and self.primary()) {
- const prepare = self.pipeline.
+ const prepare = self.pipeline.queue.pop_prepare().?;
+ if (self.pipeline.queue.pop_request()) |request| {
+ // Start preparing the next request in the queue (if any).
+ self.primary_pipeline_prepare(request);
+ }
+
  assert(self.commit_min == self.commit_max);
+ assert(prepare.message.header.command == .prepare);
  assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
  assert(prepare.message.header.op == self.commit_min);
  assert(prepare.message.header.op == self.commit_max);
@@ -2560,7 +2252,7 @@ pub fn ReplicaType(

  self.message_bus.unref(prepare.message);

- if (self.pipeline.head_ptr()) |next| {
+ if (self.pipeline.queue.prepare_queue.head_ptr()) |next| {
  assert(next.message.header.op == self.commit_min + 1);
  assert(next.message.header.op == self.commit_prepare.?.header.op + 1);

@@ -2588,8 +2280,8 @@ pub fn ReplicaType(
  const self = @fieldParentPtr(Self, "state_machine", state_machine);
  assert(self.committing);
  assert(self.commit_callback != null);
- assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
- assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+ assert(self.op_checkpoint() == self.superblock.staging.vsr_state.commit_min);
+ assert(self.op_checkpoint() == self.superblock.working.vsr_state.commit_min);

  const op = self.commit_prepare.?.header.op;
  assert(op == self.commit_min);
@@ -2604,7 +2296,7 @@ pub fn ReplicaType(
  "(op={} current_checkpoint={} next_checkpoint={})", .{
  self.replica,
  self.op,
- self.op_checkpoint,
+ self.op_checkpoint(),
  self.op_checkpoint_next(),
  });
  tracer.start(
@@ -2638,19 +2330,15 @@ pub fn ReplicaType(
  // Therefore, only ops "A..D" are committed to disk.
  // Thus, the SuperBlock's `commit_min` is set to 7-2=5.
  const vsr_state_commit_min = self.op_checkpoint_next();
- const vsr_state_new = .{
- .commit_min_checksum = self.journal.header_with_op(vsr_state_commit_min).?.checksum,
- .commit_min = vsr_state_commit_min,
- .commit_max = self.commit_max,
- .view_normal = self.view_normal,
- .view = self.view,
- };
- assert(self.superblock.working.vsr_state.monotonic(vsr_state_new));

  self.superblock.checkpoint(
  commit_op_checkpoint_superblock_callback,
  &self.superblock_context,
-
+ .{
+ .commit_min_checksum = self.journal.header_with_op(vsr_state_commit_min).?.checksum,
+ .commit_min = vsr_state_commit_min,
+ .commit_max = self.commit_max,
+ },
  );
  }

@@ -2661,15 +2349,14 @@ pub fn ReplicaType(
  assert(self.commit_prepare.?.header.op == self.op);
  assert(self.commit_prepare.?.header.op == self.commit_min);

- self.op_checkpoint
- assert(self.op_checkpoint == self.commit_min
- assert(self.op_checkpoint == self.superblock.
- assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+ assert(self.op_checkpoint() == self.commit_min - constants.lsm_batch_multiple);
+ assert(self.op_checkpoint() == self.superblock.staging.vsr_state.commit_min);
+ assert(self.op_checkpoint() == self.superblock.working.vsr_state.commit_min);

  log.debug("{}: commit_op_compact_callback: checkpoint done (op={} new_checkpoint={})", .{
  self.replica,
  self.op,
- self.op_checkpoint,
+ self.op_checkpoint(),
  });
  tracer.end(
  &self.tracer_slot_checkpoint,
@@ -2720,7 +2407,7 @@ pub fn ReplicaType(
  // this commit.

  assert(self.journal.has(prepare.header));
- if (self.op_checkpoint == self.commit_min) {
+ if (self.op_checkpoint() == self.commit_min) {
  // op_checkpoint's slot may have been overwritten in the WAL — but we can
  // always use the VSRState to anchor the hash chain.
  assert(prepare.header.parent ==
@@ -2752,6 +2439,7 @@ pub fn ReplicaType(
  const reply_body_size = @intCast(u32, self.state_machine.commit(
  prepare.header.client,
  prepare.header.op,
+ prepare.header.timestamp,
  prepare.header.operation.cast(StateMachine),
  prepare.buffer[@sizeOf(Header)..prepare.header.size],
  reply.buffer[@sizeOf(Header)..],
@@ -2788,7 +2476,7 @@ pub fn ReplicaType(
  if (self.superblock.working.vsr_state.op_compacted(prepare.header.op)) {
  // We are recovering from a checkpoint. Prior to the crash, the client table was
  // updated with entries for one bar beyond the op_checkpoint.
- assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+ assert(self.op_checkpoint() == self.superblock.working.vsr_state.commit_min);
  if (self.client_table().get(prepare.header.client)) |entry| {
  assert(entry.reply.header.command == .reply);
  assert(entry.reply.header.op >= prepare.header.op);
@@ -2799,7 +2487,7 @@ pub fn ReplicaType(
  log.debug("{}: commit_op: skip client table update: prepare.op={} checkpoint={}", .{
  self.replica,
  prepare.header.op,
- self.op_checkpoint,
+ self.op_checkpoint(),
  });
  } else {
  if (reply.header.operation == .register) {
@@ -2821,7 +2509,7 @@ pub fn ReplicaType(
  fn commit_pipeline(self: *Self) void {
  assert(self.status == .normal);
  assert(self.primary());
- assert(self.pipeline.count > 0);
+ assert(self.pipeline.queue.prepare_queue.count > 0);

  // Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
  if (self.committing) {
@@ -2838,10 +2526,10 @@ pub fn ReplicaType(
  assert(self.status == .normal);
  assert(self.primary());

- if (self.pipeline.head_ptr()) |prepare| {
+ if (self.pipeline.queue.prepare_queue.head_ptr()) |prepare| {
  assert(self.commit_min == self.commit_max);
  assert(self.commit_min + 1 == prepare.message.header.op);
- assert(self.commit_min + self.pipeline.count == self.op);
+ assert(self.commit_min + self.pipeline.queue.prepare_queue.count == self.op);
  assert(self.journal.has(prepare.message.header));

  if (!prepare.ok_quorum_received) {
@@ -2867,9 +2555,6 @@ pub fn ReplicaType(
  assert(self.commit_min <= self.op);

  if (self.status == .normal and self.primary()) {
- if (self.pipeline.head_ptr()) |pipeline_head| {
- assert(pipeline_head.message.header.op == self.commit_min + 1);
- }
  self.commit_pipeline_next();
  } else {
  self.commit_ops_done();
@@ -2890,10 +2575,7 @@ pub fn ReplicaType(
  ) usize {
  assert(op_max >= op_min);
  assert(count_max == null or count_max.? > 0);
- assert(message.header.command == .
- message.header.command == .start_view or
- message.header.command == .headers or
- message.header.command == .recovery_response);
+ assert(message.header.command == .headers);

  const body_size_max = @sizeOf(Header) * std.math.min(
  @divExact(message.buffer.len - @sizeOf(Header), @sizeOf(Header)),
@@ -2934,7 +2616,6 @@ pub fn ReplicaType(
  assert(m.header.replica == replica);
  switch (command) {
  .do_view_change => assert(m.header.view == self.view),
- .recovery_response => assert(m.header.replica != self.replica),
  else => unreachable,
  }
  count += 1;
@@ -3021,17 +2702,30 @@ pub fn ReplicaType(
  assert(self.client_table().count() <= constants.clients_max);
  }

+ /// Construct a SV/DVC message, including attached headers from the current log_view.
+ ///
  /// The caller owns the returned message, if any, which has exactly 1 reference.
  fn create_view_change_message(self: *Self, command: Command) *Message {
- assert(command == .do_view_change or command == .start_view);
-
  // We may send a start_view message in normal status to resolve a backup's view jump:
  assert(self.status == .normal or self.status == .view_change);
+ assert((self.status == .normal) == (command == .start_view));
+ assert((self.status == .view_change) == (command == .do_view_change));
+ assert(self.view >= self.log_view);
+ assert(self.view >= self.view_durable());
+ assert(self.log_view >= self.log_view_durable());
+
+ assert(command != .do_view_change or self.log_view < self.view);
+ assert(command != .start_view or self.log_view == self.view);

  const message = self.message_bus.get_message();
  defer self.message_bus.unref(message);

+ const headers = self.create_view_change_headers();
+ assert(headers.len > 0);
+ assert(headers.get(0).op == self.op);
+
  message.header.* = .{
+ .size = @intCast(u32, @sizeOf(Header) * (1 + headers.len)),
  .command = command,
  .cluster = self.cluster,
  .replica = self.replica,
@@ -3040,33 +2734,167 @@ pub fn ReplicaType(
  // number contained in the prepare headers we include in the body. The former shows
  // how recent a view change the replica participated in, which may be much higher.
  // We use the `timestamp` field to send this in addition to the current view number:
- .timestamp = if (command == .do_view_change) self.
+ .timestamp = if (command == .do_view_change) self.log_view else 0,
  .op = self.op,
  // See the comment in `on_do_view_change()` for why `commit_min` is crucial:
  .commit = if (command == .do_view_change) self.commit_min else self.commit_max,
  };

-
-
-
-
-
+ stdx.copy_disjoint(
+ .exact,
+ Header,
+ std.mem.bytesAsSlice(Header, message.body()),
+ headers.constSlice(),
  );
- assert(count > 0); // We expect that self.op always exists.
- assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);
-
  message.header.set_checksum_body(message.body());
  message.header.set_checksum();

  return message.ref();
  }

+ fn create_view_change_headers(self: *const Self) vsr.ViewChangeHeaders.BoundedArray {
+ assert(self.status == .normal or self.status == .view_change);
+ assert(self.view >= self.log_view);
+ assert(self.view >= self.view_durable());
+ assert(self.log_view >= self.log_view_durable());
+
+ var headers = vsr.ViewChangeHeaders.BoundedArray{ .buffer = undefined };
+
+ // Always include the head message.
+ headers.appendAssumeCapacity(self.journal.header_with_op(self.op).?.*);
+
+ if (self.view == self.log_view) {
+ // Construct SV message headers. (On the backup, these are only stored in the
+ // superblock).
+ if (self.primary_index(self.view) == self.replica and self.status == .normal) {
+ assert(self.op >= self.commit_max);
+
+ // The primary starting a new view has a pristine log suffix.
+ //
+ // +1 because commit_min may have been overwritten (and not repaired) if it
+ // falls on a checkpoint boundary.
+ var op = self.op;
+ while (op > self.commit_min + 1) : (op -= 1) {
+ const header_next = self.journal.header_with_op(op).?;
+ const header_prev = self.journal.header_with_op(op - 1).?;
+ assert(header_prev.checksum == header_next.parent);
+
+ headers.append(header_prev.*) catch break;
+ }
+ } else {
+ // Either:
+ // - The primary started a new view but has not finished repair.
+ // - The backup joining a new view has a pristine log suffix — it just
+ // loaded a SV.
+ //
+ // In each case we send as much of a suffix as is available (fallthrough).
+ }
+ } else {
+ // Construct DVC message headers.
+ assert(self.view > self.log_view);
+
+ if (self.log_view_durable() == self.log_view) {
+ const headers_durable = self.superblock.working.vsr_headers().slice;
+ assert(headers_durable[0].op <= self.op);
+
+ if (self.log_view_durable() < self.view_durable()) {
+ // Ensure that if we started a DVC before a crash, that we will resume
+ // sending the exact same DVC after recovery.
+ // (An alternative implementation would be to load the superblock's DVC
+ // headers (including gaps) into the journal during open(), but that is more
+ // complicated to implement correctly).
+ assert(headers_durable[0].op == self.op);
+ assert(headers_durable[0].checksum == headers.get(0).checksum);
+
+ for (headers_durable[1..]) |*header| headers.appendAssumeCapacity(header.*);
+ } else {
+ // Durable SV anchor. See Example 4.
+ assert(self.log_view_durable() == self.view_durable());
+
+ var op = self.op;
+ while (op > headers_durable[headers_durable.len - 1].op) : (op -= 1) {
+ const header_prev = self.journal.header_with_op(op - 1) orelse continue;
+ const header_next = self.journal.header_with_op(op);
+ assert(header_next == null or header_prev.checksum == header_next.?.parent);
+
+ headers.append(header_prev.*) catch break;
+ }
+ }
+ return headers;
+ }
+
+ // The DVC anchor: Within the log suffix following the anchor, we have additional
+ // guarantees about the state of the log headers which allow us to tolerate certain
+ // gaps (by locally guaranteeing that the gap does not hide a break).
+ // See Example 2/3 for more detail.
+ const op_dvc_anchor = std.math.max(
+ self.commit_min,
+ // +1: We can have a full pipeline, but not yet have performed any repair.
+ // In such a case, we want to send those pipeline_prepare_queue_max headers in
+ // the DVC, but not the preceding op (which may belong to a different chain).
+ // This satisfies the DVC invariant because the first op in the pipeline is
+ // "connected" to the canonical chain (via its "parent" checksum).
+ //
+ // For example, as a follower, we might have received pipeline_prepare_queue_max
+ // headers in the SV message, but not done any repair before the next view
+ // change.
+ 1 + self.op -| constants.pipeline_prepare_queue_max,
+ );
+
+ if (self.primary_index(self.log_view) == self.replica) {
+ // Retired primary: see Example 2a.
+ var op = self.op;
+ while (op > op_dvc_anchor) : (op -= 1) {
+ const header_next = self.journal.header_with_op(op).?;
+ // Exclude gaps since we cannot distinguish the gap from a break.
+ const header_prev = self.journal.header_with_op(op - 1) orelse break;
+ if (header_prev.checksum != header_next.parent) break;
+
+ headers.append(header_prev.*) catch break;
+ }
+ } else {
+ // Retired backup: see Example 2b.
+ var op = self.op;
+ while (op > self.commit_min) : (op -= 1) {
+ const header_prev = self.journal.header_with_op(op - 1) orelse continue;
+ const header_next = self.journal.header_with_op(op);
+ assert(header_next == null or header_prev.checksum == header_next.?.parent);
+
+ headers.append(header_prev.*) catch break;
+
+ // Stop once we connect to the anchor.
+ if (header_prev.op <= op_dvc_anchor + 1) break;
+ } else {
+ assert(self.commit_min == self.op);
+ }
+ }
+ }
+
+ // Include as many extra headers as possible, but with no additional gaps (since they
+ // cannot be differentiated from breaks).
+ // - This reduces the number of headers that the new primary will need to repair.
+ // - More importantly, this ensures that a replica which re-sends its DVC does not
+ // alter the DVC's headers, even if the replica finished a commit (updating
+ // commit_min, possibly modifying the suffix anchor) in the mean time.
+ // (This is not required for correctness, but enables additional verification
+ // in on_do_view_change().)
+ var op = headers.get(headers.len - 1).op;
+ while (op > 0 and headers.len < constants.view_change_headers_max) : (op -= 1) {
+ const header_next = self.journal.header_with_op(op).?;
+ const header_prev = self.journal.header_with_op(op - 1) orelse break;
+ if (header_prev.checksum != header_next.parent) break;
+
+ headers.appendAssumeCapacity(header_prev.*);
+ }
+
+ vsr.ViewChangeHeaders.verify(headers.constSlice());
+ return headers;
+ }
+
  /// The caller owns the returned message, if any, which has exactly 1 reference.
  fn create_message_from_header(self: *Self, header: Header) *Message {
  assert(header.replica == self.replica);
- assert(header.view == self.view or
- header.command == .request_start_view or
- header.command == .recovery);
+ assert(header.view == self.view or header.command == .request_start_view);
  assert(header.size == @sizeOf(Header));

  const message = self.message_bus.pool.get_message();
@@ -3079,67 +2907,6 @@ pub fn ReplicaType(
  return message.ref();
  }

- /// Returns the op of the highest canonical message, according to this replica (the new
- /// primary) prior to loading the current view change's DVC quorum headers.
- /// When this replica participated in the last `view_normal`, this is just `replica.op`.
- ///
- /// - A *canonical* message was part of the last view_normal.
- /// - An *uncanonical* message may have been removed/changed by a prior view.
- /// - Canonical messages do not necessarily survive into the new view, but they take
- ///   precedence over uncanonical messages.
- /// - Canonical messages may be committed or uncommitted.
- ///
- /// Consider these logs:
- ///
- /// replica 0: 4, 5, 6b, 7b, 8b (commit_min=6b, primary, status=normal, view=X)
- /// replica 1: 4, 5, 6b, --, -- (commit_min=5, backup, status=normal, view=X)
- /// replica 2: 4, 5, 6a, --, 8b (view<X)
- ///
- /// 1. Replica 0 crashes immediately after committing 6b.
- /// 2. Replicas 1 and 2 must determine the new chain HEAD.
- /// 3. 8b is discarded due to the gap in 7.
- /// 4. To distinguish between 6a and 6b (and safely discard 6a), the new primary trusts ops
- ///    from the DVC(s) with the greatest `view_normal`.
- fn primary_op_canonical_max(self: *const Self, view_normal_canonical: u64) usize {
- assert(self.replica_count > 1);
- assert(self.status == .view_change);
- assert(self.primary_index(self.view) == self.replica);
- assert(self.do_view_change_quorum);
- assert(!self.repair_timeout.ticking);
- assert(self.journal.header_with_op(self.op) != null);
- assert(self.view_normal <= view_normal_canonical);
-
- if (self.view_normal == view_normal_canonical) return self.op;
-
- const uncanonical_op_count = std.math.min(
- // Do not reset any ops that we have already committed.
- self.op - self.commit_min,
- // The number of uncommitted ops cannot be more than the length of the pipeline.
- // Do not reset any ops that we did not include in our do_view_change message.
- constants.pipeline_max,
- );
-
- assert(uncanonical_op_count <= constants.pipeline_max);
- if (uncanonical_op_count == 0) return self.op;
-
- // * When uncanonical_op_count = self.op - self.commit_min,
- //   self.op - uncanonical_op_count = self.commit_min.
- // * When uncanonical_op_count = constants.pipeline_max,
- //   constants.pipeline_max < self.op - self.commit_min holds.
- const canonical_op_max = self.op - uncanonical_op_count;
-
- log.debug("{}: on_do_view_change: not canonical ops={}..{}", .{
- self.replica,
- canonical_op_max + 1,
- self.op,
- });
-
- assert(canonical_op_max <= self.op);
- assert(canonical_op_max >= self.commit_min);
- assert(canonical_op_max + constants.pipeline_max >= self.op);
- return canonical_op_max;
- }
-
  /// Discards uncommitted ops during a view change from after and including `op`.
  /// This is required to maximize availability in the presence of storage faults.
  /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
@@ -3192,8 +2959,8 @@ pub fn ReplicaType(
  // asynchronous prepare_ok to itself.
  // 3. In on_start_view_change(), after receiving a quorum of start_view_change
  // messages, the new primary sends a synchronous do_view_change to itself.
- // 4. In
- // message for each uncommitted message.
+ // 4. In primary_start_view_as_the_new_primary(), the new primary sends itself a
+ // prepare_ok message for each uncommitted message.
  if (self.loopback_queue) |message| {
  defer self.message_bus.unref(message);

@@ -3278,7 +3045,8 @@ pub fn ReplicaType(
  log.warn("{}: on_{s}: ignoring (no context)", .{ self.replica, command });
  return true;
  },
-
+ .headers, .request_headers => {},
+ else => unreachable,
  }
  }

@@ -3344,13 +3112,14 @@ pub fn ReplicaType(
  if (self.ignore_request_message_duplicate(message)) return true;
  if (self.ignore_request_message_preparing(message)) return true;

- //
- // The
-
+ // Don't accept more requests than will fit in the current checkpoint.
+ // (The request's op hasn't been assigned yet, but it will be `self.op + 1`
+ // when primary_pipeline_next() converts the request to a prepare.)
+ if (self.op + self.pipeline.queue.request_queue.count == self.op_checkpoint_trigger()) {
  log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint_trigger={})", .{
  self.replica,
  self.op + 1,
- self.op_checkpoint,
+ self.op_checkpoint(),
  });
  return true;
  }
@@ -3419,7 +3188,7 @@ pub fn ReplicaType(
  } else if (message.header.operation == .register) {
  log.debug("{}: on_request: new session", .{self.replica});
  return false;
- } else if (self.
+ } else if (self.pipeline.queue.message_by_client(message.header.client)) |_| {
  // The client registered with the previous primary, which committed and replied back
  // to the client before the view change, after which the register operation was
  // reloaded into the pipeline to be driven to completion by the new primary, which
@@ -3491,21 +3260,31 @@ pub fn ReplicaType(
  assert(message.header.client > 0);
  assert(message.header.view <= self.view); // See ignore_request_message_backup().

- if (self.
- assert(
- assert(
-
+ if (self.pipeline.queue.message_by_client(message.header.client)) |pipeline_message| {
+ assert(pipeline_message.header.client == message.header.client);
+ assert(pipeline_message.header.command == .request or
+ pipeline_message.header.command == .prepare);

- if (
-
+ if (pipeline_message.header.command == .request and
+ pipeline_message.header.checksum == message.header.checksum)
+ {
+ log.debug("{}: on_request: ignoring (already queued)", .{self.replica});
  return true;
- }
-
+ }
+
+ if (pipeline_message.header.command == .prepare and
+ pipeline_message.header.context == message.header.checksum)
+ {
+ assert(pipeline_message.header.op > self.commit_max);
+ log.debug("{}: on_request: ignoring (already preparing)", .{self.replica});
  return true;
  }
+
+ log.err("{}: on_request: ignoring (client forked)", .{self.replica});
+ return true;
  }

- if (self.pipeline.full()) {
+ if (self.pipeline.queue.full()) {
  log.debug("{}: on_request: ignoring (pipeline full)", .{self.replica});
  return true;
  }
@@ -3521,7 +3300,10 @@ pub fn ReplicaType(

  const command: []const u8 = @tagName(message.header.command);

-
+ if (self.status == .recovering_head and message.header.command != .start_view) {
+ return true;
+ }
+
  // While a replica's status is recovering it does not participate in either the request
  // processing protocol or the view change protocol.
  // This is critical for correctness (to avoid data loss):
@@ -3614,28 +3396,7 @@ pub fn ReplicaType(
  assert(self.journal.header_with_op(self.op) == null);
  }

-
- assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
- assert(message.header.command == .do_view_change or
- message.header.command == .start_view or
- message.header.command == .headers or
- message.header.command == .recovery_response);
-
- const headers = std.mem.bytesAsSlice(
- Header,
- message.buffer[@sizeOf(Header)..message.header.size],
- );
-
- for (headers[0 .. headers.len - 1]) |header, index| {
- // Headers must be provided in reverse order for the sake of `repair_header()`.
- // Otherwise, headers may never be repaired where the hash chain never connects.
- assert(header.op > headers[index + 1].op);
- }
-
- return headers;
- }
-
- /// Returns whether the highest known op is certain.
+ /// Returns whether the head op is certain.
  ///
  /// After recovering the WAL, there are 2 possible outcomes:
  /// * All entries valid. The highest op is certain, and safe to set as `replica.op`.
@@ -3663,41 +3424,34 @@ pub fn ReplicaType(
  /// * ` ✓ ✗ o `: View change is safe.
  /// * ` ✓ = o `: View change is unsafe if any slots are faulty.
  ///   (`replica.op_checkpoint` == `replica.op`).
-
- // If there is an "unsafe" fault, we will need to request a start_view from the primary to
- // learn the op.
- fn op_certain(self: *const Self) bool {
+ fn op_head_certain(self: *const Self) bool {
  assert(self.status == .recovering);
- assert(self.op_checkpoint <= self.op);
+ assert(self.op_checkpoint() <= self.op);

- const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint)
- const
+ const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint());
+ const slot_op_head = self.journal.slot_with_op(self.op).?;
  const slot_known_range = vsr.SlotRange{
  .head = slot_op_checkpoint,
- .tail =
+ .tail = slot_op_head,
  };

  var iterator = self.journal.faulty.bits.iterator(.{ .kind = .set });
  while (iterator.next()) |slot| {
-
-
-
-
- !slot_known_range.contains(slot))
- {
- log.warn("{}: op_certain: op not known (faulty_slot={}, op={}, op_checkpoint={})", .{
- self.replica,
- slot.index,
- self.op,
- self.op_checkpoint,
- });
- return false;
- }
+ if (slot_op_checkpoint.index == slot_op_head.index or
+ !slot_known_range.contains(.{ .index = slot }))
+ {
+ return false;
  }
  }
  return true;
  }

+ /// The op of the highest checkpointed message.
+ // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
+ pub fn op_checkpoint(self: *const Self) u64 {
+ return self.superblock.working.vsr_state.commit_min;
+ }
+
  /// Returns the op that will be `op_checkpoint` after the next checkpoint.
  ///
  /// For a replica with journal_slot_count=8 and lsm_batch_multiple=2:
@@ -3722,21 +3476,21 @@ pub fn ReplicaType(
  ///   % op_checkpoint_trigger
  ///
  fn op_checkpoint_next(self: *const Self) u64 {
- assert(self.op_checkpoint <= self.commit_min);
- assert(self.op_checkpoint <= self.op);
- assert(self.op_checkpoint == 0 or
- (self.op_checkpoint + 1) % constants.lsm_batch_multiple == 0);
+ assert(self.op_checkpoint() <= self.commit_min);
+ assert(self.op_checkpoint() <= self.op);
+ assert(self.op_checkpoint() == 0 or
+ (self.op_checkpoint() + 1) % constants.lsm_batch_multiple == 0);

- const op = if (self.op_checkpoint == 0)
+ const op = if (self.op_checkpoint() == 0)
  // First wrap: op_checkpoint_next = 8-2-1 = 5
  constants.journal_slot_count - constants.lsm_batch_multiple - 1
  else
  // Second wrap: op_checkpoint_next = 5+8-2 = 11
  // Third wrap: op_checkpoint_next = 11+8-2 = 17
- self.op_checkpoint + constants.journal_slot_count - constants.lsm_batch_multiple;
+ self.op_checkpoint() + constants.journal_slot_count - constants.lsm_batch_multiple;
  assert((op + 1) % constants.lsm_batch_multiple == 0);
  // The checkpoint always advances.
- assert(op > self.op_checkpoint);
+ assert(op > self.op_checkpoint());

  return op;
  }
@@ -3790,110 +3544,94 @@ pub fn ReplicaType(
  }
  }

-
- /// When `checksum` is `null`, match any checksum.
- fn pipeline_prepare_for_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Prepare {
- assert(self.status == .normal or self.status == .view_change);
-
- // To optimize the search, we can leverage the fact that the pipeline is ordered and
- // continuous.
- if (self.pipeline.count == 0) return null;
- const head_op = self.pipeline.head_ptr().?.message.header.op;
- const tail_op = self.pipeline.tail_ptr().?.message.header.op;
- if (op < head_op) return null;
- if (op > tail_op) return null;
-
- const pipeline_prepare = self.pipeline.get_ptr(op - head_op).?;
- assert(pipeline_prepare.message.header.op == op);
-
- if (checksum == null or pipeline_prepare.message.header.checksum == checksum.?) {
- return pipeline_prepare;
- } else {
- return null;
- }
- }
-
- /// Searches the pipeline for a prepare for a given client.
- fn pipeline_prepare_for_client(self: *Self, client: u128) ?*Prepare {
+ fn primary_pipeline_prepare(self: *Self, request: Request) void {
  assert(self.status == .normal);
  assert(self.primary());
  assert(self.commit_min == self.commit_max);
+ assert(self.commit_max + self.pipeline.queue.prepare_queue.count == self.op);
+ assert(!self.pipeline.queue.prepare_queue.full());
+ self.pipeline.queue.verify();

-
-
- var iterator = self.pipeline.iterator_mutable();
- while (iterator.next_ptr()) |prepare| {
- assert(prepare.message.header.command == .prepare);
- assert(prepare.message.header.op == op);
- assert(prepare.message.header.parent == parent);
-
- // A client may have multiple requests in the pipeline if these were committed by
- // the previous primary and were reloaded into the pipeline after a view change.
- if (prepare.message.header.client == client) return prepare;
+ const message = request.message;
+ assert(!self.ignore_request_message(message));

-
-
-
+ log.debug("{}: primary_pipeline_next: request checksum={} client={}", .{
+ self.replica,
+ message.header.checksum,
+ message.header.client,
+ });

-
-
-
+ // Guard against the wall clock going backwards by taking the max with timestamps issued:
+ self.state_machine.prepare_timestamp = std.math.max(
+ // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
+ // may be our first prepare as a recently elected primary:
+ std.math.max(
+ self.state_machine.prepare_timestamp,
+ self.state_machine.commit_timestamp,
+ ) + 1,
+ @intCast(u64, request.realtime),
+ );
+ assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);

-
-
+ const prepare_timestamp = self.state_machine.prepare(
+ message.header.operation.cast(StateMachine),
+ message.body(),
+ );

-
-
-
-
-
+ const latest_entry = self.journal.header_with_op(self.op).?;
+ message.header.parent = latest_entry.checksum;
+ message.header.context = message.header.checksum;
+ message.header.view = self.view;
+ message.header.op = self.op + 1;
+ message.header.commit = self.commit_max;
+ message.header.timestamp = prepare_timestamp;
+ message.header.replica = self.replica;
+ message.header.command = .prepare;

-
-
+ message.header.set_checksum_body(message.body());
+ message.header.set_checksum();

-
- log.debug("{}: pipeline_prepare_for_prepare_ok: not preparing", .{self.replica});
- return null;
- };
+ log.debug("{}: primary_pipeline_next: prepare {}", .{ self.replica, message.header.checksum });

- if (
- //
-
-
-
-
+ if (self.pipeline.queue.prepare_queue.tail_ptr()) |previous| {
+ // Do not restart the prepare timeout as it is already ticking for another prepare.
+ assert(self.prepare_timeout.ticking);
+ assert(previous.message.header.checksum == message.header.parent);
+ } else {
+ // We are about to add the first prepare to the pipeline, so start the timeout.
+ assert(!self.prepare_timeout.ticking);
+ self.prepare_timeout.start();
  }
+ self.pipeline.queue.push_prepare(message);
+ self.on_prepare(message);

-
-
- assert(
- assert(prepare.message.header.cluster == ok.header.cluster);
- assert(prepare.message.header.epoch == ok.header.epoch);
- // A prepare may be committed in the same view or in a newer view:
- assert(prepare.message.header.view <= ok.header.view);
- assert(prepare.message.header.op == ok.header.op);
- assert(prepare.message.header.commit == ok.header.commit);
- assert(prepare.message.header.timestamp == ok.header.timestamp);
- assert(prepare.message.header.operation == ok.header.operation);
-
- return prepare;
+ // We expect `on_prepare()` to increment `self.op` to match the primary's latest prepare:
+ // This is critical to ensure that pipelined prepares do not receive the same op number.
+ assert(self.op == message.header.op);
  }

- fn
- assert(self.status == .
- assert(self.
-
- log.debug("{}: recover: sending recovery messages nonce={}", .{
- self.replica,
- self.recovery_nonce,
- });
+ fn pipeline_prepare_by_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Message {
+ assert(self.status == .normal or self.status == .view_change);
+ assert(self.replica == self.primary_index(self.view) or checksum != null);

-
-
-
-
-
-
+ if (checksum == null) {
+ // The PipelineCache may hold messages that have been discarded, so we must be
+ // careful not to access it unless we can verify the entry's checksum.
+ //
+ // Only on_request_prepare() queries the pipeline with checksum=null.
+ // And primaries ignore request_prepare messages during their view change
+ // (during which time the pipeline is not yet repaired, and so is untrusted).
+ assert(self.primary());
+ assert(self.pipeline == .queue);
+ }
+
+ return switch (self.pipeline) {
+ .cache => |*cache| cache.prepare_by_op_and_checksum(op, checksum.?),
+ .queue => |*queue| if (queue.prepare_by_op_and_checksum(op, checksum)) |prepare|
+ prepare.message
+ else
+ null,
+ };
  }

  /// Starting from the latest journal entry, backfill any missing or disconnected headers.
@@ -3911,8 +3649,8 @@ pub fn ReplicaType(
  assert(self.status == .normal or self.status == .view_change);
  assert(self.repairs_allowed());

- assert(self.op_checkpoint <= self.op);
- assert(self.op_checkpoint <= self.commit_min);
+ assert(self.op_checkpoint() <= self.op);
+ assert(self.op_checkpoint() <= self.commit_min);
  assert(self.commit_min <= self.op);
  assert(self.commit_min <= self.commit_max);
  assert(self.journal.header_with_op(self.op) != null);
@@ -3954,36 +3692,43 @@ pub fn ReplicaType(
  }

  // Request any missing or disconnected headers:
-
-
-
- log.debug("{}: repair: break: view={} op_min={} op_max={} (commit={}..{} op={})", .{
- self.replica,
- self.view,
- range.op_min,
- range.op_max,
- self.commit_min,
- self.commit_max,
+ if (self.commit_min != self.op) {
+ var broken = self.journal.find_latest_headers_break_between(
+ self.commit_min + 1,
  self.op,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+ );
+ if (broken) |range| {
+ log.debug(
+ "{}: repair: break: view={} op_min={} op_max={} (commit={}..{} op={})",
+ .{
+ self.replica,
+ self.view,
+ range.op_min,
+ range.op_max,
+ self.commit_min,
+ self.commit_max,
+ self.op,
+ },
+ );
+ assert(range.op_min > self.commit_min);
+ assert(range.op_max < self.op);
+ // A range of `op_min=0` or `op_max=0` should be impossible as a header break:
+ // This is the root op that is prepared when the cluster is initialized.
+ assert(range.op_min > 0);
+ assert(range.op_max > 0);
+
+ if (self.choose_any_other_replica()) |replica| {
+ self.send_header_to_replica(replica, .{
+ .command = .request_headers,
+ .cluster = self.cluster,
+ .replica = self.replica,
+ .view = self.view,
+ .commit = range.op_min,
+ .op = range.op_max,
+ });
+ }
+ return;
  }
- return;
  }

  // Assert that all headers are now present and connected with a perfect hash chain:
@@ -4003,9 +3748,12 @@ pub fn ReplicaType(
  }

  if (self.status == .view_change and self.primary_index(self.view) == self.replica) {
-
-
-
+ // Repair the pipeline, which may discover faulty prepares and drive more repairs.
+ switch (self.primary_repair_pipeline()) {
+ // primary_repair_pipeline() is already working.
+ .busy => {},
+ .done => self.primary_start_view_as_the_new_primary(),
+ }
  }
  }

@@ -4073,8 +3821,8 @@ pub fn ReplicaType(
  return false;
  }

- if (header.op <= self.op_checkpoint) {
- if (header.op == 0 and self.op_checkpoint == 0) {
+ if (header.op <= self.op_checkpoint()) {
+ if (header.op == 0 and self.op_checkpoint() == 0) {
  // Repairing the root op is allowed until the first checkpoint.
  } else {
  // It is critical that we do not repair checkpointed ops; their slots now belong
@@ -4082,7 +3830,7 @@ pub fn ReplicaType(
  // correctness violation.
  log.debug("{}: repair_header: false (precedes self.op_checkpoint={})", .{
  self.replica,
- self.op_checkpoint,
+ self.op_checkpoint(),
  });
  return false;
  }
@@ -4200,175 +3948,173 @@ pub fn ReplicaType(
  }

  /// Reads prepares into the pipeline (before we start the view as the new primary).
- fn primary_repair_pipeline(self: *Self)
+ fn primary_repair_pipeline(self: *Self) enum { done, busy } {
  assert(self.status == .view_change);
  assert(self.primary_index(self.view) == self.replica);
- assert(self.commit_max
+ assert(self.commit_max == self.commit_min);
+ assert(self.commit_max <= self.op);
  assert(self.journal.dirty.count == 0);
+ assert(self.pipeline == .cache);

- if (self.
+ if (self.pipeline_repairing) {
  log.debug("{}: primary_repair_pipeline: already repairing...", .{self.replica});
- return;
+ return .busy;
  }

-
-
-
-
+ if (self.primary_repair_pipeline_op()) |_| {
+ log.debug("{}: primary_repair_pipeline: repairing", .{self.replica});
+ assert(!self.pipeline_repairing);
+ self.pipeline_repairing = true;
+ self.primary_repair_pipeline_read();
+ return .busy;
+ }

-
+ // All prepares needed to reconstruct the pipeline queue are now available in the cache.
+ return .done;
  }

-
- /// Retain uncommitted messages that belong in the current view to maximize durability.
- fn primary_repair_pipeline_diff(self: *Self) void {
+ fn primary_repair_pipeline_done(self: *Self) PipelineQueue {
  assert(self.status == .view_change);
  assert(self.primary_index(self.view) == self.replica);
+ assert(self.commit_max == self.commit_min);
+ assert(self.commit_max <= self.op);
+ assert(self.journal.dirty.count == 0);
+ assert(self.valid_hash_chain_between(self.commit_min, self.op));
+ assert(self.pipeline == .cache);
+ assert(!self.pipeline_repairing);
+ assert(self.primary_repair_pipeline() == .done);
+ assert(self.commit_max + constants.pipeline_prepare_queue_max >= self.op);

-
-
-
-
- self.
-
-
- // Discard the whole pipeline if it is now disconnected from the WAL's hash chain.
- if (self.pipeline.head_ptr()) |pipeline_head| {
- const parent = self.journal.header_with_op_and_checksum(
- pipeline_head.message.header.op - 1,
- pipeline_head.message.header.parent,
- );
- if (parent == null) {
- while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
- assert(self.pipeline.count == 0);
- }
- }
+ var pipeline_queue = PipelineQueue{};
+ var op = self.commit_max + 1;
+ var parent = self.journal.header_with_op(self.commit_max).?.checksum;
+ while (op <= self.op) : (op += 1) {
+ const journal_header = self.journal.header_with_op(op).?;
+ assert(journal_header.op == op);
+ assert(journal_header.parent == parent);

-
-
-
+ const prepare =
+ self.pipeline.cache.prepare_by_op_and_checksum(op, journal_header.checksum).?;
+ assert(prepare.header.op == op);
+ assert(prepare.header.op <= self.op);
+ assert(prepare.header.checksum == journal_header.checksum);
+ assert(prepare.header.parent == parent);
+ assert(self.journal.has(prepare.header));

-
+ pipeline_queue.push_prepare(prepare.ref());
+ parent = prepare.header.checksum;
  }
+ assert(self.commit_max + pipeline_queue.prepare_queue.count == self.op);

-
-
- self.pipeline.count,
- });
-
- self.verify_pipeline();
-
- // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
- // Otherwise, we would be making `primary_repair_pipeline()` reentrant.
+ pipeline_queue.verify();
+ return pipeline_queue;
  }

  /// Returns the next `op` number that needs to be read into the pipeline.
-
+ /// Returns null when all necessary prepares are in the pipeline cache.
+ fn primary_repair_pipeline_op(self: *const Self) ?u64 {
  assert(self.status == .view_change);
  assert(self.primary_index(self.view) == self.replica);
+ assert(self.commit_max == self.commit_min);
+ assert(self.commit_max <= self.op);
+ assert(self.pipeline == .cache);

-
- self.
-
-
-
-
-
+ var op = self.commit_max + 1;
+ while (op <= self.op) : (op += 1) {
+ const op_header = self.journal.header_with_op(op).?;
+ if (!self.pipeline.cache.contains_header(op_header)) {
+ return op;
+ }
+ }
  return null;
  }

  fn primary_repair_pipeline_read(self: *Self) void {
- assert(self.repairing_pipeline);
  assert(self.status == .view_change);
  assert(self.primary_index(self.view) == self.replica);
+ assert(self.commit_max == self.commit_min);
+ assert(self.commit_max <= self.op);
+ assert(self.pipeline == .cache);
+ assert(self.pipeline_repairing);

-
-
-
-
-
-
-
-
- self.replica,
- op,
- checksum,
- });
-
- self.journal.read_prepare(repair_pipeline_push, op, checksum, null);
- } else {
- log.debug("{}: primary_repair_pipeline_read: repaired", .{self.replica});
- self.repairing_pipeline = false;
- self.repair();
- }
+ const op = self.primary_repair_pipeline_op().?;
+ const op_checksum = self.journal.header_with_op(op).?.checksum;
+ log.debug("{}: primary_repair_pipeline_read: op={} checksum={}", .{
+ self.replica,
+ op,
+ op_checksum,
+ });
+ self.journal.read_prepare(repair_pipeline_read_callback, op, op_checksum, null);
  }

- fn
+ fn repair_pipeline_read_callback(
  self: *Self,
  prepare: ?*Message,
  destination_replica: ?u8,
  ) void {
  assert(destination_replica == null);

- assert(self.
- self.
+ assert(self.pipeline_repairing);
+ self.pipeline_repairing = false;

  if (prepare == null) {
- log.debug("{}:
+ log.debug("{}: repair_pipeline_read_callback: prepare == null", .{self.replica});
  return;
  }

  // Our state may have advanced significantly while we were reading from disk.
  if (self.status != .view_change) {
-
+ assert(self.primary_index(self.view) != self.replica);
+
+ log.debug("{}: repair_pipeline_read_callback: no longer in view change status", .{
  self.replica,
  });
  return;
  }

  if (self.primary_index(self.view) != self.replica) {
- log.debug("{}:
+ log.debug("{}: repair_pipeline_read_callback: no longer primary", .{self.replica});
  return;
  }

  // We may even be several views ahead and may now have a completely different pipeline.
  const op = self.primary_repair_pipeline_op() orelse {
- log.debug("{}:
+ log.debug("{}: repair_pipeline_read_callback: pipeline changed", .{self.replica});
|
|
4337
4083
|
return;
|
|
4338
4084
|
};
|
|
4339
4085
|
|
|
4340
4086
|
assert(op > self.commit_max);
|
|
4341
4087
|
assert(op <= self.op);
|
|
4342
|
-
assert(self.commit_max + self.pipeline.count + 1 == op);
|
|
4343
4088
|
|
|
4344
4089
|
if (prepare.?.header.op != op) {
|
|
4345
|
-
log.debug("{}:
|
|
4090
|
+
log.debug("{}: repair_pipeline_read_callback: op changed", .{self.replica});
|
|
4346
4091
|
return;
|
|
4347
4092
|
}
|
|
4348
4093
|
|
|
4349
4094
|
if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
|
|
4350
|
-
log.debug("{}:
|
|
4095
|
+
log.debug("{}: repair_pipeline_read_callback: checksum changed", .{self.replica});
|
|
4351
4096
|
return;
|
|
4352
4097
|
}
|
|
4353
4098
|
|
|
4354
4099
|
assert(self.status == .view_change);
|
|
4355
4100
|
assert(self.primary_index(self.view) == self.replica);
|
|
4356
4101
|
|
|
4357
|
-
log.debug("{}:
|
|
4102
|
+
log.debug("{}: repair_pipeline_read_callback: op={} checksum={}", .{
|
|
4358
4103
|
self.replica,
|
|
4359
4104
|
prepare.?.header.op,
|
|
4360
4105
|
prepare.?.header.checksum,
|
|
4361
4106
|
});
|
|
4362
4107
|
|
|
4363
|
-
|
|
4364
|
-
|
|
4365
|
-
}
|
|
4366
|
-
|
|
4367
|
-
self.pipeline.push_assume_capacity(.{ .message = prepare.?.ref() });
|
|
4368
|
-
assert(self.pipeline.count >= 1);
|
|
4108
|
+
const prepare_evicted = self.pipeline.cache.insert(prepare.?.ref());
|
|
4109
|
+
if (prepare_evicted) |message_evicted| self.message_bus.unref(message_evicted);
|
|
4369
4110
|
|
|
4370
|
-
self.
|
|
4371
|
-
|
|
4111
|
+
if (self.primary_repair_pipeline_op()) |_| {
|
|
4112
|
+
assert(!self.pipeline_repairing);
|
|
4113
|
+
self.pipeline_repairing = true;
|
|
4114
|
+
self.primary_repair_pipeline_read();
|
|
4115
|
+
} else {
|
|
4116
|
+
self.repair();
|
|
4117
|
+
}
|
|
4372
4118
|
}
|
|
4373
4119
|
|
|
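The repair flow above walks the uncommitted ops `(commit_max, op]` in order and reads any prepare that is missing from the pipeline cache, one at a time. Below is a minimal sketch of that scan, in Zig — an illustration only, not part of the diff — with a hypothetical `cache_contains` predicate standing in for the PipelineCache lookup:

```zig
const std = @import("std");

// Illustration only: the "next missing op" scan behind primary_repair_pipeline_op().
fn repair_pipeline_op(commit_max: u64, op_head: u64, cache_contains: anytype) ?u64 {
    var op = commit_max + 1;
    while (op <= op_head) : (op += 1) {
        if (!cache_contains(op)) return op; // First prepare that must be read from the WAL.
    }
    return null; // Every uncommitted prepare is cached; the queue can be rebuilt directly.
}

test "the scan returns the first uncached op" {
    const missing_7 = struct {
        fn contains(op: u64) bool {
            return op != 7; // Pretend op 7 was evicted from the cache.
        }
    }.contains;
    try std.testing.expectEqual(@as(?u64, 7), repair_pipeline_op(5, 9, missing_7));

    const all_cached = struct {
        fn contains(op: u64) bool {
            _ = op;
            return true;
        }
    }.contains;
    try std.testing.expectEqual(@as(?u64, null), repair_pipeline_op(5, 9, all_cached));
}
```

Because the read callback re-runs the scan, a single in-flight read at a time is enough: each completed read either re-arms the next read or hands off to repair().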
        fn repair_prepares(self: *Self) void {
@@ -4376,7 +4122,7 @@ pub fn ReplicaType(
            assert(self.repairs_allowed());
            assert(self.journal.dirty.count > 0);
            assert(self.op >= self.commit_min);
-            assert(self.op - self.commit_min
+            assert(self.op - self.commit_min <= constants.journal_slot_count);

            // Request enough prepares to utilize our max IO depth:
            var budget = self.journal.writes.available();
@@ -4434,7 +4180,7 @@ pub fn ReplicaType(
                    // belong) to a newer op, from the new WAL wrap. Additionally, we may not
                    // still have access to its surrounding commits to verify the hash chain.
                    assert(op <= self.commit_min);
-                    assert(op <= self.op_checkpoint);
+                    assert(op <= self.op_checkpoint());
                    assert(self.journal.faulty.bit(slot));

                    log.debug("{}: repair_prepares: remove slot={} " ++
@@ -4516,9 +4262,9 @@ pub fn ReplicaType(
            //
            // Using the pipeline to repair is faster than a `request_prepare`.
            // Also, messages in the pipeline are never corrupt.
-            if (self.
-                assert(prepare.
-                assert(prepare.
+            if (self.pipeline_prepare_by_op_and_checksum(op, checksum)) |prepare| {
+                assert(prepare.header.op == op);
+                assert(prepare.header.checksum == checksum);

                if (self.replica_count == 1) {
                    // This op won't start writing until all ops in the pipeline preceding it have
@@ -4528,7 +4274,8 @@ pub fn ReplicaType(
                        op,
                        checksum,
                    });
-
+                    const pipeline_head = self.pipeline.queue.prepare_queue.head_ptr().?;
+                    assert(pipeline_head.message.header.op < op);
                    return false;
                }

@@ -4537,7 +4284,7 @@ pub fn ReplicaType(
                    op,
                    checksum,
                });
-                self.write_prepare(prepare
+                self.write_prepare(prepare, .pipeline);
                return true;
            }

@@ -4638,29 +4385,10 @@ pub fn ReplicaType(
            }
        }

-        /// The caller must ensure that the headers are trustworthy.
-        ///
-        /// Asserts that sequential ops are hash-chained. (Gaps are permitted).
-        fn replace_headers(self: *Self, headers: []const Header) void {
-            for (headers) |*header, i| {
-                if (i > 0) {
-                    const next = &headers[i - 1];
-                    assert(next.view >= header.view);
-                    if (next.op == header.op + 1) {
-                        assert(next.parent == header.checksum);
-                    } else {
-                        assert(next.op > header.op);
-                    }
-                }
-
-                self.replace_header(header);
-            }
-        }
-
        /// Replaces the header if the header is different and not already committed.
        /// The caller must ensure that the header is trustworthy.
        fn replace_header(self: *Self, header: *const Header) void {
-            assert(self.op_checkpoint <= self.commit_min);
+            assert(self.op_checkpoint() <= self.commit_min);
            assert(header.command == .prepare);
            assert(header.op <= self.op); // Never advance the op.
            assert(header.op <= self.op_checkpoint_trigger());
@@ -4670,7 +4398,7 @@ pub fn ReplicaType(
                assert(existing_header.checksum == header.checksum);
                return;
            } else {
-                if (header.op <= self.op_checkpoint) {
+                if (header.op <= self.op_checkpoint()) {
                    // Never replace a checkpointed op — those slots are needed by the following
                    // WAL wrap.
                    return;
@@ -4769,35 +4497,11 @@ pub fn ReplicaType(
            self.nack_prepare_op = null;
        }

-        fn reset_quorum_prepare_ok(self: *Self) void {
-            // "prepare_ok"s from previous views are not valid, even if the pipeline entry is reused
-            // after a cycle of view changes. In other words, when a view change cycles around, so
-            // that the original primary becomes a primary of a new view, pipeline entries may be
-            // reused. However, the pipeline's prepare_ok quorums must not be reused, since the
-            // replicas that sent them may have swapped them out during a previous view change.
-            var iterator = self.pipeline.iterator_mutable();
-            while (iterator.next_ptr()) |prepare| {
-                prepare.ok_quorum_received = false;
-                prepare.ok_from_all_replicas = quorum_counter_null;
-                assert(prepare.ok_from_all_replicas.count() == 0);
-            }
-        }
-
        fn reset_quorum_start_view_change(self: *Self) void {
            self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
            self.start_view_change_quorum = false;
        }

-        fn reset_quorum_recovery_response(self: *Self) void {
-            for (self.recovery_response_from_other_replicas) |*received, replica| {
-                if (received.*) |message| {
-                    assert(replica != self.replica);
-                    self.message_bus.unref(message);
-                    received.* = null;
-                }
-            }
-        }
-
        fn send_prepare_ok(self: *Self, header: *const Header) void {
            assert(header.command == .prepare);
            assert(header.cluster == self.cluster);
@@ -4920,6 +4624,7 @@ pub fn ReplicaType(
            // operations after the highest `commit_min` may yet have been committed before the old
            // primary crashed. The new primary will use the NACK protocol to be sure of a discard.
            assert(message.header.commit == self.commit_min);
+            DVCQuorum.verify_message(message);

            self.send_message_to_replica(self.primary_index(self.view), message);
        }
@@ -5051,18 +4756,6 @@ pub fn ReplicaType(
                    },
                    else => unreachable,
                },
-                .recovery => {
-                    assert(self.status == .recovering);
-                    assert(message.header.replica == self.replica);
-                    assert(message.header.replica != replica);
-                    assert(message.header.context == self.recovery_nonce);
-                },
-                .recovery_response => {
-                    assert(self.status == .normal);
-                    assert(message.header.view == self.view);
-                    assert(message.header.replica == self.replica);
-                    assert(message.header.replica != replica);
-                },
                .headers => {
                    assert(self.status == .normal or self.status == .view_change);
                    assert(message.header.view == self.view);
@@ -5111,6 +4804,42 @@ pub fn ReplicaType(
                },
            }

+            if (replica != self.replica) {
+                // Critical: Do not advertise a view/log_view before it is durable.
+                // See view_durable()/log_view_durable().
+                if (message.header.view > self.view_durable() and
+                    message.header.command != .request_start_view)
+                {
+                    log.debug("{}: send_message_to_replica: dropped {s} " ++
+                        "(view_durable={} message.view={})", .{
+                        self.replica,
+                        @tagName(message.header.command),
+                        self.view_durable(),
+                        message.header.view,
+                    });
+                    return;
+                }
+
+                if (message.header.command == .do_view_change) {
+                    const message_log_view = message.header.timestamp;
+                    if (self.log_view_durable() < message_log_view) {
+                        log.debug("{}: send_message_to_replica: dropped {s} " ++
+                            "(log_view_durable={} message.log_view={})", .{
+                            self.replica,
+                            @tagName(message.header.command),
+                            self.log_view_durable(),
+                            message_log_view,
+                        });
+                        return;
+                    }
+                    assert(std.mem.eql(
+                        u8,
+                        message.body(),
+                        std.mem.sliceAsBytes(self.superblock.working.vsr_headers().slice),
+                    ));
+                }
+            }
+
            if (replica == self.replica) {
                assert(self.loopback_queue == null);
                self.loopback_queue = message.ref();
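The hunk above gates outgoing messages on durability: a replica never advertises a view (or, in a DVC, a log_view) that is not yet persisted in its superblock. A minimal sketch of that guard's shape, in Zig — illustration only, with plain integers standing in for the header and superblock state (the real check also exempts `request_start_view`, which names the target's view rather than the sender's):

```zig
const std = @import("std");

fn droppable(
    view_durable: u32,
    log_view_durable: u32,
    message_view: u32,
    message_log_view: ?u32, // Non-null only for do_view_change messages.
) bool {
    if (message_view > view_durable) return true;
    if (message_log_view) |log_view| {
        if (log_view > log_view_durable) return true;
    }
    return false;
}

test "messages ahead of the durable view are dropped" {
    try std.testing.expect(droppable(4, 4, 5, null)); // view not yet durable
    try std.testing.expect(droppable(5, 4, 5, 5)); // log_view not yet durable
    try std.testing.expect(!droppable(5, 5, 5, 5)); // both durable: send
}
```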
@@ -5119,6 +4848,142 @@ pub fn ReplicaType(
            }
        }

+        /// The highest durable view.
+        /// A replica must not advertise a view higher than its durable view.
+        ///
+        /// The advertised `view` must never backtrack after a crash.
+        /// This ensures the old primary is isolated — if a backup's view backtracks, it could
+        /// ack a prepare to the old primary, forking the log. See VRR §8.2 for more detail.
+        ///
+        /// Equivalent to `superblock.working.vsr_state.view`.
+        fn view_durable(self: *const Self) u32 {
+            return self.superblock.working.vsr_state.view;
+        }
+
+        /// The highest durable log_view.
+        /// A replica must not advertise a log_view (in a DVC) higher than its durable log_view.
+        ///
+        /// A replica's advertised `log_view` must never backtrack after a crash.
+        /// (`log_view` is only advertised within DVC messages).
+        ///
+        /// To understand why, consider the following replica logs, where:
+        ///
+        /// - numbers in replica rows denote the version of the op, and
+        /// - a<b<c denotes the view in which the op was prepared.
+        ///
+        /// Replica 0 prepares some ops, but they never arrive at replica 1/2:
+        ///
+        ///       view=a
+        ///            op │ 0  1  2
+        ///     replica 0 │ 1a 2a 3a (log_view=a, leader)
+        ///     replica 1 │ -  -  -  (log_view=a, follower — but never receives any prepares)
+        ///    (replica 2)│ -  -  -  (log_view=_, partitioned)
+        ///
+        /// After a view change, replica 1 prepares some ops, but they never arrive at replica 0/2:
+        ///
+        ///       view=b
+        ///            op │ 0  1  2
+        ///    (replica 0)│ 1a 2a 3a (log_view=a, partitioned)
+        ///     replica 1 │ 4b 5b 6b (log_view=b, leader)
+        ///     replica 2 │ -  -  -  (log_view=b, follower — but never receives any prepares)
+        ///
+        /// After another view change, replica 2 loads replica 1's ops:
+        ///
+        ///       view=c
+        ///            op │ 0  1  2
+        ///     replica 0 │ 1a 2a 3a (log_view=c, follower)
+        ///    (replica 1)│ 4b 5b 6b (log_view=b, partitioned)
+        ///     replica 2 │ 1c 2c 3c (log_view=c, leader)
+        ///
+        /// Suppose replica 0 crashes and its log_view regresses to a.
+        /// If replica 2 is partitioned, replicas 0 and 1 start view d with the DVCs:
+        ///
+        ///     replica 0 │ 1a 2a 3a (log_view=a, log_view backtracked!)
+        ///     replica 1 │ 4b 5b 6b (log_view=b)
+        ///
+        /// Replica 1's higher log_view is canonical, so 4b/5b/6b replace 1a/2a/3a even though
+        /// the latter may have been committed during view c. The log has forked.
+        ///
+        /// Therefore, a replica's log_view must never regress.
+        ///
+        /// Equivalent to `superblock.working.vsr_state.log_view`.
+        fn log_view_durable(self: *const Self) u32 {
+            return self.superblock.working.vsr_state.log_view;
+        }
+
+        fn view_durable_updating(self: *const Self) bool {
+            return self.superblock.view_change_in_progress();
+        }
+
+        /// Persist the current view and log_view to the superblock.
+        /// `view_durable` and `log_view_durable` will update asynchronously, when their respective
+        /// updates are durable.
+        fn view_durable_update(self: *Self) void {
+            assert(self.status == .normal or self.status == .view_change);
+            assert(self.view >= self.log_view);
+            assert(self.view >= self.view_durable());
+            assert(self.log_view >= self.log_view_durable());
+            assert(self.log_view > self.log_view_durable() or self.view > self.view_durable());
+            // The primary must only persist the SV headers after repairs are done.
+            // Otherwise headers could be nacked, truncated, then restored after a crash.
+            assert(self.log_view < self.view or self.replica != self.primary_index(self.view) or
+                self.status == .normal);
+
+            if (self.view_durable_updating()) return;
+
+            log.debug("{}: view_durable_update: view_durable={}..{} log_view_durable={}..{}", .{
+                self.replica,
+                self.view_durable(),
+                self.view,
+                self.log_view_durable(),
+                self.log_view,
+            });
+
+            self.superblock.view_change(
+                view_durable_update_callback,
+                &self.superblock_context_view_change,
+                .{
+                    .commit_max = self.commit_max,
+                    .view = self.view,
+                    .log_view = self.log_view,
+                    .headers = self.create_view_change_headers(),
+                },
+            );
+            assert(self.view_durable_updating());
+        }
+
+        fn view_durable_update_callback(context: *SuperBlock.Context) void {
+            const self = @fieldParentPtr(Self, "superblock_context_view_change", context);
+            assert(self.status == .normal or self.status == .view_change);
+            assert(!self.view_durable_updating());
+            assert(self.superblock.working.vsr_state.view <= self.view);
+            assert(self.superblock.working.vsr_state.log_view <= self.log_view);
+            assert(self.superblock.working.vsr_state.commit_min <= self.commit_min);
+            assert(self.superblock.working.vsr_state.commit_max <= self.commit_max);
+
+            log.debug("{}: view_durable_update_callback: " ++
+                "(view_durable={} log_view_durable={})", .{
+                self.replica,
+                self.view_durable(),
+                self.log_view_durable(),
+            });
+
+            assert(self.view_durable() <= self.view);
+            assert(self.log_view_durable() <= self.view_durable());
+            assert(self.log_view_durable() <= self.log_view);
+
+            // The view/log_view incremented while the previous view-change update was being saved.
+            const update = self.log_view_durable() < self.log_view or
+                self.view_durable() < self.view;
+
+            const update_dvc = update and self.log_view < self.view;
+            const update_sv = update and self.log_view == self.view and
+                (self.replica != self.primary_index(self.view) or self.status == .normal);
+            assert(!(update_dvc and update_sv));
+
+            if (update_dvc or update_sv) self.view_durable_update();
+        }
+
        fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
            assert(self.status == .view_change or self.status == .recovering);

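Note how `view_durable_update()` and its callback cooperate: the update returns early while a superblock write is in flight, and the completion callback issues at most one catch-up write if the in-memory view advanced in the meantime. A minimal Zig sketch of that coalescing pattern — illustration only, all names hypothetical stand-ins:

```zig
const std = @import("std");

const DurableViewSketch = struct {
    view: u32 = 0,
    view_durable: u32 = 0,
    updating: bool = false,

    fn advance(self: *DurableViewSketch, view_new: u32) void {
        self.view = view_new;
        self.update();
    }

    fn update(self: *DurableViewSketch) void {
        if (self.updating) return; // The in-flight write's callback re-checks.
        self.updating = true;
        // (A real implementation starts an asynchronous superblock write here.)
    }

    fn write_done(self: *DurableViewSketch, view_written: u32) void {
        self.updating = false;
        self.view_durable = view_written;
        // Coalesced catch-up: one more write covers any views that advanced
        // while the previous write was in flight.
        if (self.view_durable < self.view) self.update();
    }
};

test "updates coalesce while a write is in flight" {
    var sketch = DurableViewSketch{};
    sketch.advance(1); // Starts a write for view 1.
    sketch.advance(2); // Coalesced: no second write is queued.
    sketch.write_done(1);
    try std.testing.expect(sketch.updating); // The callback re-armed the update.
    sketch.write_done(2);
    try std.testing.expectEqual(@as(u32, 2), sketch.view_durable);
    try std.testing.expect(!sketch.updating);
}
```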
@@ -5130,6 +4995,7 @@ pub fn ReplicaType(
                    // It will be set shortly, when we transition to normal status.
                    assert(self.view == 0);
                },
+                .recovering_head => unreachable,
            }

            // Uncommitted ops may not survive a view change so we must assert `op` against
@@ -5156,9 +5022,7 @@ pub fn ReplicaType(
                });
            }

-            assert(commit_max >=
-                self.commit_max - std.math.min(constants.pipeline_max, self.commit_max));
-
+            assert(commit_max >= self.commit_max -| constants.pipeline_prepare_queue_max);
            assert(self.commit_min <= self.commit_max);
            assert(self.op >= self.commit_max or self.op < self.commit_max);

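The rewritten assertion uses Zig's saturating subtraction operator `-|`, which clamps at zero instead of tripping the unsigned-overflow safety check, so the removed explicit `std.math.min` clamp is no longer needed. A small self-contained test of the equivalence (assuming a Zig toolchain of this era):

```zig
const std = @import("std");

test "saturating subtraction clamps at zero" {
    const commit_max: u64 = 3;
    const queue_max: u64 = 8;
    // `a -| b` saturates at the type's minimum rather than panicking on overflow.
    try std.testing.expectEqual(@as(u64, 0), commit_max -| queue_max);
    // Equivalent to the older formulation with an explicit clamp:
    try std.testing.expectEqual(
        commit_max - std.math.min(queue_max, commit_max),
        commit_max -| queue_max,
    );
}
```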
@@ -5201,48 +5065,84 @@ pub fn ReplicaType(
        /// where the new primary's headers depends on which of replica 1 and 2's DVC is used
        /// for repair before the other (i.e. whether they repair op 6 or 7 first).
        ///
-        /// For the above case to occur, replicas 0, 1, and 2 must all share the highest `
-        /// And since they share the latest `
+        /// For the above case to occur, replicas 0, 1, and 2 must all share the highest `log_view`.
+        /// And since they share the latest `log_view`, ops 5,6,7 were just installed by
        /// `replace_header`, which is order-independent (it doesn't use the hash chain).
        ///
-        /// (If replica 0's
+        /// (If replica 0's log_view was greater than 1/2's, then replica 0 must have all
        /// headers from previous views. Which means 6,7 are from the current view. But since
-        /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest
+        /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest log_view. ∎)
        fn primary_set_log_from_do_view_change_messages(self: *Self) void {
            assert(self.status == .view_change);
            assert(self.primary_index(self.view) == self.replica);
            assert(self.replica_count > 1);
            assert(self.start_view_change_quorum);
            assert(self.do_view_change_quorum);
+            assert(self.do_view_change_from_all_replicas[self.replica] != null);
+            DVCQuorum.verify(self.do_view_change_from_all_replicas);

-            const
-            assert(
-            assert(do_view_change_head.op >= self.commit_min);
-            assert(do_view_change_head.op >= do_view_change_head.commit_min_max);
-            assert(do_view_change_head.commit_min_max >= self.commit_min);
+            const dvcs_all = DVCQuorum.dvcs_all(self.do_view_change_from_all_replicas);
+            assert(dvcs_all.len == self.quorum_view_change);

-
-
+            const dvcs_canonical = DVCQuorum.dvcs_canonical(self.do_view_change_from_all_replicas);
+            assert(dvcs_canonical.len > 0);
+
+            for (dvcs_all.constSlice()) |message| {
+                log.debug(
+                    "{}: on_do_view_change: dvc: " ++
+                        "replica={} log_view={} op={} commit_min={}",
+                    .{
+                        self.replica,
+                        message.header.replica,
+                        @intCast(u32, message.header.timestamp),
+                        message.header.op,
+                        message.header.commit, // The `commit_min` of the replica.
+                    },
+                );
+            }
+
+            for (dvcs_canonical.constSlice()) |message| {
+                for (message_body_as_headers_chain_disjoint(message)) |*header| {
+                    log.debug(
+                        "{}: on_do_view_change: canonical: replica={} op={} checksum={}",
+                        .{
+                            self.replica,
+                            message.header.replica,
+                            header.op,
+                            header.checksum,
+                        },
+                    );
+                }
+            }
+
+            const do_view_change_commit_min_max = DVCQuorum.commit_min_max(
+                self.do_view_change_from_all_replicas,
+                .{
+                    .replica = self.replica,
+                    .commit_min = self.commit_min,
+                },
+            );
+            assert(do_view_change_commit_min_max >= self.commit_min);
+
+            // The `prepare_timestamp` prevents a primary's own clock from running backwards.
+            // Therefore, `prepare_timestamp`:
            // 1. is advanced if behind the cluster, but never reset if ahead of the cluster, i.e.
            // 2. may not always reflect the timestamp of the latest prepared op, and
            // 3. should be advanced before discarding the timestamps of any uncommitted headers.
-
-
+            const timestamp_max = DVCQuorum.timestamp_max(self.do_view_change_from_all_replicas);
+            if (self.state_machine.prepare_timestamp < timestamp_max) {
+                self.state_machine.prepare_timestamp = timestamp_max;
            }

-
-
-
-
-
-
-
-
-            assert(op_canonical >= self.op -| constants.pipeline_max);
-            assert(op_canonical >= self.commit_min);
-
-            if (do_view_change_head.op > self.op_checkpoint_trigger()) {
+            var headers_canonical = DVCQuorum.headers_canonical(self.do_view_change_from_all_replicas);
+            const header_head = headers_canonical.next().?;
+            assert(header_head.op == header_head.op);
+            assert(header_head.op >= do_view_change_commit_min_max);
+            assert(header_head.op >= self.op_checkpoint());
+            assert(header_head.op >= self.commit_min);
+            assert(header_head.op >= self.commit_max);
+
+            if (header_head.op > self.op_checkpoint_trigger()) {
                // This replica is too far behind, i.e. the new `self.op` is too far ahead of the
                // last checkpoint. If we wrap now, we overwrite un-checkpointed transfers in the WAL,
                // precluding recovery.
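The hunk above computes the new primary's commit floor from the DVC quorum. Since DVCs carry each replica's `commit_min` (not `commit_max`), the floor is the maximum `commit_min` across the quorum and the primary's own, and never regresses below either. A minimal sketch of that fold — illustration only, not the package's `DVCQuorum.commit_min_max`:

```zig
const std = @import("std");

fn commit_min_max(dvc_commit_mins: []const u64, local_commit_min: u64) u64 {
    var max = local_commit_min;
    for (dvc_commit_mins) |commit_min| {
        max = std.math.max(max, commit_min);
    }
    return max;
}

test "the commit floor never regresses" {
    try std.testing.expectEqual(@as(u64, 9), commit_min_max(&.{ 4, 9, 7 }, 6));
    try std.testing.expectEqual(@as(u64, 8), commit_min_max(&.{ 4, 5 }, 8));
}
```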
@@ -5253,32 +5153,40 @@ pub fn ReplicaType(
            }

            self.set_op_and_commit_max(
-
-                // `set_op_and_commit_max()` expects the highest commit_max that we know of.
-                // But DVCs include replica's `commit_min`, not `commit_max`.
+                header_head.op,
                std.math.max(
                    self.commit_max,
-
+                    std.math.max(
+                        // `set_op_and_commit_max()` expects the highest commit_max that we know of.
+                        // But DVCs include replica's `commit_min`, not `commit_max`.
+                        do_view_change_commit_min_max,
+                        // An op cannot be uncommitted if it is definitely outside the pipeline.
+                        // Use `do_view_change_op_head` instead of `replica.op` since the former is
+                        // about to become the new `replica.op`.
+                        header_head.op -| constants.pipeline_prepare_queue_max,
+                    ),
                ),
                "on_do_view_change",
            );
-            // "`replica.op` exists" invariant may be broken
-            //
-
-
-            for (self.do_view_change_from_all_replicas) |received| {
-                if (received) |message| {
-                    const view_normal = @intCast(u32, message.header.timestamp);
-                    // The view in which this replica's status was normal must be before this view.
-                    assert(view_normal < message.header.view);
+            // "`replica.op` exists" invariant may be broken briefly between set_op_and_commit_max()
+            // and replace_header().
+            self.replace_header(&header_head);
+            assert(self.journal.header_with_op(self.op) != null);

-
-
+            while (headers_canonical.next()) |header| {
+                assert(header.op < header_head.op);
+                self.replace_header(&header);
+            }

-
-
+            const dvcs_uncanonical =
+                DVCQuorum.dvcs_uncanonical(self.do_view_change_from_all_replicas);
+            for (dvcs_uncanonical.constSlice()) |message| {
+                for (message_body_as_headers_chain_disjoint(message)) |*header| {
+                    // We must trust headers that other replicas have committed, because
+                    // repair_header() will not repair a header if the hash chain has a gap.
+                    if (header.op <= message.header.commit) {
                        log.debug(
-                            "{}: on_do_view_change:
+                            "{}: on_do_view_change: committed: replica={} op={} checksum={}",
                            .{
                                self.replica,
                                message.header.replica,
@@ -5286,295 +5194,98 @@ pub fn ReplicaType(
                                header.checksum,
                            },
                        );
-
-
-
-                    }
-
-            // Since we used do_view_change_head to set the replica.op, it must have been loaded
-            // into the headers (if it wasn't present already).
-            assert(self.journal.header_with_op(self.op) != null);
-
-            // Now that the canonical headers are all in place, repair any other headers:
-            for (self.do_view_change_from_all_replicas) |received| {
-                if (received) |message| {
-                    const view_normal = @intCast(u32, message.header.timestamp);
-                    assert(view_normal < message.header.view);
-
-                    if (view_normal == view_normal_canonical) continue;
-                    assert(view_normal < view_normal_canonical);
-
-                    for (message_body_as_headers(message)) |*header| {
-                        // We must trust headers that other replicas have committed, because
-                        // repair_header() will not repair a header if the hash chain has a gap.
-                        if (header.op <= message.header.commit) {
-                            log.debug(
-                                "{}: on_do_view_change: committed: replica={} op={} checksum={}",
-                                .{
-                                    self.replica,
-                                    message.header.replica,
-                                    header.op,
-                                    header.checksum,
-                                },
-                            );
-                            self.replace_header(header);
-                        } else {
-                            _ = self.repair_header(header);
-                        }
+                        self.replace_header(header);
+                    } else {
+                        _ = self.repair_header(header);
                    }
                }
            }
-
-            const op_max = self.do_view_change_op_max(op_canonical);
-            assert(op_max <= self.op);
-            assert(op_max >= self.commit_min);
-            if (op_max != self.op) {
-                log.debug("{}: primary_set_log_from_do_view_change_messages: discard op={}..{}", .{
-                    self.replica,
-                    op_max + 1,
-                    self.op,
-                });
-                self.journal.remove_entries_from(op_max + 1);
-                self.op = op_max;
-            }
-            assert(self.journal.header_with_op(self.op) != null);
        }

-        fn
-        /// The highest `view_normal` of any DVC.
-        ///
-        /// The headers bundled with DVCs with the highest `view_normal` are canonical, since
-        /// the replica has knowledge of previous view changes in which headers were replaced.
-        view_normal: u32,
-        /// The highest `commit_min` from any DVC (this is not a `commit_max`).
-        commit_min_max: u64,
-        /// The highest `op` from a DVC with the highest `view_normal`.
-        op: u64,
-        /// The higest timestamp from any DVC.
-        timestamp: u64,
-        } {
+        fn primary_start_view_as_the_new_primary(self: *Self) void {
            assert(self.status == .view_change);
            assert(self.primary_index(self.view) == self.replica);
-            assert(self.
-            assert(self.start_view_change_quorum);
+            assert(self.view == self.log_view);
            assert(self.do_view_change_quorum);
-            assert(self.
-
-            var v: ?u32 = null; // The highest `view_normal` from any replica.
-            var n: ?u64 = null; // The highest `op` for the highest `view_normal` from any replica.
-            var k: ?u64 = null; // The highest `commit_min` from any replica.
-            var t: ?u64 = null; // The highest `timestamp` from any replica.
-
-            for (self.do_view_change_from_all_replicas) |received, replica| {
-                if (received) |message| {
-                    assert(message.header.command == .do_view_change);
-                    assert(message.header.cluster == self.cluster);
-                    assert(message.header.replica == replica);
-                    assert(message.header.view == self.view);
-                    assert(message.header.op >= message.header.commit);
-                    assert(message.header.op - message.header.commit <= constants.journal_slot_count);
-
-                    // The view when this replica was last in normal status, which:
-                    // * may be higher than the view in any of the prepare headers.
-                    // * must be lower than the view of this view change.
-                    const view_normal = @intCast(u32, message.header.timestamp);
-                    assert(view_normal < message.header.view);
-
-                    if (replica == self.replica) {
-                        assert(view_normal == self.view_normal);
-                        assert(message.header.op == self.op);
-                        // We may have a newer commit than our DVC due to async commits (see below).
-                        assert(message.header.commit <= self.commit_min);
-                    }
+            assert(!self.pipeline_repairing);
+            assert(self.primary_repair_pipeline() == .done);

-
-
-
-
-                    message.header.replica,
-                    view_normal,
-                    message.header.op,
-                    message.header.commit, // The `commit_min` of the replica.
-                },
-            );
+            assert(self.commit_min == self.commit_max);
+            assert(self.journal.dirty.count == 0);
+            assert(self.journal.faulty.count == 0);
+            assert(self.nack_prepare_op == null);
+            assert(self.valid_hash_chain_between(self.commit_min, self.op));

-
-
-
-
-
-
+            {
+                const pipeline_queue = self.primary_repair_pipeline_done();
+                assert(pipeline_queue.request_queue.empty());
+                assert(pipeline_queue.prepare_queue.count + self.commit_max == self.op);
+                if (!pipeline_queue.prepare_queue.empty()) {
+                    const prepares = &pipeline_queue.prepare_queue;
+                    assert(prepares.head_ptr_const().?.message.header.op == self.commit_max + 1);
+                    assert(prepares.tail_ptr_const().?.message.header.op == self.op);
+                }

-
+                var pipeline_prepares = pipeline_queue.prepare_queue.iterator();
+                while (pipeline_prepares.next()) |prepare| {
+                    assert(self.journal.has(prepare.message.header));
+                    assert(!prepare.ok_quorum_received);
+                    assert(prepare.ok_from_all_replicas.count() == 0);

-
-
-
-
+                    log.debug("{}: start_view_as_the_new_primary: pipeline " ++
+                        "(op={} checksum={x} parent={x})", .{
+                        self.replica,
+                        prepare.message.header.op,
+                        prepare.message.header.checksum,
+                        prepare.message.header.parent,
+                    });
                }
-            }

-
-
-
-            // 3. Finish committing op=N…M.
-            // 4. Remaining `do_view_change` messages arrive, completing the quorum.
-            // In this scenario, our own DVC's commit is `N-1`, but `commit_min=M`.
-            // Don't let the commit backtrack.
-            if (k.? < self.commit_min) {
-                assert(self.commit_min >
-                    self.do_view_change_from_all_replicas[self.replica].?.header.commit);
-                log.debug("{}: on_do_view_change: bump commit_min view={} commit={}..{}", .{
-                    self.replica,
-                    self.view,
-                    k.?,
-                    self.commit_min,
-                });
-                k = self.commit_min;
+                self.pipeline.cache.deinit(self.message_bus.pool);
+                self.pipeline = .{ .queue = pipeline_queue };
+                self.pipeline.queue.verify();
            }

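Before the repaired queue is installed above, the code asserts that its prepares span exactly the uncommitted ops `(commit_max, op]`, head to tail, with no gaps. A minimal Zig sketch of that coverage invariant — illustration only, with bare op numbers standing in for prepares:

```zig
const std = @import("std");

fn pipeline_covers(commit_max: u64, op_head: u64, prepare_ops: []const u64) bool {
    if (prepare_ops.len != op_head - commit_max) return false;
    for (prepare_ops) |op, i| {
        // Ops must be consecutive, starting immediately after commit_max.
        if (op != commit_max + 1 + @intCast(u64, i)) return false;
    }
    return true;
}

test "the repaired queue covers exactly the uncommitted ops" {
    try std.testing.expect(pipeline_covers(3, 6, &.{ 4, 5, 6 }));
    try std.testing.expect(!pipeline_covers(3, 6, &.{ 4, 6 })); // gap
    try std.testing.expect(!pipeline_covers(3, 6, &.{ 4, 5 })); // short
}
```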
-            assert(v.? >= self.view_normal);
-            assert(k.? >= self.commit_min);
-
-            return .{
-                .view_normal = v.?,
-                .commit_min_max = k.?,
-                .op = n.?,
-                .timestamp = t.?,
-            };
-        }
-
-        /// Identify headers to discard during a view change before the primary starts the view.
-        /// This is required to maximize availability in the presence of storage faults.
-        /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
-        ///
-        /// Returns the highest op that:
-        /// - precedes any hash chain breaks in the uncanonical headers, and
-        /// - precedes any gaps in the uncommitted headers.
-        ///
-        /// Breaks
-        ///
-        /// If there is a hash chain break, none of the headers from the canonical DVCs replaced
-        /// the broken (leftover uncanonical) op.
-        /// Removing these is necessary for correctness and liveness, to ensure that
-        /// disconnected headers do not remain in place in lieu of gaps.
-        ///
-        /// Gaps
-        ///
-        /// It is possible for the new primary to have done an op jump in a previous view, and
-        /// introduced a header gap for an op, which may have then been discarded by another primary
-        /// during a view change, before surviving into this view as a gap because our latest op was
-        /// set as the latest op for the quorum.
-        ///
-        /// In this case, it may be impossible for the new primary to repair the missing header as
-        /// the rest of the cluster may have already discarded it. We therefore iterate over our
-        /// uncommitted header gaps to discard any that may be impossible to repair.
-        ///
-        /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
-        /// prepared on another replica before the old primary crashes, then this function finds a
-        /// gap for ops=7,8 and will attempt to discard ops 7,8,9.
-        fn do_view_change_op_max(self: *const Self, op_canonical: u64) u64 {
-            assert(self.replica_count > 1);
-            assert(self.status == .view_change);
-            assert(self.primary_index(self.view) == self.replica);
-            assert(self.do_view_change_quorum);
-            assert(!self.repair_timeout.ticking);
-            assert(self.op >= self.commit_max);
-            // At least one replica in the new quorum committed in the new replica.op's WAL wrap —
-            // wrapping implies a checkpoint (which implies a commit).
-            assert(self.op - self.commit_max <= constants.journal_slot_count);
-            assert(self.op - self.commit_min <= constants.journal_slot_count);
-
-            assert(op_canonical <= self.op);
-            assert(op_canonical >= self.commit_min);
-
-            // Any uncanonical ops remaining either:
-            // * Connect to the hash chain on the right.
-            // * Do not connect on the right (hash chain break).
-            //
-            // If there is a hash chain break, none of the headers from the canonical DVCs replaced
-            // the broken op. It is truncated like a gap.
-            //
-            // Removing these is necessary for correctness and liveness, to ensure that
-            // disconnected headers do not remain in place in lieu of gaps.
-            const op_before_break = blk: {
-                var op: u64 = op_canonical;
-                while (op < self.op) : (op += 1) {
-                    if (self.journal.header_with_op(op)) |header| {
-                        if (self.journal.header_with_op(op + 1)) |next| {
-                            // Broken hash chain.
-                            if (header.checksum != next.parent) break :blk op;
-                        }
-                    }
-                } else break :blk self.op;
-            };
-
-            // Find the beginning of the lowest gap.
-            //
-            // While iterating > commit_max does not in itself guarantee that an op is uncommitted
-            // (the old primary may have committed the op shortly before crashing), nevertheless,
-            // if it was committed it would have survived into the new view as a header not a gap.
-            const op_before_gap = blk: {
-                // An op cannot be uncommitted if it is definitely outside the pipeline.
-                const op_committed = std.math.max(self.commit_max, self.op -| constants.pipeline_max);
-                assert(op_committed <= self.op);
-
-                var op = op_committed;
-                while (op < self.op) : (op += 1) {
-                    if (self.journal.header_with_op(op + 1) == null) break :blk op;
-                } else break :blk self.op;
-            };
-
-            return std.math.min(op_before_break, op_before_gap);
-        }
-
-        fn start_view_as_the_new_primary(self: *Self) void {
-            assert(self.status == .view_change);
-            assert(self.primary_index(self.view) == self.replica);
-            assert(self.do_view_change_quorum);
-            assert(!self.repairing_pipeline);
-
-            assert(self.commit_min == self.commit_max);
-            assert(self.primary_repair_pipeline_op() == null);
-            self.verify_pipeline();
-            assert(self.commit_max + self.pipeline.count == self.op);
-            assert(self.valid_hash_chain_between(self.commit_min, self.op));
-
-            assert(self.journal.dirty.count == 0);
-            assert(self.journal.faulty.count == 0);
-            assert(self.nack_prepare_op == null);
-
-            const start_view = self.create_view_change_message(.start_view);
-            defer self.message_bus.unref(start_view);
-
            self.transition_to_normal_from_view_change_status(self.view);
-
-            assert(self.commit_max + self.pipeline.count == self.op);
+            self.view_durable_update();

            assert(self.status == .normal);
            assert(self.primary());

-            assert(start_view.references == 1);
-            assert(start_view.header.command == .start_view);
-            assert(start_view.header.view == self.view);
-            assert(start_view.header.op == self.op);
-            assert(start_view.header.commit == self.commit_max);
-
            // Send prepare_ok messages to ourself to contribute to the pipeline.
            self.send_prepare_oks_after_view_change();

-
+            // SVs will be sent out (via timeout) after the view_durable update completes.
+            assert(self.view_durable_updating());
+            assert(self.log_view > self.log_view_durable());
        }

-        fn
+        fn transition_to_recovering_head(self: *Self) void {
            assert(self.status == .recovering);
-            assert(self.view ==
+            assert(self.view == self.log_view);
+            assert(self.op >= self.commit_min);
+            assert(!self.committing);
+            assert(self.replica_count > 1);
+            assert(self.journal.header_with_op(self.op) != null);
+            assert(self.pipeline == .cache);
+
+            self.status = .recovering_head;
+
+            log.warn("{}: transition_to_recovering_head: op_checkpoint={} op_head={}", .{
+                self.replica,
+                self.op_checkpoint(),
+                self.op,
+            });
+        }
+
+        fn transition_to_normal_from_recovering_status(self: *Self) void {
+            assert(self.status == .recovering or self.status == .recovering_head);
+            assert(self.view == self.log_view);
            assert(!self.committing);
-            assert(self.replica_count > 1 or
+            assert(self.replica_count > 1 or self.commit_min == self.op);
            assert(self.journal.header_with_op(self.op) != null);
-            self.
-
+            assert(self.pipeline == .cache);
+
            self.status = .normal;

            if (self.primary()) {
@@ -5586,7 +5297,7 @@ pub fn ReplicaType(
                    },
                );

-                assert(self.
+                assert(self.replica_count == 1);
                assert(!self.prepare_timeout.ticking);
                assert(!self.normal_status_timeout.ticking);
                assert(!self.view_change_status_timeout.ticking);
@@ -5595,7 +5306,9 @@ pub fn ReplicaType(
                self.ping_timeout.start();
                self.commit_timeout.start();
                self.repair_timeout.start();
-
+
+                self.pipeline.cache.deinit(self.message_bus.pool);
+                self.pipeline = .{ .queue = .{} };
            } else {
                log.debug(
                    "{}: transition_to_normal_from_recovering_status: view={} backup",
@@ -5613,31 +5326,30 @@ pub fn ReplicaType(
                self.ping_timeout.start();
                self.normal_status_timeout.start();
                self.repair_timeout.start();
-                self.recovery_timeout.stop();
            }
        }

-        fn transition_to_normal_from_view_change_status(self: *Self,
+        fn transition_to_normal_from_view_change_status(self: *Self, view_new: u32) void {
            // In the VRR paper it's possible to transition from normal to normal for the same view.
            // For example, this could happen after a state transfer triggered by an op jump.
            assert(self.status == .view_change);
-            assert(
+            assert(view_new >= self.view);
            assert(self.journal.header_with_op(self.op) != null);
-
-            self.view_normal = new_view;
+
            self.status = .normal;

            if (self.primary()) {
                log.debug(
-                    "{}: transition_to_normal_from_view_change_status: view={} primary",
-                    .{
-                        self.replica,
-                        self.view,
-                    },
+                    "{}: transition_to_normal_from_view_change_status: view={}..{} primary",
+                    .{ self.replica, self.view, view_new },
                );

                assert(!self.prepare_timeout.ticking);
-                assert(!self.
+                assert(!self.pipeline_repairing);
+                assert(self.pipeline == .queue);
+                assert(self.view == view_new);
+                assert(self.log_view == view_new);
+                assert(self.commit_min == self.commit_max);

                self.ping_timeout.start();
                self.commit_timeout.start();
@@ -5647,15 +5359,25 @@ pub fn ReplicaType(
                self.repair_timeout.start();

                // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
-                if (self.pipeline.count > 0) self.prepare_timeout.start();
+                if (self.pipeline.queue.prepare_queue.count > 0) self.prepare_timeout.start();
            } else {
-                log.debug("{}: transition_to_normal_from_view_change_status: view={} backup", .{
+                log.debug("{}: transition_to_normal_from_view_change_status: view={}..{} backup", .{
                    self.replica,
                    self.view,
+                    view_new,
                });

                assert(!self.prepare_timeout.ticking);
-                assert(
+                assert(self.pipeline == .cache);
+
+                if (self.log_view == view_new and self.view == view_new) {
+                    // We recovered into the same view we crashed in, with a detour through
+                    // status=recovering_head.
+                } else {
+                    self.view = view_new;
+                    self.log_view = view_new;
+                    self.view_durable_update();
+                }

                self.ping_timeout.start();
                self.commit_timeout.stop();
@@ -5668,7 +5390,6 @@ pub fn ReplicaType(
            self.reset_quorum_start_view_change();
            self.reset_quorum_do_view_change();
            self.reset_quorum_nack_prepare();
-            self.reset_quorum_prepare_ok();

            assert(self.start_view_change_quorum == false);
            assert(self.do_view_change_quorum == false);
@@ -5680,17 +5401,34 @@ pub fn ReplicaType(
        /// where v identifies the new view. A replica notices the need for a view change either
        /// based on its own timer, or because it receives a start_view_change or do_view_change
        /// message for a view with a larger number than its own view.
-        fn transition_to_view_change_status(self: *Self,
+        fn transition_to_view_change_status(self: *Self, view_new: u32) void {
            log.debug("{}: transition_to_view_change_status: view={}..{}", .{
                self.replica,
                self.view,
-
+                view_new,
            });
-            assert(self.status == .normal or
-
-
+            assert(self.status == .normal or
+                self.status == .view_change or
+                self.status == .recovering or
+                self.status == .recovering_head);
+
+            const status_before = self.status;
            self.status = .view_change;

+            if (self.view == view_new) {
+                assert(status_before == .recovering or status_before == .recovering_head);
+            } else {
+                assert(view_new > self.view);
+                self.view = view_new;
+                self.view_durable_update();
+            }
+
+            if (self.pipeline == .queue) {
+                var queue = self.pipeline.queue;
+                self.pipeline = .{ .cache = PipelineCache.init_from_queue(&queue) };
+                queue.deinit(self.message_bus.pool);
+            }
+
            self.ping_timeout.stop();
            self.commit_timeout.stop();
            self.normal_status_timeout.stop();
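The pipeline is now a tagged union: the primary drives a `queue` during normal status, while every replica holds a `cache` across view changes, and entering a view change folds the queue into the cache. A minimal Zig sketch of that state machine — illustration only, with small arrays standing in for PipelineQueue/PipelineCache:

```zig
const std = @import("std");

const PipelineSketch = union(enum) {
    cache: [4]?u64,
    queue: [4]?u64,
};

fn fold_into_cache(pipeline: *PipelineSketch) void {
    switch (pipeline.*) {
        .queue => |prepares| pipeline.* = .{ .cache = prepares },
        .cache => {}, // Backups already hold a cache.
    }
}

test "the queue is folded into the cache exactly once" {
    var pipeline = PipelineSketch{ .queue = .{ 1, 2, null, null } };
    fold_into_cache(&pipeline);
    try std.testing.expect(pipeline == .cache);
    fold_into_cache(&pipeline); // Idempotent, as for a backup.
    try std.testing.expect(pipeline == .cache);
}
```

Only the new primary converts back, by rebuilding a queue from the cache via the repair path shown earlier.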
@@ -5698,7 +5436,6 @@ pub fn ReplicaType(
|
|
|
5698
5436
|
self.view_change_message_timeout.start();
|
|
5699
5437
|
self.repair_timeout.stop();
|
|
5700
5438
|
self.prepare_timeout.stop();
|
|
5701
|
-
assert(!self.recovery_timeout.ticking);
|
|
5702
5439
|
|
|
5703
5440
|
// Do not reset quorum counters only on entering a view, assuming that the view will be
|
|
5704
5441
|
// followed only by a single subsequent view change to the next view, because multiple
|
|
@@ -5708,7 +5445,6 @@ pub fn ReplicaType(
|
|
|
5708
5445
|
self.reset_quorum_start_view_change();
|
|
5709
5446
|
self.reset_quorum_do_view_change();
|
|
5710
5447
|
self.reset_quorum_nack_prepare();
|
|
5711
|
-
self.reset_quorum_prepare_ok();
|
|
5712
5448
|
|
|
5713
5449
|
assert(self.start_view_change_quorum == false);
|
|
5714
5450
|
assert(self.do_view_change_quorum == false);
|
|
@@ -5785,7 +5521,7 @@ pub fn ReplicaType(
|
|
|
5785
5521
|
fn valid_hash_chain_between(self: *const Self, op_min: u64, op_max: u64) bool {
|
|
5786
5522
|
assert(op_min <= op_max);
|
|
5787
5523
|
// Headers with ops preceding the checkpoint may be unavailable due to a WAL wrap.
|
|
5788
|
-
assert(op_min >= self.op_checkpoint);
|
|
5524
|
+
assert(op_min >= self.op_checkpoint());
|
|
5789
5525
|
|
|
5790
5526
|
// If we use anything less than self.op then we may commit ops for a forked hash chain
|
|
5791
5527
|
// that have since been reordered by a new primary.
|
|
@@ -5796,7 +5532,7 @@ pub fn ReplicaType(
|
|
|
5796
5532
|
while (op > op_min) {
|
|
5797
5533
|
op -= 1;
|
|
5798
5534
|
|
|
5799
|
-
if (self.op_checkpoint == op) {
|
|
5535
|
+
if (self.op_checkpoint() == op) {
|
|
5800
5536
|
// op_checkpoint's slot may have been overwritten in the WAL — but we can
|
|
5801
5537
|
// always use the VSRState to anchor the hash chain.
|
|
5802
5538
|
assert(op == op_min);
|
|
@@ -5807,7 +5543,7 @@ pub fn ReplicaType(
|
|
|
5807
5543
|
log.debug("{}: valid_hash_chain_between: break A: {} (checkpoint={})", .{
|
|
5808
5544
|
self.replica,
|
|
5809
5545
|
self.superblock.working.vsr_state.commit_min_checksum,
|
|
5810
|
-
self.op_checkpoint,
|
|
5546
|
+
self.op_checkpoint(),
|
|
5811
5547
|
});
|
|
5812
5548
|
log.debug("{}: valid_hash_chain_between: break B: {}", .{
|
|
5813
5549
|
self.replica,
|
|
@@ -5836,37 +5572,6 @@ pub fn ReplicaType(
|
|
|
5836
5572
|
return true;
|
|
5837
5573
|
}
|
|
5838
5574
|
|
|
5839
|
-
fn verify_pipeline(self: *Self) void {
|
|
5840
|
-
assert(self.status == .view_change);
|
|
5841
|
-
|
|
5842
|
-
var op = self.commit_max + 1;
|
|
5843
|
-
var parent = self.journal.header_with_op(self.commit_max).?.checksum;
|
|
5844
|
-
|
|
5845
|
-
var iterator = self.pipeline.iterator();
|
|
5846
|
-
while (iterator.next_ptr()) |prepare| {
|
|
5847
|
-
assert(prepare.message.header.command == .prepare);
|
|
5848
|
-
assert(!prepare.ok_quorum_received);
|
|
5849
|
-
assert(prepare.ok_from_all_replicas.count() == 0);
|
|
5850
|
-
|
|
5851
|
-
log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
|
|
5852
|
-
self.replica,
|
|
5853
|
-
prepare.message.header.op,
|
|
5854
|
-
prepare.message.header.checksum,
|
|
5855
|
-
prepare.message.header.parent,
|
|
5856
|
-
});
|
|
5857
|
-
|
|
5858
|
-
assert(self.journal.has(prepare.message.header));
|
|
5859
|
-
assert(prepare.message.header.op == op);
|
|
5860
|
-
assert(prepare.message.header.op <= self.op);
|
|
5861
|
-
assert(prepare.message.header.parent == parent);
|
|
5862
|
-
|
|
5863
|
-
parent = prepare.message.header.checksum;
|
|
5864
|
-
op += 1;
|
|
5865
|
-
}
|
|
5866
|
-
assert(self.pipeline.count <= constants.pipeline_max);
|
|
5867
|
-
assert(self.commit_max + self.pipeline.count == op - 1);
|
|
5868
|
-
}
|
|
5869
|
-
|
|
5870
5575
|
fn view_jump(self: *Self, header: *const Header) void {
|
|
5871
5576
|
const to: Status = switch (header.command) {
|
|
5872
5577
|
.prepare, .commit => .normal,
|
|
@@ -5874,7 +5579,10 @@
                 else => unreachable,
             };
 
-
+            switch (self.status) {
+                .normal, .view_change, .recovering_head => {},
+                .recovering => return,
+            }
 
             if (header.view < self.view) return;
 
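The added `switch` makes the view-jump gate exhaustive over replica status: a replica that is plainly `recovering` must not react to view information at all, while `recovering_head` (a replica whose WAL head is untrusted) may still follow the protocol far enough to obtain a `start_view`. A sketch of the status set this switch assumes, listing only the variants that appear in the surrounding hunks:

    // Illustrative sketch of the assumed Status enum.
    const Status = enum {
        normal,
        view_change,
        recovering,
        recovering_head,
    };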
@@ -5898,18 +5606,20 @@
                     .view_change => if (header.view == self.view) return,
                     else => unreachable,
                 },
+                .recovering_head => {},
                 else => unreachable,
             }
 
             switch (to) {
                 .normal => {
                     if (header.view == self.view) {
-                        assert(self.status == .view_change);
+                        assert(self.status == .view_change or self.status == .recovering_head);
 
                         log.debug("{}: view_jump: waiting to exit view change", .{self.replica});
                     } else {
                         assert(header.view > self.view);
-                        assert(self.status == .view_change or self.status == .normal);
+                        assert(self.status == .view_change or self.status == .recovering_head or
+                            self.status == .normal);
 
                         log.debug("{}: view_jump: waiting to jump to newer view", .{self.replica});
                     }
@@ -5924,8 +5634,10 @@
                     });
                 },
                 .view_change => {
-                    assert(header.view > self.view);
-                    assert(self.status == .view_change or self.status == .normal);
+                    assert(self.status == .recovering_head or header.view > self.view);
+                    assert(self.status != .recovering_head or header.command == .start_view);
+                    assert(self.status == .recovering_head or self.status == .view_change or
+                        self.status == .normal);
 
                     if (header.view == self.view + 1) {
                         log.debug("{}: view_jump: jumping to view change", .{self.replica});
@@ -5944,10 +5656,10 @@
             assert(message.header.view <= self.view);
             assert(message.header.op <= self.op);
 
-            if (message.header.op == self.op_checkpoint) {
+            if (message.header.op == self.op_checkpoint()) {
                 assert(message.header.op == 0);
             } else {
-                assert(message.header.op > self.op_checkpoint);
+                assert(message.header.op > self.op_checkpoint());
             }
 
             if (!self.journal.has(message.header)) {
@@ -5968,6 +5680,18 @@
                 return;
             }
 
+            // Criteria for caching:
+            // - The primary does not update the cache since it is (or will be) reconstructing its
+            //   pipeline.
+            // - Cache uncommitted ops, since it will avoid a WAL read in the common case.
+            if (self.pipeline == .cache and
+                self.replica != self.primary_index(self.view) and
+                self.commit_min < message.header.op)
+            {
+                const prepare_evicted = self.pipeline.cache.insert(message.ref());
+                if (prepare_evicted) |m| self.message_bus.unref(m);
+            }
+
             self.journal.write_prepare(write_prepare_callback, message, trigger);
         }
 
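The added block caches a backup's uncommitted prepares before the WAL write, so a later repair can often be served from memory instead of a WAL read. The reference counting is the subtle part; a minimal sketch of the same insert/evict pattern as used in the hunk above, assuming a ref-counted `Message` (the wrapper function name is hypothetical):

    // Illustrative sketch: the cache takes its own reference, and whatever the
    // slot previously held is released exactly once.
    fn cache_prepare(self: *Self, message: *Message) void {
        if (self.pipeline.cache.insert(message.ref())) |evicted| {
            self.message_bus.unref(evicted);
        }
    }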
@@ -5993,3 +5717,832 @@ pub fn ReplicaType(
         }
     };
 }
+
+/// A do-view-change:
+/// - selects the view's head
+/// - discards uncommitted ops (to maximize availability in the presence of storage faults)
+/// - retains all committed ops
+/// - retains all possibly-committed ops (because they might be committed — we can't tell)
+///   (Some of these may be discarded during repair, via the nack protocol).
+/// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
+///
+/// Terminology:
+///
+/// - The *head* message (of a view) is the message (committed or uncommitted) within that view with
+///   the highest op.
+///
+/// - *gap*: There is a header for op X and X+n (n>1), but no header at op X+1.
+/// - *break*/*chain break*: The header for op X is not the parent of the header for op X+1.
+/// - *fork*: A correctness bug in which a committed (or possibly committed) message is discarded.
+///
+/// - An *uncanonical* message may have been removed/changed during a prior view.
+/// - A *canonical* message was part of the most recent log_view.
+/// - Canonical messages do not necessarily survive into the new view, but they take
+///   precedence over uncanonical messages.
+/// - Canonical messages may be committed or uncommitted.
+///
+/// - *DVC* refers to a command=do_view_change message.
+/// - *SV* refers to a command=start_view message.
+/// - The *pipeline suffix* is the last pipeline_prepare_queue_max messages of the log (counting
+///   backwards from the head op). For example, when pipeline_prepare_queue_max=3,
+///
+///   - the pipeline suffix of log "1,2,3,4,5" is "3,4,5".
+///   - the pipeline suffix of log "1,2,3,5" is "3,5".
+///
+///
+/// Invariants:
+///
+/// For each DVC message:
+///
+/// - The headers all belong to the same hash chain.
+///   - Reason: If multiple replicas with the same canonical log_view disagree about an op, the new
+///     primary could not determine which is correct.
+///   - Gaps are permitted, but the DVC-sender is responsible for ensuring they do not conceal
+///     chain breaks.
+///   - For example,
+///     - a DVC of 6a,8a is valid (6a/8a belong to the same chain).
+///     - a DVC of 6b,8a is invalid (the gap at 7 conceals a chain break).
+///     - a DVC of 6b,7b,8a is invalid (7b/8a is a chain break).
+///
+/// - The headers must connect to the cluster's committed ops (the "DVC anchor").
+///   This means that either:
+///   - the DVC includes the op=C header, or
+///   - the DVC includes the op=C+1 header (where C+1's parent is C).
+///   (Where `C = "DVC anchor" = max(replica.commit_min, replica.op -| pipeline_prepare_queue_max)`).
+///   - Reason: The new primary may truncate the entire pipeline (6-9) due to a gap (6),
+///     but afterwards it still requires a head op to repair/chain backward from.
+///     (According to the intersection property, a gap in the pipeline indicates an
+///     uncommitted op).
+///   - For example, given pipeline_prepare_queue_max=3:
+///     - a DVC of 7,8 is invalid if replica.commit_min=5.
+///     - a DVC of 7,8 is valid if replica.commit_min=6.
+///     - a DVC of 5,7,8 is valid. (5,_,7,8)
+///     - a DVC of 5,8 is valid. (5,_,_,8)
+///     - a DVC of 0,1 is valid.
+///
+/// Across all DVCs in the quorum:
+///
+/// - The headers of every DVC with the same log_view must not conflict.
+///   - In other words:
+///     dvc₁.headers[i].op == dvc₂.headers[j].op implies
+///     dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
+///   - Reason: the headers bundled with the DVC(s) with the highest log_view will be
+///     loaded into the new primary with `replace_header()`, not `repair_header()`.
+///
+/// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
+/// (e.g. the write is still queued). The header is either:
+///
+/// - committed — so another replica in the quorum must have a copy, according to the quorum
+///   intersection property. Or,
+/// - uncommitted — if the header is chosen, but cannot be recovered from any replica, then
+///   it will be discarded by the nack protocol.
+///
+///
+/// Examples
+///
+/// In these examples:
+/// - pipeline_prepare_queue_max=3
+/// - Brackets denote the suffix of the replica's log that is actually included in the DVC headers.
+/// - Parentheses denote a replica that did not participate in the DVC (for example, because it is
+///   partitioned).
+///
+/// Example 1: No gap in canonical headers
+///
+/// Consider a view change with DVCs:
+///
+///   replica   headers                     log_view
+///   0         1 [2 3 4b]                  4  (new primary)
+///   1         1  2 3 4a 5 6 [7 8 9]       5
+///   2        (1  2 3 4a 5 6  7 8 9)       5  (partitioned)
+///
+/// Replica 1's headers are canonical, so replica 0 constructs the log:
+///
+///   1 2 3 4b 7 8 9
+///
+/// The 5/6 gap conceals a hash break — 4b should be 4a.
+/// The new primary must initially keep all of these headers, and after the DVC quorum is handled,
+/// repair backwards from 7. (If it instead discarded at the gap (5…9), the log would fork (4a→4b).)
+///
+///
+/// Example 2: Gap in pipeline suffix
+///
+/// Consider a set of replicas performing a DVC:
+///
+///   replica   headers                     log_view
+///   0         1 [2 3 4b]                  4  (new primary)
+///   1         1  2 3 4a 5 6 8 9           5
+///   2        (1  2 3 ?  ? ? ? ? ?)        5  (partitioned)
+///
+/// Which headers should replica 1 include in its DVC?
+/// The cases can be distinguished by `log_view % replica_count`.
+///
+/// (These examples are still applicable if the gap is not in the first op of the pipeline suffix).
+///
+///
+/// Example 2a: Gap in the pipeline suffix of a retired primary
+///
+/// The replica was a primary during its retired log_view.
+/// It may have gaps or breaks in its pipeline suffix iff:
+/// - it didn't finish repairs before the next view change, and
+/// - some uncommitted ops were truncated during the DVC (since this "moves" the suffix backwards).
+///
+/// We cannot send op 6 in the DVC because if repairs did not complete, it may be the wrong message.
+///
+/// However, even though we may not have a full unbroken suffix of pipeline_prepare_queue_max
+/// messages, we know that our unbroken suffix (however long it may be) includes all
+/// possibly-committed messages, since otherwise the retired log_view would not have started.
+///
+/// Therefore, the retired primary sends a DVC with only the unbroken log suffix:
+///
+///   replica   headers
+///   1         1 2 3 4a 5 6 [8 9]  (retired primary)
+///
+///
+/// Example 2b: Gap in the pipeline suffix of a retired follower
+///
+/// The replica was a follower during its retired log_view.
+/// Followers always load a full suffix of headers from the view's SV message.
+/// If there is now a gap in the follower's suffix, this must be due to missed prepares.
+///
+/// Therefore, ops to the left of the gap (where the gap is within the suffix) are part of the
+/// suffix's hash chain, even though we cannot test this by chaining checksum/parent.
+///
+/// Therefore, the retired follower sends the DVC:
+///
+///   replica   headers
+///   1         1 2 3 4a 5 [6 8 9]  (retired follower)
+///
+///
+/// Example 3: Break in pipeline suffix
+///
+/// Consider a set of replicas performing a DVC:
+///
+///   replica   headers                     log_view
+///   0         1 [2 3 4b]                  4  (new primary)
+///   1         1  2 3 4b 5a 6a 7a [8b 9b]  5
+///   2        (1  2 3 4b 5b 6b 7b  8b 9b)  5  (partitioned)
+///
+/// (Note the chain break at replica 1's 7a/8b.)
+/// This scenario is exactly analogous to Example 2, except that it can only occur on a retired
+/// primary, never a retired follower.
+///
+/// The retired primary sends a DVC with only the unbroken log suffix:
+///
+///   replica   headers
+///   1         1 2 3 4b 5a 6a 7a [8b 9b]  (retired primary)
+///
+///
+/// Example 4: Gap in retired primary's suffix after recovery
+///
+/// Suppose that replica 1 starts a view as the primary of view 4, with the suffix:
+///
+///   log_view  4
+///   view      4
+///   journal   1 2 3
+///   head      3
+///
+/// During this view, it prepares several ops:
+///
+///   log_view  4
+///   view      4
+///   journal   1 2 3 4 5 6 7
+///   head      7
+///
+/// However, the WAL writes are reordered — the writes for ops 4,5,7 finish before op 6's write
+/// has begun:
+///
+///   log_view  4
+///   view      4
+///   journal   1 2 3 4 5 6 7
+///   wal       1 2 3 4 5 _ 7
+///   head      7
+///
+/// Replica 1 crashes and recovers, and immediately begins sending a DVC for view=5.
+/// Under normal circumstances, the retired primary cannot distinguish between a gap and a break
+/// due to the possibility that it did not complete repair (see Example 2a).
+/// In this instance though, the gap is safe to skip over because it is to the right of the durable
+/// SV's head (op=3).
+///
+///   log_view  4
+///   view      5
+///   journal   1 2 3 [4 5 _ 7]
+///   head      7
+///
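The anchor rule in the comment above is the load-bearing invariant: `C = max(replica.commit_min, replica.op -| pipeline_prepare_queue_max)`, and a valid DVC must contain the header for op C or op C+1. A minimal sketch of computing the anchor and checking a DVC against it, assuming `headers` is in descending op order as the comment requires (the helper names are illustrative, not from the package):

    // Illustrative sketch only. `-|` is Zig's saturating subtraction, so the
    // anchor cannot underflow near the start of the log.
    fn dvc_anchor(commit_min: u64, op_head: u64) u64 {
        return std.math.max(commit_min, op_head -| constants.pipeline_prepare_queue_max);
    }

    fn dvc_connects_to_anchor(headers: []const Header, anchor: u64) bool {
        // The DVC must include the op=C header, or the op=C+1 header.
        const op_min = headers[headers.len - 1].op;
        return op_min == anchor or op_min == anchor + 1;
    }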
+const DVCQuorum = struct {
+    const DVCArray = std.BoundedArray(*const Message, constants.replicas_max);
+
+    fn verify(dvc_quorum: QuorumMessages) void {
+        const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
+        assert(dvcs.len >= 2);
+        for (dvcs.constSlice()) |message| verify_message(message);
+
+        var log_views_all = std.BoundedArray(u32, constants.replicas_max){ .buffer = undefined };
+        for (dvcs.constSlice()) |message| {
+            const log_view = @intCast(u32, message.header.timestamp);
+            if (std.mem.count(u32, log_views_all.constSlice(), &.{log_view}) == 0) {
+                log_views_all.appendAssumeCapacity(log_view);
+            }
+        }
+
+        // Verify that DVCs with the same log_view do not conflict.
+        for (log_views_all.constSlice()) |log_view| {
+            const view_dvcs = dvcs_with_log_view(dvc_quorum, log_view);
+            var view_headers = HeaderIterator.init(view_dvcs, null);
+            while (view_headers.next()) |_| {}
+        }
+    }
+
+    fn verify_message(message: *const Message) void {
+        assert(message.header.command == .do_view_change);
+        assert(message.header.op >= message.header.commit);
+        assert(message.header.op - message.header.commit <= constants.journal_slot_count);
+
+        // The log_view:
+        // * may be higher than the view in any of the prepare headers.
+        // * must be lower than the view of this view change.
+        const log_view = @intCast(u32, message.header.timestamp);
+        assert(log_view < message.header.view);
+
+        // Ignore the headers, but perform the validation.
+        _ = message_body_as_headers_chain_disjoint(message);
+    }
+
+    fn dvcs_all(dvc_quorum: QuorumMessages) DVCArray {
+        var array = DVCArray{ .buffer = undefined };
+        for (dvc_quorum) |received, replica| {
+            if (received) |message| {
+                assert(message.header.command == .do_view_change);
+                assert(message.header.replica == replica);
+
+                array.appendAssumeCapacity(message);
+            }
+        }
+        return array;
+    }
+
+    fn dvcs_canonical(dvc_quorum: QuorumMessages) DVCArray {
+        return dvcs_with_log_view(dvc_quorum, DVCQuorum.log_view_max(dvc_quorum));
+    }
+
+    fn dvcs_with_log_view(dvc_quorum: QuorumMessages, log_view: u32) DVCArray {
+        var array = DVCArray{ .buffer = undefined };
+        const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
+        for (dvcs.constSlice()) |message| {
+            const message_log_view = @intCast(u32, message.header.timestamp);
+            if (message_log_view == log_view) {
+                array.appendAssumeCapacity(message);
+            }
+        }
+        return array;
+    }
+
+    fn dvcs_uncanonical(dvc_quorum: QuorumMessages) DVCArray {
+        const log_view_max_ = DVCQuorum.log_view_max(dvc_quorum);
+        var array = DVCArray{ .buffer = undefined };
+        const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
+        for (dvcs.constSlice()) |message| {
+            const log_view = @intCast(u32, message.header.timestamp);
+            assert(log_view <= log_view_max_);
+
+            if (log_view < log_view_max_) {
+                array.appendAssumeCapacity(message);
+            }
+        }
+        return array;
+    }
+
+    /// Returns the highest `log_view` of any DVC.
+    ///
+    /// The headers bundled with DVCs with the highest `log_view` are canonical, since
+    /// the replica has knowledge of previous view changes in which headers were replaced.
+    fn log_view_max(dvc_quorum: QuorumMessages) u32 {
+        var log_view_max_: ?u32 = null;
+        const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
+        for (dvcs.constSlice()) |message| {
+            // The view when this replica was last in normal status, which:
+            // * may be higher than the view in any of the prepare headers.
+            // * must be lower than the view of this view change.
+            const log_view = @intCast(u32, message.header.timestamp);
+            assert(log_view < message.header.view);
+
+            if (log_view_max_ == null or log_view_max_.? < log_view) {
+                log_view_max_ = log_view;
+            }
+        }
+        return log_view_max_.?;
+    }
+
+    /// Returns the highest `commit_min` from any DVC (this is not a `commit_max`).
+    fn commit_min_max(dvc_quorum: QuorumMessages, local: struct {
+        replica: u64,
+        commit_min: u64,
+    }) u64 {
+        assert(dvc_quorum[local.replica].?.header.commit <= local.commit_min);
+
+        var commit_min_max_: ?u64 = null;
+        const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
+        for (dvcs.constSlice()) |message| {
+            if (commit_min_max_ == null or commit_min_max_.? < message.header.commit) {
+                commit_min_max_ = message.header.commit;
+            }
+        }
+
+        // Consider the case:
+        // 1. Start committing op=N…M.
+        // 2. Send `do_view_change` to self.
+        // 3. Finish committing op=N…M.
+        // 4. Remaining `do_view_change` messages arrive, completing the quorum.
+        // In this scenario, our own DVC's commit is `N-1`, but `commit_min=M`.
+        // Don't let the commit backtrack.
+        if (commit_min_max_.? < local.commit_min) {
+            const dvc_old = dvc_quorum[local.replica].?;
+            assert(dvc_old.header.commit < local.commit_min);
+            assert(dvc_old.header.commit <= commit_min_max_.?);
+
+            log.debug("{}: on_do_view_change: bump commit_min commit={}..{}", .{
+                local.replica,
+                commit_min_max_.?,
+                local.commit_min,
+            });
+            commit_min_max_ = local.commit_min;
+        }
+
+        assert(commit_min_max_.? >= local.commit_min);
+        return commit_min_max_.?;
+    }
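Reduced to plain integers, the guard at the end of `commit_min_max` is a single `max`: the quorum's highest DVC commit may lag the local `commit_min` (because our own DVC was sent before commits finished) and must never pull it backwards. A sketch with illustrative values, not the package's code:

    // Illustrative sketch: the quorum reports commit 10 but we already
    // committed 12 locally, so the view change proceeds from 12, never 10.
    fn commit_min_no_backtrack(quorum_commit_max: u64, local_commit_min: u64) u64 {
        return std.math.max(quorum_commit_max, local_commit_min);
    }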
+
+    /// Returns the highest `timestamp` from any replica.
+    fn timestamp_max(dvc_quorum: QuorumMessages) u64 {
+        var timestamp_max_: ?u64 = null;
+        const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
+        for (dvcs.constSlice()) |message| {
+            const message_headers = message_body_as_headers_chain_disjoint(message);
+            if (timestamp_max_ == null or timestamp_max_.? < message_headers[0].timestamp) {
+                timestamp_max_ = message_headers[0].timestamp;
+            }
+        }
+        return timestamp_max_.?;
+    }
+
+    fn op_max_canonical(dvc_quorum: QuorumMessages) u64 {
+        var op_max: ?u64 = null;
+        const dvcs = DVCQuorum.dvcs_canonical(dvc_quorum);
+        for (dvcs.constSlice()) |message| {
+            if (op_max == null or op_max.? < message.header.op) {
+                op_max = message.header.op;
+            }
+        }
+        return op_max.?;
+    }
+
+    /// Return an iterator over the canonical DVC's headers, from high-to-low op.
+    /// The first header returned is the new head message.
+    fn headers_canonical(dvc_quorum: QuorumMessages) HeaderIterator {
+        const dvcs = DVCQuorum.dvcs_canonical(dvc_quorum);
+
+        const op_head_max = op_max_canonical(dvc_quorum);
+        // The number of uncommitted ops cannot be more than the length of the pipeline.
+        const op_suffix_min = op_head_max -| constants.pipeline_prepare_queue_max;
+        assert(op_suffix_min <= op_head_max);
+
+        var op_head_min = op_suffix_min;
+        var ops_in_suffix = std.StaticBitSet(constants.pipeline_prepare_queue_max).initEmpty();
+        for (dvcs.constSlice()) |message| {
+            const message_headers = message_body_as_headers_chain_disjoint(message);
+            for (message_headers) |header| {
+                if (header.op > op_suffix_min) {
+                    ops_in_suffix.set((header.op - op_suffix_min) - 1);
+                }
+            }
+            op_head_min = std.math.max(op_head_min, message_headers[message_headers.len - 1].op);
+        }
+        assert(op_head_max == 0 or ops_in_suffix.isSet((op_head_max - op_suffix_min) - 1));
+        assert(op_head_min >= op_suffix_min);
+        assert(op_head_min <= op_head_max);
+
+        const op_head = blk: {
+            var op = op_head_min + 1;
+            while (op < op_head_max) : (op += 1) {
+                if (!ops_in_suffix.isSet((op - op_suffix_min) - 1)) {
+                    break :blk op - 1;
+                }
+            } else {
+                break :blk op_head_max;
+            }
+        };
+        assert(op_head >= op_head_min);
+        assert(op_head <= op_head_max);
+
+        return HeaderIterator.init(dvcs, op_head);
+    }
+
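`headers_canonical` chooses the new head by walking the suffix bitset upward from the op that every canonical DVC is known to reach, and stopping just below the first gap, since by the intersection property a gap within the pipeline suffix can only be an uncommitted (truncatable) op. The same rule on a plain boolean array, as a sketch (the function name and representation are illustrative):

    // Illustrative sketch: ops_present[op] says whether any canonical DVC
    // advertised a header for `op`. Ops above the first gap are truncated.
    fn op_head_before_gap(ops_present: []const bool, op_head_min: usize, op_head_max: usize) usize {
        var op = op_head_min + 1;
        while (op < op_head_max) : (op += 1) {
            if (!ops_present[op]) return op - 1;
        }
        return op_head_max;
    }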
+    /// Iterate the headers of a set of (same-log_view) DVCs, from high-to-low op.
+    const HeaderIterator = struct {
+        dvcs: DVCArray,
+        dvcs_offsets: std.BoundedArray(usize, constants.replicas_max),
+
+        child: ?struct {
+            op: u64,
+            parent: u128,
+        } = null,
+
+        fn init(dvcs: DVCArray, op_head: ?u64) HeaderIterator {
+            assert(dvcs.len > 0);
+
+            var dvcs_log_view: ?u32 = null;
+            for (dvcs.constSlice()) |message| {
+                const log_view = @intCast(u32, message.header.timestamp);
+                if (dvcs_log_view) |view| {
+                    assert(view == log_view);
+                } else {
+                    dvcs_log_view = log_view;
+                }
+            }
+
+            var dvcs_offsets = std.BoundedArray(usize, constants.replicas_max){
+                .buffer = undefined,
+            };
+
+            if (op_head) |op_head_| {
+                // Skip over discarded headers.
+                for (dvcs.constSlice()) |message| {
+                    const offset = for (message_body_as_headers_chain_disjoint(message)) |header, i| {
+                        if (header.op <= op_head_) break i;
+                    } else 0;
+                    dvcs_offsets.appendAssumeCapacity(offset);
+                }
+            } else {
+                for (dvcs.constSlice()) |_| dvcs_offsets.appendAssumeCapacity(0);
+            }
+            assert(dvcs.len == dvcs_offsets.len);
+
+            return .{
+                .dvcs = dvcs,
+                .dvcs_offsets = dvcs_offsets,
+            };
+        }
+
+        fn next(iterator: *HeaderIterator) ?Header {
+            const ReplicaSet = std.StaticBitSet(constants.replicas_max);
+            var next_header: ?*const Header = null;
+            var next_advance = ReplicaSet.initEmpty();
+
+            for (iterator.dvcs.constSlice()) |message, i| {
+                const message_headers = message_body_as_headers_chain_disjoint(message);
+                const message_headers_offset = iterator.dvcs_offsets.get(i);
+                if (message_headers_offset == message_headers.len) continue;
+
+                const header = &message_headers[message_headers_offset];
+                if (next_header == null or
+                    next_header.?.op < header.op)
+                {
+                    next_header = header;
+                    next_advance = ReplicaSet.initEmpty();
+                }
+                assert((next_header.?.op == header.op) ==
+                    (next_header.?.checksum == header.checksum));
+
+                if (next_header.?.op == header.op) {
+                    next_advance.set(i);
+                }
+            }
+            assert((next_advance.count() == 0) == (next_header == null));
+
+            var next_advance_iterator = next_advance.iterator(.{});
+            while (next_advance_iterator.next()) |i| {
+                iterator.dvcs_offsets.slice()[i] += 1;
+            }
+
+            if (next_header) |header| {
+                if (iterator.child) |child| {
+                    assert(child.op > header.op);
+                    assert((child.op == header.op + 1) == (child.parent == header.checksum));
+                }
+                iterator.child = .{ .op = header.op, .parent = header.parent };
+                return header.*;
+            } else {
+                return null;
+            }
+        }
+    };
+};
+
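A hypothetical caller of the iterator above: drain the canonical headers from high to low, taking the first as the new head. The `child` bookkeeping inside `next()` asserts that consecutive ops actually chain, so simply exhausting the iterator doubles as validation, which is how `DVCQuorum.verify` uses it. A sketch, assuming `dvc_quorum` already holds a full quorum:

    // Illustrative sketch only.
    fn example_headers_canonical(dvc_quorum: QuorumMessages) void {
        var headers = DVCQuorum.headers_canonical(dvc_quorum);
        const head = headers.next().?;
        while (headers.next()) |header| assert(header.op < head.op);
    }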
+/// Asserts that the headers are in descending op order.
+/// The headers may contain gaps and/or breaks.
+fn message_body_as_headers(message: *const Message) []const Header {
+    assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
+    assert(message.header.command == .do_view_change or
+        message.header.command == .start_view or
+        message.header.command == .headers);
+
+    const headers = std.mem.bytesAsSlice(
+        Header,
+        message.buffer[@sizeOf(Header)..message.header.size],
+    );
+
+    var child: ?*const Header = null;
+    for (headers) |*header| {
+        assert(!constants.verify or header.valid_checksum());
+        assert(header.cluster == message.header.cluster);
+        assert(header.command == .prepare);
+        assert(header.view <= message.header.view);
+
+        if (child) |child_header| {
+            // Headers must be provided in reverse order for the sake of `repair_header()`.
+            // Otherwise, headers may never be repaired where the hash chain never connects.
+            assert(header.op < child_header.op);
+        }
+        child = header;
+    }
+
+    return headers;
+}
+
+/// Asserts that the headers are in descending op order, and there are no breaks.
+/// The headers may contain gaps.
+fn message_body_as_headers_chain_disjoint(message: *const Message) []const Header {
+    assert(message.header.command == .do_view_change or message.header.command == .start_view);
+
+    const message_headers = message_body_as_headers(message);
+    assert(message_headers.len > 0);
+    assert(message_headers[0].op == message.header.op);
+
+    var child: ?*const Header = null;
+    for (message_headers) |*header| {
+        assert(header.op <= message.header.op);
+
+        if (child) |child_header| {
+            assert(header.view <= child_header.view);
+            assert((header.op + 1 == child_header.op) == (header.checksum == child_header.parent));
+            assert(header.timestamp < child_header.timestamp);
+        }
+        child = header;
+    }
+    return message_headers;
+}
+
+/// Asserts that the headers are in descending op order, and there are no gaps or breaks.
+fn message_body_as_headers_chain_consecutive(message: *const Message) []const Header {
+    assert(message.header.command == .start_view);
+
+    const message_headers = message_body_as_headers_chain_disjoint(message);
+    var child: ?*const Header = null;
+    for (message_headers) |*header| {
+        if (child) |child_header| {
+            assert(header.op + 1 == child_header.op);
+            assert(header.checksum == child_header.parent);
+        }
+        child = header;
+    }
+    return message_headers;
+}
+
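The three helpers above form a ladder of strictness over the same body layout: any descending headers, then a break-free chain that may still have gaps, then a fully consecutive chain. A hypothetical sketch of how a caller picks the right rung per message type, following the asserts inside each helper (the dispatch function itself is illustrative):

    // Illustrative sketch: an SV carries the view's actual log suffix, so it
    // must chain with no gaps; a DVC may legitimately contain gaps.
    fn body_headers_validated(message: *const Message) []const Header {
        return switch (message.header.command) {
            .start_view => message_body_as_headers_chain_consecutive(message),
            .do_view_change => message_body_as_headers_chain_disjoint(message),
            .headers => message_body_as_headers(message),
            else => unreachable,
        };
    }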
+/// The PipelineQueue belongs to a normal-status primary. It consists of two queues:
+/// - A prepare queue, containing all messages currently being prepared.
+/// - A request queue, containing all messages which are waiting to begin preparing.
+///
+/// Invariants:
+/// - prepare_queue contains only messages with command=prepare.
+/// - prepare_queue's messages have sequential, increasing ops.
+/// - prepare_queue's messages are hash-chained.
+/// - request_queue contains only messages with command=request.
+/// - If request_queue is not empty, then prepare_queue is full OR 1-less than full.
+///   (The caller is responsible for maintaining this invariant. If the caller removes an entry
+///   from `prepare_queue`, an entry from request_queue should be moved over promptly.)
+///
+/// Note: The prepare queue may contain multiple prepares from a single client, but the request
+/// queue may not (see message_by_client()).
+const PipelineQueue = struct {
+    const PrepareQueue = RingBuffer(Prepare, constants.pipeline_prepare_queue_max, .array);
+    const RequestQueue = RingBuffer(Request, constants.pipeline_request_queue_max, .array);
+
+    /// Messages that are preparing: uncommitted, being written to the WAL (the write may
+    /// already have completed), and being replicated (possibly just waiting for acks).
+    prepare_queue: PrepareQueue = .{},
+    /// Messages that are accepted from the client, but not yet preparing.
+    /// When `pipeline_prepare_queue_max + pipeline_request_queue_max = clients_max`, the request
+    /// queue guards against clients starving one another.
+    request_queue: RequestQueue = .{},
+
+    fn deinit(pipeline: *PipelineQueue, message_pool: *MessagePool) void {
+        while (pipeline.request_queue.pop()) |r| message_pool.unref(r.message);
+        while (pipeline.prepare_queue.pop()) |p| message_pool.unref(p.message);
+    }
+
+    fn verify(pipeline: PipelineQueue) void {
+        assert(pipeline.request_queue.count <= constants.pipeline_request_queue_max);
+        assert(pipeline.prepare_queue.count <= constants.pipeline_prepare_queue_max);
+        assert(pipeline.request_queue.empty() or
+            constants.pipeline_prepare_queue_max == pipeline.prepare_queue.count or
+            constants.pipeline_prepare_queue_max == pipeline.prepare_queue.count + 1);
+
+        if (pipeline.prepare_queue.head_ptr_const()) |head| {
+            var op = head.message.header.op;
+            var parent = head.message.header.parent;
+            var prepare_iterator = pipeline.prepare_queue.iterator();
+            while (prepare_iterator.next_ptr()) |prepare| {
+                assert(prepare.message.header.command == .prepare);
+                assert(prepare.message.header.op == op);
+                assert(prepare.message.header.parent == parent);
+
+                parent = prepare.message.header.checksum;
+                op += 1;
+            }
+        }
+
+        var request_iterator = pipeline.request_queue.iterator();
+        while (request_iterator.next()) |request| {
+            assert(request.message.header.command == .request);
+        }
+    }
+
+    fn full(pipeline: PipelineQueue) bool {
+        if (pipeline.prepare_queue.full()) {
+            return pipeline.request_queue.full();
+        } else {
+            assert(pipeline.request_queue.empty() or
+                pipeline.prepare_queue.count + 1 == constants.pipeline_prepare_queue_max);
+            return false;
+        }
+    }
+
+    /// Searches the pipeline for a prepare for a given op and checksum.
+    /// When `checksum` is `null`, match any checksum.
+    fn prepare_by_op_and_checksum(pipeline: *PipelineQueue, op: u64, checksum: ?u128) ?*Prepare {
+        if (pipeline.prepare_queue.empty()) return null;
+
+        // To optimize the search, we can leverage the fact that the pipeline's entries are
+        // ordered and consecutive.
+        const head_op = pipeline.prepare_queue.head_ptr().?.message.header.op;
+        const tail_op = pipeline.prepare_queue.tail_ptr().?.message.header.op;
+        if (op < head_op) return null;
+        if (op > tail_op) return null;
+
+        const prepare = pipeline.prepare_queue.get_ptr(op - head_op).?;
+        assert(prepare.message.header.op == op);
+
+        if (checksum == null) return prepare;
+        if (checksum.? == prepare.message.header.checksum) return prepare;
+        return null;
+    }
+
+    /// Searches the pipeline for a prepare matching the given ack.
+    /// Asserts that the returned prepare corresponds to the prepare_ok.
+    fn prepare_by_prepare_ok(pipeline: *PipelineQueue, ok: *const Message) ?*Prepare {
+        assert(ok.header.command == .prepare_ok);
+
+        const prepare = pipeline.prepare_by_op_and_checksum(
+            ok.header.op,
+            ok.header.context,
+        ) orelse return null;
+        assert(prepare.message.header.command == .prepare);
+        assert(prepare.message.header.parent == ok.header.parent);
+        assert(prepare.message.header.client == ok.header.client);
+        assert(prepare.message.header.request == ok.header.request);
+        assert(prepare.message.header.cluster == ok.header.cluster);
+        assert(prepare.message.header.epoch == ok.header.epoch);
+        // A prepare may be committed in the same view or in a newer view:
+        assert(prepare.message.header.view <= ok.header.view);
+        assert(prepare.message.header.op == ok.header.op);
+        assert(prepare.message.header.commit == ok.header.commit);
+        assert(prepare.message.header.timestamp == ok.header.timestamp);
+        assert(prepare.message.header.operation == ok.header.operation);
+
+        return prepare;
+    }
+
+    /// Search the pipeline (both request & prepare queues) for a message from the given client.
+    /// - A client may have multiple prepares in the pipeline if these were committed by the
+    ///   previous primary and were reloaded into the pipeline after a view change.
+    /// - A client may have at most one request in the pipeline.
+    /// If there are multiple messages in the pipeline from the client, the *latest* message is
+    /// returned (to help the caller identify bad client behavior).
+    fn message_by_client(pipeline: PipelineQueue, client_id: u128) ?*const Message {
+        var message: ?*const Message = null;
+        var prepare_iterator = pipeline.prepare_queue.iterator();
+        while (prepare_iterator.next_ptr()) |prepare| {
+            if (prepare.message.header.client == client_id) message = prepare.message;
+        }
+
+        var request_iterator = pipeline.request_queue.iterator();
+        while (request_iterator.next()) |request| {
+            if (request.message.header.client == client_id) message = request.message;
+        }
+        return message;
+    }
+
+    /// Warning: This temporarily violates the prepare/request queue count invariant.
+    /// After invocation, call pop_request→push_prepare to begin preparing the next request.
+    fn pop_prepare(pipeline: *PipelineQueue) ?Prepare {
+        if (pipeline.prepare_queue.pop()) |prepare| {
+            assert(pipeline.request_queue.empty() or
+                pipeline.prepare_queue.count + 1 == constants.pipeline_prepare_queue_max);
+            return prepare;
+        } else {
+            assert(pipeline.request_queue.empty());
+            return null;
+        }
+    }
+
+    fn pop_request(pipeline: *PipelineQueue) ?Request {
+        return pipeline.request_queue.pop();
+    }
+
+    fn push_request(pipeline: *PipelineQueue, request: Request) void {
+        assert(request.message.header.command == .request);
+        var queue_iterator = pipeline.request_queue.iterator();
+        while (queue_iterator.next()) |queue_request| {
+            assert(queue_request.message.header.client != request.message.header.client);
+        }
+
+        pipeline.request_queue.push_assume_capacity(request);
+        if (constants.verify) pipeline.verify();
+    }
+
+    fn push_prepare(pipeline: *PipelineQueue, message: *Message) void {
+        assert(message.header.command == .prepare);
+        if (pipeline.prepare_queue.tail()) |tail| {
+            assert(message.header.op == tail.message.header.op + 1);
+            assert(message.header.parent == tail.message.header.checksum);
+            assert(message.header.view >= tail.message.header.view);
+        } else {
+            assert(pipeline.request_queue.empty());
+        }
+
+        pipeline.prepare_queue.push_assume_capacity(.{ .message = message });
+        if (constants.verify) pipeline.verify();
+    }
+};
+
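The warning on `pop_prepare` defines the intended call sequence: popping a prepare briefly leaves the request/prepare count invariant violated, so a waiting request should be promoted immediately. A hypothetical sketch of that sequence; `start_preparing` stands in for whatever the replica actually does to turn a request into a prepare, and is not a name from this package:

    // Illustrative sketch only.
    fn pipeline_advance(self: *Self) void {
        const prepare = self.pipeline.queue.pop_prepare() orelse return;
        self.message_bus.unref(prepare.message);
        // Restore the invariant: move a waiting request over promptly.
        if (self.pipeline.queue.pop_request()) |request| {
            self.start_preparing(request);
        }
    }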
+/// Prepares in the cache may be committed or uncommitted, and may not belong to the current view.
+///
+/// Invariants:
+/// - The cache contains only messages with command=prepare.
+/// - If a message with op X is in the cache, it is in `prepares[X % prepares.len]`.
+const PipelineCache = struct {
+    const prepares_max =
+        constants.pipeline_prepare_queue_max +
+        constants.pipeline_request_queue_max;
+
+    prepares: [prepares_max]?*Message = [_]?*Message{null} ** prepares_max,
+
+    /// Converting a PipelineQueue to a PipelineCache discards all accumulated acks.
+    /// "prepare_ok"s from previous views are not valid, even if the pipeline entry is reused
+    /// after a cycle of view changes. In other words, when a view change cycles around, so
+    /// that the original primary becomes a primary of a new view, pipeline entries may be
+    /// reused. However, the pipeline's prepare_ok quorums must not be reused, since the
+    /// replicas that sent them may have swapped them out during a previous view change.
+    fn init_from_queue(queue: *PipelineQueue) PipelineCache {
+        var cache = PipelineCache{};
+        var prepares = queue.prepare_queue.iterator();
+        while (prepares.next()) |prepare| {
+            const prepare_evicted = cache.insert(prepare.message.ref());
+            assert(prepare_evicted == null);
+            assert(prepare.message.header.command == .prepare);
+        }
+        return cache;
+    }
+
+    fn deinit(pipeline: *PipelineCache, message_pool: *MessagePool) void {
+        for (pipeline.prepares) |*entry| {
+            if (entry.*) |m| {
+                message_pool.unref(m);
+                entry.* = null;
+            }
+        }
+    }
+
+    fn empty(pipeline: *const PipelineCache) bool {
+        for (pipeline.prepares) |*entry| {
+            if (entry.*) |_| return false;
+        }
+        return true;
+    }
+
+    fn contains_header(pipeline: *const PipelineCache, header: *const Header) bool {
+        assert(header.command == .prepare);
+
+        const slot = header.op % prepares_max;
+        const prepare = pipeline.prepares[slot] orelse return false;
+        return prepare.header.op == header.op and prepare.header.checksum == header.checksum;
+    }
+
+    /// Unlike the PipelineQueue, cached messages may not belong to the current view.
+    /// Thus, a matching checksum is required.
+    fn prepare_by_op_and_checksum(pipeline: *PipelineCache, op: u64, checksum: u128) ?*Message {
+        const slot = op % prepares_max;
+        const prepare = pipeline.prepares[slot] orelse return null;
+        if (prepare.header.op != op) return null;
+        if (prepare.header.checksum != checksum) return null;
+        return prepare;
+    }
+
+    /// Returns the message evicted from the cache, if any.
+    fn insert(pipeline: *PipelineCache, prepare: *Message) ?*Message {
+        assert(prepare.header.command == .prepare);
+
+        const slot = prepare.header.op % prepares_max;
+        const prepare_evicted = pipeline.prepares[slot];
+        pipeline.prepares[slot] = prepare;
+        return prepare_evicted;
+    }
+};
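Because of the slot invariant above (a prepare with op X lives only in `prepares[X % prepares.len]`), a cache lookup is a single index plus an op/checksum comparison, with no scan. A small usage sketch (the wrapper function is hypothetical; the call it makes is the API defined above):

    // Illustrative sketch: returns the message only on an exact op+checksum hit,
    // since cached prepares may be stale (from an older view).
    fn example_cache_lookup(cache: *PipelineCache, header: *const Header) ?*Message {
        return cache.prepare_by_op_and_checksum(header.op, header.checksum);
    }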