tigerbeetle-node 0.11.0 → 0.11.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.client.node.sha256 +1 -0
- package/package.json +5 -3
- package/src/tigerbeetle/scripts/fuzz_loop.sh +1 -1
- package/src/tigerbeetle/scripts/pre-commit.sh +2 -2
- package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
- package/src/tigerbeetle/src/benchmark.zig +25 -11
- package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
- package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
- package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -256
- package/src/tigerbeetle/src/c/tb_client.h +18 -4
- package/src/tigerbeetle/src/c/tb_client.zig +88 -26
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
- package/src/tigerbeetle/src/c/test.zig +371 -1
- package/src/tigerbeetle/src/cli.zig +90 -18
- package/src/tigerbeetle/src/config.zig +12 -4
- package/src/tigerbeetle/src/demo.zig +2 -1
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
- package/src/tigerbeetle/src/ewah.zig +11 -33
- package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
- package/src/tigerbeetle/src/lsm/README.md +97 -3
- package/src/tigerbeetle/src/lsm/compaction.zig +32 -7
- package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +34 -32
- package/src/tigerbeetle/src/lsm/grid.zig +39 -21
- package/src/tigerbeetle/src/lsm/groove.zig +1 -0
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +3 -3
- package/src/tigerbeetle/src/lsm/level_iterator.zig +1 -1
- package/src/tigerbeetle/src/lsm/manifest.zig +13 -0
- package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -49
- package/src/tigerbeetle/src/lsm/manifest_log.zig +173 -335
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +1 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +24 -15
- package/src/tigerbeetle/src/lsm/table.zig +32 -20
- package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
- package/src/tigerbeetle/src/lsm/table_iterator.zig +4 -5
- package/src/tigerbeetle/src/lsm/test.zig +13 -2
- package/src/tigerbeetle/src/lsm/tree.zig +45 -7
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +36 -32
- package/src/tigerbeetle/src/main.zig +69 -13
- package/src/tigerbeetle/src/message_bus.zig +18 -7
- package/src/tigerbeetle/src/message_pool.zig +8 -2
- package/src/tigerbeetle/src/ring_buffer.zig +7 -3
- package/src/tigerbeetle/src/simulator.zig +38 -11
- package/src/tigerbeetle/src/state_machine.zig +48 -23
- package/src/tigerbeetle/src/test/accounting/workload.zig +9 -5
- package/src/tigerbeetle/src/test/cluster.zig +15 -33
- package/src/tigerbeetle/src/test/conductor.zig +2 -1
- package/src/tigerbeetle/src/test/network.zig +45 -19
- package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
- package/src/tigerbeetle/src/test/state_checker.zig +5 -7
- package/src/tigerbeetle/src/test/storage.zig +453 -110
- package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
- package/src/tigerbeetle/src/tigerbeetle.zig +1 -0
- package/src/tigerbeetle/src/unit_tests.zig +7 -1
- package/src/tigerbeetle/src/util.zig +97 -11
- package/src/tigerbeetle/src/vopr.zig +2 -1
- package/src/tigerbeetle/src/vsr/client.zig +8 -3
- package/src/tigerbeetle/src/vsr/journal.zig +280 -202
- package/src/tigerbeetle/src/vsr/replica.zig +169 -31
- package/src/tigerbeetle/src/vsr/superblock.zig +356 -629
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -6
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +414 -151
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +44 -9
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
- package/src/tigerbeetle/src/vsr.zig +19 -5
- package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
- package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
- package/src/tigerbeetle/src/vopr_hub/README.md +0 -58
- package/src/tigerbeetle/src/vopr_hub/SETUP.md +0 -199
- package/src/tigerbeetle/src/vopr_hub/go.mod +0 -3
- package/src/tigerbeetle/src/vopr_hub/main.go +0 -1022
- package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +0 -3
- package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +0 -403
|
@@ -10,7 +10,8 @@ const MessagePool = @import("../message_pool.zig").MessagePool;
|
|
|
10
10
|
const Message = @import("../message_pool.zig").MessagePool.Message;
|
|
11
11
|
const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
|
|
12
12
|
const ClientTable = @import("superblock_client_table.zig").ClientTable;
|
|
13
|
-
const
|
|
13
|
+
const format_wal_headers = @import("./journal.zig").format_wal_headers;
|
|
14
|
+
const format_wal_prepares = @import("./journal.zig").format_wal_prepares;
|
|
14
15
|
|
|
15
16
|
const vsr = @import("../vsr.zig");
|
|
16
17
|
const Header = vsr.Header;
|
|
@@ -148,8 +149,12 @@ pub fn ReplicaType(
|
|
|
148
149
|
/// Invariants (not applicable during status=recovering):
|
|
149
150
|
/// * `replica.op` exists in the Journal.
|
|
150
151
|
/// * `replica.op ≥ replica.commit_min`.
|
|
151
|
-
/// * `replica.op
|
|
152
|
-
///
|
|
152
|
+
/// * `replica.op - replica.commit_min ≤ journal_slot_count`
|
|
153
|
+
/// * `replica.op - replica.op_checkpoint ≤ journal_slot_count`
|
|
154
|
+
/// It is safe to overwrite `op_checkpoint` itself.
|
|
155
|
+
/// * `replica.op ≤ replica.op_checkpoint_trigger`:
|
|
156
|
+
/// Don't wrap the WAL until we are sure that the overwritten entry will not be required
|
|
157
|
+
/// for recovery.
|
|
153
158
|
// TODO: When recovery protocol is removed, load the `op` from the WAL, and verify that it is ≥op_checkpoint.
|
|
154
159
|
// Also verify that a corresponding header exists in the WAL.
|
|
155
160
|
op: u64,
|
|
@@ -162,10 +167,10 @@ pub fn ReplicaType(
|
|
|
162
167
|
/// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
|
|
163
168
|
///
|
|
164
169
|
/// Invariants (not applicable during status=recovering):
|
|
165
|
-
/// * `replica.commit_min` exists in the Journal.
|
|
170
|
+
/// * `replica.commit_min` exists in the Journal OR `replica.commit_min == op_checkpoint`.
|
|
166
171
|
/// * `replica.commit_min ≤ replica.op`
|
|
167
172
|
/// * `replica.commit_min ≥ replica.op_checkpoint`.
|
|
168
|
-
/// * never decreases
|
|
173
|
+
/// * never decreases while the replica is alive
|
|
169
174
|
commit_min: u64,
|
|
170
175
|
|
|
171
176
|
/// The op number of the latest committed operation (according to the cluster):
|
|
@@ -263,7 +268,17 @@ pub fn ReplicaType(
|
|
|
263
268
|
/// Seeded with the replica's index number.
|
|
264
269
|
prng: std.rand.DefaultPrng,
|
|
265
270
|
|
|
266
|
-
|
|
271
|
+
/// Simulator hooks.
|
|
272
|
+
on_change_state: ?fn (replica: *const Self) void = null,
|
|
273
|
+
/// Called immediately after a compaction.
|
|
274
|
+
on_compact: ?fn (replica: *const Self) void = null,
|
|
275
|
+
/// Called immediately after a checkpoint.
|
|
276
|
+
/// Note: The replica may checkpoint without calling this function:
|
|
277
|
+
/// 1. Begin checkpoint.
|
|
278
|
+
/// 2. Write 2/4 SuperBlock copies.
|
|
279
|
+
/// 3. Crash.
|
|
280
|
+
/// 4. Recover in the new checkpoint (but on_checkpoint wasn't called).
|
|
281
|
+
on_checkpoint: ?fn (replica: *const Self) void = null,
|
|
267
282
|
|
|
268
283
|
/// Called when `commit_prepare` finishes committing.
|
|
269
284
|
commit_callback: ?fn (*Self) void = null,
|
|
@@ -984,7 +999,11 @@ pub fn ReplicaType(
|
|
|
984
999
|
// const threshold = self.quorum_replication;
|
|
985
1000
|
// TODO: When block recovery & state transfer are implemented, this can be removed.
|
|
986
1001
|
const threshold =
|
|
987
|
-
if (prepare.message.header.op == self.op_checkpoint_trigger()
|
|
1002
|
+
if (prepare.message.header.op == self.op_checkpoint_trigger() or
|
|
1003
|
+
prepare.message.header.op == self.op_checkpoint + config.lsm_batch_multiple + 1)
|
|
1004
|
+
self.replica_count
|
|
1005
|
+
else
|
|
1006
|
+
self.quorum_replication;
|
|
988
1007
|
|
|
989
1008
|
const count = self.count_message_and_receive_quorum_exactly_once(
|
|
990
1009
|
&prepare.ok_from_all_replicas,
|
|
@@ -2560,6 +2579,9 @@ pub fn ReplicaType(
|
|
|
2560
2579
|
|
|
2561
2580
|
const op = self.commit_prepare.?.header.op;
|
|
2562
2581
|
assert(op == self.commit_min);
|
|
2582
|
+
assert(op <= self.op_checkpoint_trigger());
|
|
2583
|
+
|
|
2584
|
+
if (self.on_compact) |on_compact| on_compact(self);
|
|
2563
2585
|
|
|
2564
2586
|
if (op == self.op_checkpoint_trigger()) {
|
|
2565
2587
|
assert(op == self.op);
|
|
@@ -2573,7 +2595,6 @@ pub fn ReplicaType(
|
|
|
2573
2595
|
});
|
|
2574
2596
|
self.state_machine.checkpoint(commit_op_checkpoint_state_machine_callback);
|
|
2575
2597
|
} else {
|
|
2576
|
-
assert(op < self.op_checkpoint_trigger());
|
|
2577
2598
|
self.commit_op_done();
|
|
2578
2599
|
}
|
|
2579
2600
|
}
|
|
@@ -2596,18 +2617,20 @@ pub fn ReplicaType(
|
|
|
2596
2617
|
// They will only be compacted to disk in the next bar.
|
|
2597
2618
|
// Therefore, only ops "A..D" are committed to disk.
|
|
2598
2619
|
// Thus, the SuperBlock's `commit_min` is set to 7-2=5.
|
|
2620
|
+
const vsr_state_commit_min = self.op_checkpoint_next();
|
|
2599
2621
|
const vsr_state_new = .{
|
|
2600
|
-
.
|
|
2622
|
+
.commit_min_checksum = self.journal.header_with_op(vsr_state_commit_min).?.checksum,
|
|
2623
|
+
.commit_min = vsr_state_commit_min,
|
|
2601
2624
|
.commit_max = self.commit_max,
|
|
2602
2625
|
.view_normal = self.view_normal,
|
|
2603
2626
|
.view = self.view,
|
|
2604
2627
|
};
|
|
2605
|
-
assert(
|
|
2628
|
+
assert(self.superblock.working.vsr_state.monotonic(vsr_state_new));
|
|
2606
2629
|
|
|
2607
|
-
self.superblock.staging.vsr_state = vsr_state_new;
|
|
2608
2630
|
self.superblock.checkpoint(
|
|
2609
2631
|
commit_op_checkpoint_superblock_callback,
|
|
2610
2632
|
&self.superblock_context,
|
|
2633
|
+
vsr_state_new,
|
|
2611
2634
|
);
|
|
2612
2635
|
}
|
|
2613
2636
|
|
|
@@ -2629,6 +2652,7 @@ pub fn ReplicaType(
|
|
|
2629
2652
|
self.op_checkpoint,
|
|
2630
2653
|
});
|
|
2631
2654
|
|
|
2655
|
+
if (self.on_checkpoint) |on_checkpoint| on_checkpoint(self);
|
|
2632
2656
|
self.commit_op_done();
|
|
2633
2657
|
}
|
|
2634
2658
|
|
|
@@ -2662,10 +2686,17 @@ pub fn ReplicaType(
|
|
|
2662
2686
|
// this commit.
|
|
2663
2687
|
|
|
2664
2688
|
assert(self.journal.has(prepare.header));
|
|
2665
|
-
|
|
2666
|
-
|
|
2689
|
+
if (self.op_checkpoint == self.commit_min) {
|
|
2690
|
+
// op_checkpoint's slot may have been overwritten in the WAL — but we can
|
|
2691
|
+
// always use the VSRState to anchor the hash chain.
|
|
2692
|
+
assert(prepare.header.parent ==
|
|
2693
|
+
self.superblock.working.vsr_state.commit_min_checksum);
|
|
2694
|
+
} else {
|
|
2695
|
+
assert(prepare.header.parent ==
|
|
2696
|
+
self.journal.header_with_op(self.commit_min).?.checksum);
|
|
2697
|
+
}
|
|
2667
2698
|
|
|
2668
|
-
log.debug("{}: commit_op: executing view={} {} op={} checksum={} ({s})", .{
|
|
2699
|
+
log.debug("{}: commit_op: executing view={} leader={} op={} checksum={} ({s})", .{
|
|
2669
2700
|
self.replica,
|
|
2670
2701
|
self.view,
|
|
2671
2702
|
self.leader_index(self.view) == self.replica,
|
|
@@ -3282,7 +3313,7 @@ pub fn ReplicaType(
|
|
|
3282
3313
|
// Verify that the new request will fit in the WAL.
|
|
3283
3314
|
// The message's op hasn't been assigned yet, but it will be `self.op + 1`.
|
|
3284
3315
|
if (self.op == self.op_checkpoint_trigger()) {
|
|
3285
|
-
log.debug("{}: on_request: ignoring op={} (too far ahead,
|
|
3316
|
+
log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint_trigger={})", .{
|
|
3286
3317
|
self.replica,
|
|
3287
3318
|
self.op + 1,
|
|
3288
3319
|
self.op_checkpoint,
|
|
@@ -3852,8 +3883,6 @@ pub fn ReplicaType(
|
|
|
3852
3883
|
assert(self.op_checkpoint <= self.commit_min);
|
|
3853
3884
|
assert(self.commit_min <= self.op);
|
|
3854
3885
|
assert(self.commit_min <= self.commit_max);
|
|
3855
|
-
|
|
3856
|
-
assert(self.journal.header_with_op(self.commit_min) != null);
|
|
3857
3886
|
assert(self.journal.header_with_op(self.op) != null);
|
|
3858
3887
|
|
|
3859
3888
|
// The replica repairs backwards from `commit_max`. But if `commit_max` is too high
|
|
@@ -3926,6 +3955,8 @@ pub fn ReplicaType(
|
|
|
3926
3955
|
}
|
|
3927
3956
|
|
|
3928
3957
|
// Assert that all headers are now present and connected with a perfect hash chain:
|
|
3958
|
+
// TODO(State Transfer): This may fail if the commit max is too far ahead and we
|
|
3959
|
+
// couldn't repair it without jumping ahead on the WAL.
|
|
3929
3960
|
assert(self.op >= self.commit_max);
|
|
3930
3961
|
assert(self.valid_hash_chain_between(self.commit_min, self.op));
|
|
3931
3962
|
|
|
@@ -4014,8 +4045,9 @@ pub fn ReplicaType(
|
|
|
4014
4045
|
if (header.op == 0 and self.op_checkpoint == 0) {
|
|
4015
4046
|
// Repairing the root op is allowed until the first checkpoint.
|
|
4016
4047
|
} else {
|
|
4017
|
-
//
|
|
4018
|
-
// the next wrap of the
|
|
4048
|
+
// It is critical that we do not repair checkpointed ops; their slots now belong
|
|
4049
|
+
// to the next wrap of the log, and overwriting a new op with an old op is a
|
|
4050
|
+
// correctness violation.
|
|
4019
4051
|
log.debug("{}: repair_header: false (precedes self.op_checkpoint={})", .{
|
|
4020
4052
|
self.replica,
|
|
4021
4053
|
self.op_checkpoint,
|
|
@@ -4311,6 +4343,8 @@ pub fn ReplicaType(
|
|
|
4311
4343
|
assert(self.status == .normal or self.status == .view_change);
|
|
4312
4344
|
assert(self.repairs_allowed());
|
|
4313
4345
|
assert(self.journal.dirty.count > 0);
|
|
4346
|
+
assert(self.op >= self.commit_min);
|
|
4347
|
+
assert(self.op - self.commit_min + 1 <= config.journal_slot_count);
|
|
4314
4348
|
|
|
4315
4349
|
// Request enough prepares to utilize our max IO depth:
|
|
4316
4350
|
var budget = self.journal.writes.available();
|
|
@@ -4322,7 +4356,7 @@ pub fn ReplicaType(
|
|
|
4322
4356
|
if (self.op < config.journal_slot_count) {
|
|
4323
4357
|
// The op is known, and this is the first WAL cycle.
|
|
4324
4358
|
// Therefore, any faulty ops to the right of `replica.op` are corrupt reserved
|
|
4325
|
-
// entries from the initial format.
|
|
4359
|
+
// entries from the initial format, or corrupt prepares which were since truncated.
|
|
4326
4360
|
var op: usize = self.op + 1;
|
|
4327
4361
|
while (op < config.journal_slot_count) : (op += 1) {
|
|
4328
4362
|
const slot = self.journal.slot_for_op(op);
|
|
@@ -4330,23 +4364,56 @@ pub fn ReplicaType(
|
|
|
4330
4364
|
|
|
4331
4365
|
if (self.journal.faulty.bit(slot)) {
|
|
4332
4366
|
assert(self.journal.headers[op].command == .reserved);
|
|
4367
|
+
assert(self.journal.headers_redundant[op].command == .reserved);
|
|
4333
4368
|
self.journal.dirty.clear(slot);
|
|
4334
4369
|
self.journal.faulty.clear(slot);
|
|
4335
|
-
log.debug("{}: repair_prepares:
|
|
4370
|
+
log.debug("{}: repair_prepares: remove slot={} " ++
|
|
4371
|
+
"(faulty, op known, first cycle)", .{
|
|
4336
4372
|
self.replica,
|
|
4337
|
-
|
|
4373
|
+
slot.index,
|
|
4338
4374
|
});
|
|
4339
4375
|
}
|
|
4340
4376
|
}
|
|
4341
4377
|
}
|
|
4342
4378
|
|
|
4343
4379
|
var op = self.op + 1;
|
|
4380
|
+
// To maximize durability, repair all prepares for which we have a header (not only
|
|
4381
|
+
// uncommitted headers). This in turn enables the replica to help repair other replicas.
|
|
4344
4382
|
const op_min = op -| config.journal_slot_count;
|
|
4345
4383
|
while (op > op_min) {
|
|
4346
4384
|
op -= 1;
|
|
4347
4385
|
|
|
4348
4386
|
const slot = self.journal.slot_for_op(op);
|
|
4349
4387
|
if (self.journal.dirty.bit(slot)) {
|
|
4388
|
+
if (self.journal.slot_with_op(op) == null) {
|
|
4389
|
+
// If this op was between `commit_min` and `replica.op`, we would have
|
|
4390
|
+
// requested and repaired these headers.
|
|
4391
|
+
// If this op was between `op_checkpoint` and `commit_min`, we would have
|
|
4392
|
+
// a header (since we couldn't have committed otherwise).
|
|
4393
|
+
//
|
|
4394
|
+
// Therefore, this op must be either:
|
|
4395
|
+
// - less-than-or-equal-to `op_checkpoint` — we committed before
|
|
4396
|
+
// checkpointing, but the entry in our WAL was found corrupt after
|
|
4397
|
+
// recovering from a crash.
|
|
4398
|
+
// - or (indistinguishably) this might originally have been an op greater
|
|
4399
|
+
// than replica.op, which was truncated, but is now corrupt.
|
|
4400
|
+
//
|
|
4401
|
+
// We don't try to repair this op because the slot belongs (or will soon
|
|
4402
|
+
// belong) to a newer op, from the new WAL wrap. Additionally, we may not
|
|
4403
|
+
// still have access to its surrounding commits to verify the hash chain.
|
|
4404
|
+
assert(op <= self.commit_min);
|
|
4405
|
+
assert(op <= self.op_checkpoint);
|
|
4406
|
+
assert(self.journal.faulty.bit(slot));
|
|
4407
|
+
|
|
4408
|
+
log.debug("{}: repair_prepares: remove slot={} " ++
|
|
4409
|
+
"(faulty, precedes checkpoint)", .{
|
|
4410
|
+
self.replica,
|
|
4411
|
+
slot.index,
|
|
4412
|
+
});
|
|
4413
|
+
self.journal.remove_entry(slot);
|
|
4414
|
+
continue;
|
|
4415
|
+
}
|
|
4416
|
+
|
|
4350
4417
|
// If this is an uncommitted op, and we are the leader in `view_change` status,
|
|
4351
4418
|
// then we will `request_prepare` from the cluster, set `nack_prepare_op`,
|
|
4352
4419
|
// and stop repairing any further prepares:
|
|
@@ -4941,51 +5008,67 @@ pub fn ReplicaType(
|
|
|
4941
5008
|
assert(!self.do_view_change_quorum);
|
|
4942
5009
|
assert(message.header.view == self.view);
|
|
4943
5010
|
assert(message.header.replica == self.replica);
|
|
5011
|
+
assert(message.header.replica != replica);
|
|
4944
5012
|
},
|
|
4945
5013
|
.view_change => {
|
|
4946
5014
|
assert(self.start_view_change_quorum);
|
|
4947
5015
|
assert(self.do_view_change_quorum);
|
|
4948
5016
|
assert(message.header.view == self.view);
|
|
4949
5017
|
assert(message.header.replica == self.replica);
|
|
5018
|
+
assert(message.header.replica != replica);
|
|
4950
5019
|
},
|
|
4951
5020
|
else => unreachable,
|
|
4952
5021
|
},
|
|
4953
5022
|
.recovery => {
|
|
4954
5023
|
assert(self.status == .recovering);
|
|
4955
5024
|
assert(message.header.replica == self.replica);
|
|
5025
|
+
assert(message.header.replica != replica);
|
|
4956
5026
|
assert(message.header.context == self.recovery_nonce);
|
|
4957
5027
|
},
|
|
4958
5028
|
.recovery_response => {
|
|
4959
5029
|
assert(self.status == .normal);
|
|
4960
5030
|
assert(message.header.view == self.view);
|
|
4961
5031
|
assert(message.header.replica == self.replica);
|
|
5032
|
+
assert(message.header.replica != replica);
|
|
4962
5033
|
},
|
|
4963
5034
|
.headers => {
|
|
4964
5035
|
assert(self.status == .normal or self.status == .view_change);
|
|
4965
5036
|
assert(message.header.view == self.view);
|
|
4966
5037
|
assert(message.header.replica == self.replica);
|
|
5038
|
+
assert(message.header.replica != replica);
|
|
4967
5039
|
},
|
|
4968
5040
|
.ping, .pong => {
|
|
4969
5041
|
assert(message.header.view == self.view);
|
|
4970
5042
|
assert(message.header.replica == self.replica);
|
|
5043
|
+
assert(message.header.replica != replica);
|
|
4971
5044
|
},
|
|
4972
5045
|
.commit => {
|
|
4973
5046
|
assert(self.status == .normal);
|
|
4974
5047
|
assert(self.leader());
|
|
4975
5048
|
assert(message.header.view == self.view);
|
|
4976
5049
|
assert(message.header.replica == self.replica);
|
|
5050
|
+
assert(message.header.replica != replica);
|
|
5051
|
+
},
|
|
5052
|
+
.request_start_view => {
|
|
5053
|
+
assert(message.header.view >= self.view);
|
|
5054
|
+
assert(message.header.replica == self.replica);
|
|
5055
|
+
assert(message.header.replica != replica);
|
|
5056
|
+
assert(self.leader_index(message.header.view) == replica);
|
|
4977
5057
|
},
|
|
4978
5058
|
.request_headers => {
|
|
4979
5059
|
assert(message.header.view == self.view);
|
|
4980
5060
|
assert(message.header.replica == self.replica);
|
|
5061
|
+
assert(message.header.replica != replica);
|
|
4981
5062
|
},
|
|
4982
5063
|
.request_prepare => {
|
|
4983
5064
|
assert(message.header.view == self.view);
|
|
4984
5065
|
assert(message.header.replica == self.replica);
|
|
5066
|
+
assert(message.header.replica != replica);
|
|
4985
5067
|
},
|
|
4986
5068
|
.nack_prepare => {
|
|
4987
5069
|
assert(message.header.view == self.view);
|
|
4988
5070
|
assert(message.header.replica == self.replica);
|
|
5071
|
+
assert(message.header.replica != replica);
|
|
4989
5072
|
assert(self.leader_index(self.view) == replica);
|
|
4990
5073
|
},
|
|
4991
5074
|
else => {
|
|
@@ -5022,6 +5105,7 @@ pub fn ReplicaType(
|
|
|
5022
5105
|
// `commit_max` and not `self.op`. However, committed ops (`commit_max`) must survive:
|
|
5023
5106
|
assert(op >= self.commit_max);
|
|
5024
5107
|
assert(op >= commit_max);
|
|
5108
|
+
// TODO: This assertion may fail until recovery protocol is removed.
|
|
5025
5109
|
assert(op <= self.op_checkpoint_trigger());
|
|
5026
5110
|
|
|
5027
5111
|
// We expect that our commit numbers may also be greater even than `commit_max` because
|
|
@@ -5667,8 +5751,10 @@ pub fn ReplicaType(
|
|
|
5667
5751
|
|
|
5668
5752
|
/// Returns true if all operations are present, correctly ordered and connected by hash
|
|
5669
5753
|
/// chain, between `op_min` and `op_max` (both inclusive).
|
|
5670
|
-
fn valid_hash_chain_between(self: *Self, op_min: u64, op_max: u64) bool {
|
|
5754
|
+
fn valid_hash_chain_between(self: *const Self, op_min: u64, op_max: u64) bool {
|
|
5671
5755
|
assert(op_min <= op_max);
|
|
5756
|
+
// Headers with ops preceding the checkpoint may be unavailable due to a WAL wrap.
|
|
5757
|
+
assert(op_min >= self.op_checkpoint);
|
|
5672
5758
|
|
|
5673
5759
|
// If we use anything less than self.op then we may commit ops for a forked hash chain
|
|
5674
5760
|
// that have since been reordered by a new leader.
|
|
@@ -5679,6 +5765,27 @@ pub fn ReplicaType(
|
|
|
5679
5765
|
while (op > op_min) {
|
|
5680
5766
|
op -= 1;
|
|
5681
5767
|
|
|
5768
|
+
if (self.op_checkpoint == op) {
|
|
5769
|
+
// op_checkpoint's slot may have been overwritten in the WAL — but we can
|
|
5770
|
+
// always use the VSRState to anchor the hash chain.
|
|
5771
|
+
assert(op == op_min);
|
|
5772
|
+
assert(op == self.superblock.working.vsr_state.commit_min);
|
|
5773
|
+
if (self.superblock.working.vsr_state.commit_min_checksum == b.parent) {
|
|
5774
|
+
return true;
|
|
5775
|
+
} else {
|
|
5776
|
+
log.debug("{}: valid_hash_chain_between: break A: {} (checkpoint={})", .{
|
|
5777
|
+
self.replica,
|
|
5778
|
+
self.superblock.working.vsr_state.commit_min_checksum,
|
|
5779
|
+
self.op_checkpoint,
|
|
5780
|
+
});
|
|
5781
|
+
log.debug("{}: valid_hash_chain_between: break B: {}", .{
|
|
5782
|
+
self.replica,
|
|
5783
|
+
b,
|
|
5784
|
+
});
|
|
5785
|
+
return false;
|
|
5786
|
+
}
|
|
5787
|
+
}
|
|
5788
|
+
|
|
5682
5789
|
if (self.journal.header_with_op(op)) |a| {
|
|
5683
5790
|
assert(a.op + 1 == b.op);
|
|
5684
5791
|
if (a.checksum == b.parent) {
|
|
@@ -5912,18 +6019,20 @@ fn ReplicaFormatType(comptime Storage: type) type {
|
|
|
5912
6019
|
);
|
|
5913
6020
|
errdefer allocator.free(wal_buffer);
|
|
5914
6021
|
|
|
5915
|
-
// The logical offset *within the
|
|
6022
|
+
// The logical offset *within the Zone*.
|
|
6023
|
+
// Even though the prepare zone follows the redundant header zone, write the prepares
|
|
6024
|
+
// first. This allows the test Storage to check the invariant "never write the redundant
|
|
6025
|
+
// header before the prepare".
|
|
5916
6026
|
var wal_offset: u64 = 0;
|
|
5917
|
-
while (wal_offset < config.
|
|
5918
|
-
const size =
|
|
5919
|
-
assert(size % config.sector_size == 0);
|
|
6027
|
+
while (wal_offset < config.journal_size_prepares) {
|
|
6028
|
+
const size = format_wal_prepares(cluster, wal_offset, wal_buffer);
|
|
5920
6029
|
assert(size > 0);
|
|
5921
6030
|
|
|
5922
6031
|
for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
|
|
5923
6032
|
if (std.mem.eql(u8, std.mem.asBytes(header), &header_zeroes)) {
|
|
5924
6033
|
// This is the (empty) body of a reserved or root Prepare.
|
|
5925
6034
|
} else {
|
|
5926
|
-
// This is
|
|
6035
|
+
// This is a Prepare's header.
|
|
5927
6036
|
assert(header.valid_checksum());
|
|
5928
6037
|
if (header.op == 0) {
|
|
5929
6038
|
assert(header.command == .prepare);
|
|
@@ -5939,16 +6048,45 @@ fn ReplicaFormatType(comptime Storage: type) type {
|
|
|
5939
6048
|
format_wal_sectors_callback,
|
|
5940
6049
|
&self.wal_write,
|
|
5941
6050
|
wal_buffer[0..size],
|
|
5942
|
-
.
|
|
6051
|
+
.wal_prepares,
|
|
5943
6052
|
wal_offset,
|
|
5944
6053
|
);
|
|
5945
6054
|
self.formatting = true;
|
|
5946
6055
|
while (self.formatting) storage.tick();
|
|
5947
6056
|
wal_offset += size;
|
|
5948
6057
|
}
|
|
6058
|
+
// There are no prepares left to write.
|
|
6059
|
+
assert(format_wal_prepares(cluster, wal_offset, wal_buffer) == 0);
|
|
6060
|
+
|
|
6061
|
+
wal_offset = 0;
|
|
6062
|
+
while (wal_offset < config.journal_size_headers) {
|
|
6063
|
+
const size = format_wal_headers(cluster, wal_offset, wal_buffer);
|
|
6064
|
+
assert(size > 0);
|
|
6065
|
+
|
|
6066
|
+
for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
|
|
6067
|
+
assert(header.valid_checksum());
|
|
6068
|
+
if (header.op == 0) {
|
|
6069
|
+
assert(header.command == .prepare);
|
|
6070
|
+
assert(header.operation == .root);
|
|
6071
|
+
} else {
|
|
6072
|
+
assert(header.command == .reserved);
|
|
6073
|
+
assert(header.operation == .reserved);
|
|
6074
|
+
}
|
|
6075
|
+
}
|
|
5949
6076
|
|
|
5950
|
-
|
|
5951
|
-
|
|
6077
|
+
storage.write_sectors(
|
|
6078
|
+
format_wal_sectors_callback,
|
|
6079
|
+
&self.wal_write,
|
|
6080
|
+
wal_buffer[0..size],
|
|
6081
|
+
.wal_headers,
|
|
6082
|
+
wal_offset,
|
|
6083
|
+
);
|
|
6084
|
+
self.formatting = true;
|
|
6085
|
+
while (self.formatting) storage.tick();
|
|
6086
|
+
wal_offset += size;
|
|
6087
|
+
}
|
|
6088
|
+
// There are no headers left to write.
|
|
6089
|
+
assert(format_wal_headers(cluster, wal_offset, wal_buffer) == 0);
|
|
5952
6090
|
}
|
|
5953
6091
|
|
|
5954
6092
|
fn format_wal_sectors_callback(write: *Storage.Write) void {
|