tigerbeetle-node 0.11.13 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -10
- package/dist/bin/aarch64-linux-gnu/client.node +0 -0
- package/dist/bin/aarch64-linux-musl/client.node +0 -0
- package/dist/bin/aarch64-macos/client.node +0 -0
- package/dist/bin/x86_64-linux-gnu/client.node +0 -0
- package/dist/bin/x86_64-linux-musl/client.node +0 -0
- package/dist/bin/x86_64-macos/client.node +0 -0
- package/dist/index.js +33 -1
- package/dist/index.js.map +1 -1
- package/package-lock.json +66 -0
- package/package.json +6 -16
- package/src/index.ts +56 -1
- package/src/node.zig +9 -9
- package/dist/.client.node.sha256 +0 -1
- package/scripts/build_lib.sh +0 -61
- package/scripts/download_node_headers.sh +0 -32
- package/src/tigerbeetle/scripts/benchmark.bat +0 -55
- package/src/tigerbeetle/scripts/benchmark.sh +0 -66
- package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
- package/src/tigerbeetle/scripts/fail_on_diff.sh +0 -9
- package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
- package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +0 -12
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
- package/src/tigerbeetle/scripts/install.bat +0 -7
- package/src/tigerbeetle/scripts/install.sh +0 -21
- package/src/tigerbeetle/scripts/install_zig.bat +0 -113
- package/src/tigerbeetle/scripts/install_zig.sh +0 -90
- package/src/tigerbeetle/scripts/lint.zig +0 -199
- package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -55
- package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
- package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
- package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
- package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +0 -12
- package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
- package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
- package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
- package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
- package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
- package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
- package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
- package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
- package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
- package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
- package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
- package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
- package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
- package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
- package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
- package/src/tigerbeetle/src/benchmark.zig +0 -336
- package/src/tigerbeetle/src/config.zig +0 -233
- package/src/tigerbeetle/src/constants.zig +0 -428
- package/src/tigerbeetle/src/ewah.zig +0 -286
- package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
- package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
- package/src/tigerbeetle/src/fifo.zig +0 -120
- package/src/tigerbeetle/src/io/benchmark.zig +0 -213
- package/src/tigerbeetle/src/io/darwin.zig +0 -814
- package/src/tigerbeetle/src/io/linux.zig +0 -1071
- package/src/tigerbeetle/src/io/test.zig +0 -643
- package/src/tigerbeetle/src/io/windows.zig +0 -1183
- package/src/tigerbeetle/src/io.zig +0 -34
- package/src/tigerbeetle/src/iops.zig +0 -107
- package/src/tigerbeetle/src/lsm/README.md +0 -308
- package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
- package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
- package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
- package/src/tigerbeetle/src/lsm/direction.zig +0 -11
- package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
- package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
- package/src/tigerbeetle/src/lsm/forest.zig +0 -205
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -450
- package/src/tigerbeetle/src/lsm/grid.zig +0 -573
- package/src/tigerbeetle/src/lsm/groove.zig +0 -1036
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
- package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
- package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
- package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -878
- package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
- package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
- package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
- package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -381
- package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1329
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
- package/src/tigerbeetle/src/lsm/table.zig +0 -1009
- package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -192
- package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
- package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -203
- package/src/tigerbeetle/src/lsm/test.zig +0 -439
- package/src/tigerbeetle/src/lsm/tree.zig +0 -1169
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -479
- package/src/tigerbeetle/src/message_bus.zig +0 -1013
- package/src/tigerbeetle/src/message_pool.zig +0 -156
- package/src/tigerbeetle/src/ring_buffer.zig +0 -399
- package/src/tigerbeetle/src/simulator.zig +0 -580
- package/src/tigerbeetle/src/state_machine/auditor.zig +0 -578
- package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
- package/src/tigerbeetle/src/state_machine.zig +0 -2099
- package/src/tigerbeetle/src/static_allocator.zig +0 -65
- package/src/tigerbeetle/src/stdx.zig +0 -171
- package/src/tigerbeetle/src/storage.zig +0 -393
- package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
- package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
- package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
- package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
- package/src/tigerbeetle/src/testing/cluster.zig +0 -444
- package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
- package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
- package/src/tigerbeetle/src/testing/id.zig +0 -99
- package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -374
- package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
- package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
- package/src/tigerbeetle/src/testing/state_machine.zig +0 -250
- package/src/tigerbeetle/src/testing/storage.zig +0 -757
- package/src/tigerbeetle/src/testing/table.zig +0 -247
- package/src/tigerbeetle/src/testing/time.zig +0 -84
- package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
- package/src/tigerbeetle/src/time.zig +0 -112
- package/src/tigerbeetle/src/tracer.zig +0 -529
- package/src/tigerbeetle/src/unit_tests.zig +0 -40
- package/src/tigerbeetle/src/vopr.zig +0 -495
- package/src/tigerbeetle/src/vsr/README.md +0 -209
- package/src/tigerbeetle/src/vsr/client.zig +0 -544
- package/src/tigerbeetle/src/vsr/clock.zig +0 -855
- package/src/tigerbeetle/src/vsr/journal.zig +0 -2415
- package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
- package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
- package/src/tigerbeetle/src/vsr/replica.zig +0 -6616
- package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
- package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
- package/src/tigerbeetle/src/vsr.zig +0 -1425
|
@@ -1,757 +0,0 @@
|
|
|
1
|
-
//! In-memory storage, with simulated faults and latency.
|
|
2
|
-
//!
|
|
3
|
-
//!
|
|
4
|
-
//! Fault Injection
|
|
5
|
-
//!
|
|
6
|
-
//! Storage injects faults that the cluster can (i.e. should be able to) recover from.
|
|
7
|
-
//! Each zone can tolerate a different pattern of faults.
|
|
8
|
-
//!
|
|
9
|
-
//! - superblock:
|
|
10
|
-
//! - One read/write fault is permitted per area (section, manifest, …).
|
|
11
|
-
//! - An additional fault is permitted at the target of a pending write during a crash.
|
|
12
|
-
//!
|
|
13
|
-
//! - wal_headers, wal_prepares:
|
|
14
|
-
//! - Read/write faults are distributed between replicas according to ClusterFaultAtlas, to ensure
|
|
15
|
-
//! that at least one replica will have a valid copy to help others repair.
|
|
16
|
-
//! (See: generate_faulty_wal_areas()).
|
|
17
|
-
//! - When a replica crashes, it may fault the WAL outside of ClusterFaultAtlas.
|
|
18
|
-
//! - When replica_count=1, its WAL can only be corrupted by a crash, never a read/write.
|
|
19
|
-
//! (When replica_count=1, there are no other replicas to assist with repair).
|
|
20
|
-
//!
|
|
21
|
-
//! - grid: (TODO: Enable grid faults when grid repair is implemented).
|
|
22
|
-
//!
|
|
23
|
-
const std = @import("std");
|
|
24
|
-
const assert = std.debug.assert;
|
|
25
|
-
const math = std.math;
|
|
26
|
-
const mem = std.mem;
|
|
27
|
-
|
|
28
|
-
const FIFO = @import("../fifo.zig").FIFO;
|
|
29
|
-
const constants = @import("../constants.zig");
|
|
30
|
-
const vsr = @import("../vsr.zig");
|
|
31
|
-
const superblock = @import("../vsr/superblock.zig");
|
|
32
|
-
const BlockType = @import("../lsm/grid.zig").BlockType;
|
|
33
|
-
const stdx = @import("../stdx.zig");
|
|
34
|
-
const PriorityQueue = @import("./priority_queue.zig").PriorityQueue;
|
|
35
|
-
const fuzz = @import("./fuzz.zig");
|
|
36
|
-
const hash_log = @import("./hash_log.zig");
|
|
37
|
-
|
|
38
|
-
const log = std.log.scoped(.storage);
|
|
39
|
-
|
|
40
|
-
// TODOs:
|
|
41
|
-
// less than a majority of replicas may have corruption
|
|
42
|
-
// have an option to enable/disable the following corruption types:
|
|
43
|
-
// bitrot
|
|
44
|
-
// misdirected read/write
|
|
45
|
-
// corrupt sector
|
|
46
|
-
// latent sector error
|
|
47
|
-
// - emulate by zeroing sector, as this is how we handle this in the real Storage implementation
|
|
48
|
-
// - likely that surrounding sectors also corrupt
|
|
49
|
-
// - likely that stuff written at the same time is also corrupt even if written to a far away sector
|
|
50
|
-
pub const Storage = struct {
|
|
51
|
-
/// Options for fault injection during fuzz testing
|
|
52
|
-
pub const Options = struct {
|
|
53
|
-
/// Seed for the storage PRNG.
|
|
54
|
-
seed: u64 = 0,
|
|
55
|
-
|
|
56
|
-
/// Required when `fault_atlas` is set.
|
|
57
|
-
replica_index: ?u8 = null,
|
|
58
|
-
|
|
59
|
-
/// Minimum number of ticks it may take to read data.
|
|
60
|
-
read_latency_min: u64,
|
|
61
|
-
/// Average number of ticks it may take to read data. Must be >= read_latency_min.
|
|
62
|
-
read_latency_mean: u64,
|
|
63
|
-
/// Minimum number of ticks it may take to write data.
|
|
64
|
-
write_latency_min: u64,
|
|
65
|
-
/// Average number of ticks it may take to write data. Must be >= write_latency_min.
|
|
66
|
-
write_latency_mean: u64,
|
|
67
|
-
|
|
68
|
-
/// Chance out of 100 that a read will corrupt a sector, if the target memory is within
|
|
69
|
-
/// a faulty area of this replica.
|
|
70
|
-
read_fault_probability: u8 = 0,
|
|
71
|
-
/// Chance out of 100 that a write will corrupt a sector, if the target memory is within
|
|
72
|
-
/// a faulty area of this replica.
|
|
73
|
-
write_fault_probability: u8 = 0,
|
|
74
|
-
/// Chance out of 100 that a crash will corrupt a sector of a pending write's target,
|
|
75
|
-
/// if the target memory is within a faulty area of this replica.
|
|
76
|
-
crash_fault_probability: u8 = 0,
|
|
77
|
-
|
|
78
|
-
/// Enable/disable automatic read/write faults.
|
|
79
|
-
/// Does not impact crash faults or manual faults.
|
|
80
|
-
fault_atlas: ?*const ClusterFaultAtlas = null,
|
|
81
|
-
};
|
|
82
|
-
|
|
83
|
-
/// See usage in Journal.write_sectors() for details.
|
|
84
|
-
/// TODO: allow testing in both modes.
|
|
85
|
-
pub const synchronicity: enum {
|
|
86
|
-
always_synchronous,
|
|
87
|
-
always_asynchronous,
|
|
88
|
-
} = .always_asynchronous;
|
|
89
|
-
|
|
90
|
-
pub const Read = struct {
|
|
91
|
-
callback: fn (read: *Storage.Read) void,
|
|
92
|
-
buffer: []u8,
|
|
93
|
-
zone: vsr.Zone,
|
|
94
|
-
/// Relative offset within the zone.
|
|
95
|
-
offset: u64,
|
|
96
|
-
/// Tick at which this read is considered "completed" and the callback should be called.
|
|
97
|
-
done_at_tick: u64,
|
|
98
|
-
|
|
99
|
-
fn less_than(context: void, a: *Read, b: *Read) math.Order {
|
|
100
|
-
_ = context;
|
|
101
|
-
|
|
102
|
-
return math.order(a.done_at_tick, b.done_at_tick);
|
|
103
|
-
}
|
|
104
|
-
};
|
|
105
|
-
|
|
106
|
-
pub const Write = struct {
|
|
107
|
-
callback: fn (write: *Storage.Write) void,
|
|
108
|
-
buffer: []const u8,
|
|
109
|
-
zone: vsr.Zone,
|
|
110
|
-
/// Relative offset within the zone.
|
|
111
|
-
offset: u64,
|
|
112
|
-
/// Tick at which this write is considered "completed" and the callback should be called.
|
|
113
|
-
done_at_tick: u64,
|
|
114
|
-
|
|
115
|
-
fn less_than(context: void, a: *Write, b: *Write) math.Order {
|
|
116
|
-
_ = context;
|
|
117
|
-
|
|
118
|
-
return math.order(a.done_at_tick, b.done_at_tick);
|
|
119
|
-
}
|
|
120
|
-
};
|
|
121
|
-
|
|
122
|
-
pub const NextTick = struct {
|
|
123
|
-
next: ?*NextTick = null,
|
|
124
|
-
callback: fn (next_tick: *NextTick) void,
|
|
125
|
-
};
|
|
126
|
-
|
|
127
|
-
allocator: mem.Allocator,
|
|
128
|
-
|
|
129
|
-
size: u64,
|
|
130
|
-
options: Options,
|
|
131
|
-
prng: std.rand.DefaultPrng,
|
|
132
|
-
|
|
133
|
-
memory: []align(constants.sector_size) u8,
|
|
134
|
-
/// Set bits correspond to sectors that have ever been written to.
|
|
135
|
-
memory_written: std.DynamicBitSetUnmanaged,
|
|
136
|
-
/// Set bits correspond to faulty sectors. The underlying sectors of `memory` is left clean.
|
|
137
|
-
faults: std.DynamicBitSetUnmanaged,
|
|
138
|
-
|
|
139
|
-
/// Whether to enable faults (when false, this supersedes `faulty_wal_areas`).
|
|
140
|
-
/// This is used to disable faults during the replica's first startup.
|
|
141
|
-
faulty: bool = true,
|
|
142
|
-
|
|
143
|
-
reads: PriorityQueue(*Storage.Read, void, Storage.Read.less_than),
|
|
144
|
-
writes: PriorityQueue(*Storage.Write, void, Storage.Write.less_than),
|
|
145
|
-
|
|
146
|
-
ticks: u64 = 0,
|
|
147
|
-
next_tick_queue: FIFO(NextTick) = .{},
|
|
148
|
-
|
|
149
|
-
pub fn init(allocator: mem.Allocator, size: u64, options: Storage.Options) !Storage {
|
|
150
|
-
assert(options.write_latency_mean >= options.write_latency_min);
|
|
151
|
-
assert(options.read_latency_mean >= options.read_latency_min);
|
|
152
|
-
assert(options.fault_atlas == null or options.replica_index != null);
|
|
153
|
-
|
|
154
|
-
var prng = std.rand.DefaultPrng.init(options.seed);
|
|
155
|
-
const sector_count = @divExact(size, constants.sector_size);
|
|
156
|
-
const memory = try allocator.allocAdvanced(u8, constants.sector_size, size, .exact);
|
|
157
|
-
errdefer allocator.free(memory);
|
|
158
|
-
// TODO: random data
|
|
159
|
-
mem.set(u8, memory, 0);
|
|
160
|
-
|
|
161
|
-
var memory_written = try std.DynamicBitSetUnmanaged.initEmpty(allocator, sector_count);
|
|
162
|
-
errdefer memory_written.deinit(allocator);
|
|
163
|
-
|
|
164
|
-
var faults = try std.DynamicBitSetUnmanaged.initEmpty(allocator, sector_count);
|
|
165
|
-
errdefer faults.deinit(allocator);
|
|
166
|
-
|
|
167
|
-
var reads = PriorityQueue(*Storage.Read, void, Storage.Read.less_than).init(allocator, {});
|
|
168
|
-
errdefer reads.deinit();
|
|
169
|
-
try reads.ensureTotalCapacity(constants.iops_read_max);
|
|
170
|
-
|
|
171
|
-
var writes = PriorityQueue(*Storage.Write, void, Storage.Write.less_than).init(allocator, {});
|
|
172
|
-
errdefer writes.deinit();
|
|
173
|
-
try writes.ensureTotalCapacity(constants.iops_write_max);
|
|
174
|
-
|
|
175
|
-
return Storage{
|
|
176
|
-
.allocator = allocator,
|
|
177
|
-
.size = size,
|
|
178
|
-
.options = options,
|
|
179
|
-
.prng = prng,
|
|
180
|
-
.memory = memory,
|
|
181
|
-
.memory_written = memory_written,
|
|
182
|
-
.faults = faults,
|
|
183
|
-
.reads = reads,
|
|
184
|
-
.writes = writes,
|
|
185
|
-
};
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
pub fn deinit(storage: *Storage, allocator: mem.Allocator) void {
|
|
189
|
-
allocator.free(storage.memory);
|
|
190
|
-
storage.memory_written.deinit(allocator);
|
|
191
|
-
storage.faults.deinit(allocator);
|
|
192
|
-
storage.reads.deinit();
|
|
193
|
-
storage.writes.deinit();
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
/// Cancel any currently in-progress reads/writes.
|
|
197
|
-
/// Corrupt the target sectors of any in-progress writes.
|
|
198
|
-
pub fn reset(storage: *Storage) void {
|
|
199
|
-
while (storage.writes.peek()) |_| {
|
|
200
|
-
const write = storage.writes.remove();
|
|
201
|
-
if (!storage.x_in_100(storage.options.crash_fault_probability)) continue;
|
|
202
|
-
|
|
203
|
-
// Randomly corrupt one of the faulty sectors the operation targeted.
|
|
204
|
-
// TODO: inject more realistic and varied storage faults as described above.
|
|
205
|
-
const sectors = SectorRange.from_zone(write.zone, write.offset, write.buffer.len);
|
|
206
|
-
storage.fault_sector(write.zone, sectors.random(storage.prng.random()));
|
|
207
|
-
}
|
|
208
|
-
assert(storage.writes.len == 0);
|
|
209
|
-
|
|
210
|
-
storage.reads.len = 0;
|
|
211
|
-
storage.next_tick_queue = .{};
|
|
212
|
-
}
|
|
213
|
-
|
|
214
|
-
/// Returns the number of bytes that have been written to, assuming that (the simulated)
|
|
215
|
-
/// `fallocate()` creates a sparse file.
|
|
216
|
-
pub fn size_used(storage: *const Storage) usize {
|
|
217
|
-
return storage.memory_written.count() * constants.sector_size;
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
/// Copy state from `origin` to `storage`:
|
|
221
|
-
///
|
|
222
|
-
/// - ticks
|
|
223
|
-
/// - memory
|
|
224
|
-
/// - occupied memory
|
|
225
|
-
/// - faulty sectors
|
|
226
|
-
/// - reads in-progress
|
|
227
|
-
/// - writes in-progress
|
|
228
|
-
///
|
|
229
|
-
/// Both instances must have an identical size.
|
|
230
|
-
pub fn copy(storage: *Storage, origin: *const Storage) void {
|
|
231
|
-
assert(storage.size == origin.size);
|
|
232
|
-
|
|
233
|
-
storage.ticks = origin.ticks;
|
|
234
|
-
stdx.copy_disjoint(.exact, u8, storage.memory, origin.memory);
|
|
235
|
-
storage.memory_written.toggleSet(storage.memory_written);
|
|
236
|
-
storage.memory_written.toggleSet(origin.memory_written);
|
|
237
|
-
storage.faults.toggleSet(storage.faults);
|
|
238
|
-
storage.faults.toggleSet(origin.faults);
|
|
239
|
-
|
|
240
|
-
storage.reads.len = 0;
|
|
241
|
-
for (origin.reads.items[0..origin.reads.len]) |read| {
|
|
242
|
-
storage.reads.add(read) catch unreachable;
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
storage.writes.len = 0;
|
|
246
|
-
for (origin.writes.items[0..origin.writes.len]) |write| {
|
|
247
|
-
storage.writes.add(write) catch unreachable;
|
|
248
|
-
}
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
pub fn tick(storage: *Storage) void {
|
|
252
|
-
storage.ticks += 1;
|
|
253
|
-
|
|
254
|
-
while (storage.reads.peek()) |read| {
|
|
255
|
-
if (read.done_at_tick > storage.ticks) break;
|
|
256
|
-
_ = storage.reads.remove();
|
|
257
|
-
storage.read_sectors_finish(read);
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
while (storage.writes.peek()) |write| {
|
|
261
|
-
if (write.done_at_tick > storage.ticks) break;
|
|
262
|
-
_ = storage.writes.remove();
|
|
263
|
-
storage.write_sectors_finish(write);
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
while (storage.next_tick_queue.pop()) |next_tick| {
|
|
267
|
-
next_tick.callback(next_tick);
|
|
268
|
-
}
|
|
269
|
-
}
|
|
270
|
-
|
|
271
|
-
pub fn on_next_tick(
|
|
272
|
-
storage: *Storage,
|
|
273
|
-
callback: fn (next_tick: *Storage.NextTick) void,
|
|
274
|
-
next_tick: *Storage.NextTick,
|
|
275
|
-
) void {
|
|
276
|
-
next_tick.* = .{ .callback = callback };
|
|
277
|
-
storage.next_tick_queue.push(next_tick);
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
/// * Verifies that the read fits within the target sector.
|
|
281
|
-
/// * Verifies that the read targets sectors that have been written to.
|
|
282
|
-
pub fn read_sectors(
|
|
283
|
-
storage: *Storage,
|
|
284
|
-
callback: fn (read: *Storage.Read) void,
|
|
285
|
-
read: *Storage.Read,
|
|
286
|
-
buffer: []u8,
|
|
287
|
-
zone: vsr.Zone,
|
|
288
|
-
offset_in_zone: u64,
|
|
289
|
-
) void {
|
|
290
|
-
hash_log.emit_autohash(.{ buffer, zone, offset_in_zone }, .DeepRecursive);
|
|
291
|
-
|
|
292
|
-
verify_alignment(buffer);
|
|
293
|
-
|
|
294
|
-
var sectors = SectorRange.from_zone(zone, offset_in_zone, buffer.len);
|
|
295
|
-
while (sectors.next()) |sector| assert(storage.memory_written.isSet(sector));
|
|
296
|
-
|
|
297
|
-
read.* = .{
|
|
298
|
-
.callback = callback,
|
|
299
|
-
.buffer = buffer,
|
|
300
|
-
.zone = zone,
|
|
301
|
-
.offset = offset_in_zone,
|
|
302
|
-
.done_at_tick = storage.ticks + storage.read_latency(),
|
|
303
|
-
};
|
|
304
|
-
|
|
305
|
-
// We ensure the capacity is sufficient for constants.iops_read_max in init()
|
|
306
|
-
storage.reads.add(read) catch unreachable;
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
fn read_sectors_finish(storage: *Storage, read: *Storage.Read) void {
|
|
310
|
-
hash_log.emit_autohash(.{ read.buffer, read.zone, read.offset }, .DeepRecursive);
|
|
311
|
-
|
|
312
|
-
const offset_in_storage = read.zone.offset(read.offset);
|
|
313
|
-
stdx.copy_disjoint(
|
|
314
|
-
.exact,
|
|
315
|
-
u8,
|
|
316
|
-
read.buffer,
|
|
317
|
-
storage.memory[offset_in_storage..][0..read.buffer.len],
|
|
318
|
-
);
|
|
319
|
-
|
|
320
|
-
if (storage.x_in_100(storage.options.read_fault_probability)) {
|
|
321
|
-
storage.fault_faulty_sectors(read.zone, read.offset, read.buffer.len);
|
|
322
|
-
}
|
|
323
|
-
|
|
324
|
-
if (storage.faulty) {
|
|
325
|
-
// Corrupt faulty sectors.
|
|
326
|
-
var sectors = SectorRange.from_zone(read.zone, read.offset, read.buffer.len);
|
|
327
|
-
const sectors_min = sectors.min;
|
|
328
|
-
while (sectors.next()) |sector| {
|
|
329
|
-
if (storage.faults.isSet(sector)) {
|
|
330
|
-
const faulty_sector_offset = (sector - sectors_min) * constants.sector_size;
|
|
331
|
-
const faulty_sector_bytes = read.buffer[faulty_sector_offset..][0..constants.sector_size];
|
|
332
|
-
storage.prng.random().bytes(faulty_sector_bytes);
|
|
333
|
-
}
|
|
334
|
-
}
|
|
335
|
-
}
|
|
336
|
-
|
|
337
|
-
read.callback(read);
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
pub fn write_sectors(
|
|
341
|
-
storage: *Storage,
|
|
342
|
-
callback: fn (write: *Storage.Write) void,
|
|
343
|
-
write: *Storage.Write,
|
|
344
|
-
buffer: []const u8,
|
|
345
|
-
zone: vsr.Zone,
|
|
346
|
-
offset_in_zone: u64,
|
|
347
|
-
) void {
|
|
348
|
-
hash_log.emit_autohash(.{ buffer, zone, offset_in_zone }, .DeepRecursive);
|
|
349
|
-
|
|
350
|
-
verify_alignment(buffer);
|
|
351
|
-
|
|
352
|
-
// Verify that there are no concurrent overlapping writes.
|
|
353
|
-
var iterator = storage.writes.iterator();
|
|
354
|
-
while (iterator.next()) |other| {
|
|
355
|
-
if (other.zone != zone) continue;
|
|
356
|
-
assert(offset_in_zone + buffer.len <= other.offset or
|
|
357
|
-
other.offset + other.buffer.len <= offset_in_zone);
|
|
358
|
-
}
|
|
359
|
-
|
|
360
|
-
write.* = .{
|
|
361
|
-
.callback = callback,
|
|
362
|
-
.buffer = buffer,
|
|
363
|
-
.zone = zone,
|
|
364
|
-
.offset = offset_in_zone,
|
|
365
|
-
.done_at_tick = storage.ticks + storage.write_latency(),
|
|
366
|
-
};
|
|
367
|
-
|
|
368
|
-
// We ensure the capacity is sufficient for constants.iops_write_max in init()
|
|
369
|
-
storage.writes.add(write) catch unreachable;
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
fn write_sectors_finish(storage: *Storage, write: *Storage.Write) void {
|
|
373
|
-
hash_log.emit_autohash(.{ write.buffer, write.zone, write.offset }, .DeepRecursive);
|
|
374
|
-
|
|
375
|
-
const offset_in_storage = write.zone.offset(write.offset);
|
|
376
|
-
stdx.copy_disjoint(
|
|
377
|
-
.exact,
|
|
378
|
-
u8,
|
|
379
|
-
storage.memory[offset_in_storage..][0..write.buffer.len],
|
|
380
|
-
write.buffer,
|
|
381
|
-
);
|
|
382
|
-
|
|
383
|
-
var sectors = SectorRange.from_zone(write.zone, write.offset, write.buffer.len);
|
|
384
|
-
while (sectors.next()) |sector| {
|
|
385
|
-
storage.faults.unset(sector);
|
|
386
|
-
storage.memory_written.set(sector);
|
|
387
|
-
}
|
|
388
|
-
|
|
389
|
-
if (storage.x_in_100(storage.options.write_fault_probability)) {
|
|
390
|
-
storage.fault_faulty_sectors(write.zone, write.offset, write.buffer.len);
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
write.callback(write);
|
|
394
|
-
}
|
|
395
|
-
|
|
396
|
-
fn read_latency(storage: *Storage) u64 {
|
|
397
|
-
return storage.latency(storage.options.read_latency_min, storage.options.read_latency_mean);
|
|
398
|
-
}
|
|
399
|
-
|
|
400
|
-
fn write_latency(storage: *Storage) u64 {
|
|
401
|
-
return storage.latency(storage.options.write_latency_min, storage.options.write_latency_mean);
|
|
402
|
-
}
|
|
403
|
-
|
|
404
|
-
fn latency(storage: *Storage, min: u64, mean: u64) u64 {
|
|
405
|
-
return min + fuzz.random_int_exponential(storage.prng.random(), u64, mean - min);
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
/// Return true with probability x/100.
|
|
409
|
-
fn x_in_100(storage: *Storage, x: u8) bool {
|
|
410
|
-
assert(x <= 100);
|
|
411
|
-
return x > storage.prng.random().uintLessThan(u8, 100);
|
|
412
|
-
}
|
|
413
|
-
|
|
414
|
-
fn fault_faulty_sectors(storage: *Storage, zone: vsr.Zone, offset_in_zone: u64, size: u64) void {
|
|
415
|
-
const atlas = storage.options.fault_atlas orelse return;
|
|
416
|
-
const replica_index = storage.options.replica_index.?;
|
|
417
|
-
const faulty_sectors = switch (zone) {
|
|
418
|
-
.superblock => atlas.faulty_superblock(replica_index, offset_in_zone, size),
|
|
419
|
-
.wal_headers => atlas.faulty_wal_headers(replica_index, offset_in_zone, size),
|
|
420
|
-
.wal_prepares => atlas.faulty_wal_prepares(replica_index, offset_in_zone, size),
|
|
421
|
-
.grid => null,
|
|
422
|
-
} orelse return;
|
|
423
|
-
|
|
424
|
-
// Randomly corrupt one of the faulty sectors the operation targeted.
|
|
425
|
-
// TODO: inject more realistic and varied storage faults as described above.
|
|
426
|
-
storage.fault_sector(zone, faulty_sectors.random(storage.prng.random()));
|
|
427
|
-
}
|
|
428
|
-
|
|
429
|
-
fn fault_sector(storage: *Storage, zone: vsr.Zone, sector: usize) void {
|
|
430
|
-
storage.faults.set(sector);
|
|
431
|
-
if (storage.options.replica_index) |replica_index| {
|
|
432
|
-
log.debug("{}: corrupting sector at zone={} offset={}", .{
|
|
433
|
-
replica_index,
|
|
434
|
-
zone,
|
|
435
|
-
sector * constants.sector_size - zone.offset(0),
|
|
436
|
-
});
|
|
437
|
-
}
|
|
438
|
-
}
|
|
439
|
-
|
|
440
|
-
pub fn superblock_header(
|
|
441
|
-
storage: *const Storage,
|
|
442
|
-
copy_: u8,
|
|
443
|
-
) *const superblock.SuperBlockHeader {
|
|
444
|
-
const offset = vsr.Zone.superblock.offset(superblock.areas.header.offset(copy_));
|
|
445
|
-
const bytes = storage.memory[offset..][0..superblock.areas.header.size_max];
|
|
446
|
-
return mem.bytesAsValue(superblock.SuperBlockHeader, bytes);
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
pub fn wal_headers(storage: *const Storage) []const vsr.Header {
|
|
450
|
-
const offset = vsr.Zone.wal_headers.offset(0);
|
|
451
|
-
const size = vsr.Zone.wal_headers.size().?;
|
|
452
|
-
return mem.bytesAsSlice(vsr.Header, storage.memory[offset..][0..size]);
|
|
453
|
-
}
|
|
454
|
-
|
|
455
|
-
const MessageRaw = extern struct {
|
|
456
|
-
header: vsr.Header,
|
|
457
|
-
body: [constants.message_size_max - @sizeOf(vsr.Header)]u8,
|
|
458
|
-
|
|
459
|
-
comptime {
|
|
460
|
-
assert(@sizeOf(MessageRaw) == constants.message_size_max);
|
|
461
|
-
assert(@sizeOf(MessageRaw) * 8 == @bitSizeOf(MessageRaw));
|
|
462
|
-
}
|
|
463
|
-
};
|
|
464
|
-
|
|
465
|
-
pub fn wal_prepares(storage: *const Storage) []const MessageRaw {
|
|
466
|
-
const offset = vsr.Zone.wal_prepares.offset(0);
|
|
467
|
-
const size = vsr.Zone.wal_prepares.size().?;
|
|
468
|
-
return mem.bytesAsSlice(MessageRaw, storage.memory[offset..][0..size]);
|
|
469
|
-
}
|
|
470
|
-
|
|
471
|
-
pub fn grid_block(
|
|
472
|
-
storage: *const Storage,
|
|
473
|
-
address: u64,
|
|
474
|
-
) *align(constants.sector_size) [constants.block_size]u8 {
|
|
475
|
-
assert(address > 0);
|
|
476
|
-
|
|
477
|
-
const block_offset = vsr.Zone.grid.offset((address - 1) * constants.block_size);
|
|
478
|
-
const block_header = mem.bytesToValue(
|
|
479
|
-
vsr.Header,
|
|
480
|
-
storage.memory[block_offset..][0..@sizeOf(vsr.Header)],
|
|
481
|
-
);
|
|
482
|
-
assert(storage.memory_written.isSet(@divExact(block_offset, constants.sector_size)));
|
|
483
|
-
assert(block_header.valid_checksum());
|
|
484
|
-
assert(block_header.size <= constants.block_size);
|
|
485
|
-
|
|
486
|
-
return storage.memory[block_offset..][0..constants.block_size];
|
|
487
|
-
}
|
|
488
|
-
};
|
|
489
|
-
|
|
490
|
-
fn verify_alignment(buffer: []const u8) void {
|
|
491
|
-
assert(buffer.len > 0);
|
|
492
|
-
|
|
493
|
-
// Ensure that the read or write is aligned correctly for Direct I/O:
|
|
494
|
-
// If this is not the case, the underlying syscall will return EINVAL.
|
|
495
|
-
assert(@mod(@ptrToInt(buffer.ptr), constants.sector_size) == 0);
|
|
496
|
-
assert(@mod(buffer.len, constants.sector_size) == 0);
|
|
497
|
-
}
|
|
498
|
-
|
|
499
|
-
pub const Area = union(enum) {
|
|
500
|
-
superblock: struct { area: superblock.Area, copy: u8 },
|
|
501
|
-
wal_headers: struct { sector: usize },
|
|
502
|
-
wal_prepares: struct { slot: usize },
|
|
503
|
-
grid: struct { address: u64 },
|
|
504
|
-
|
|
505
|
-
fn sectors(area: Area) SectorRange {
|
|
506
|
-
switch (area) {
|
|
507
|
-
.superblock => |data| SectorRange.from_zone(
|
|
508
|
-
.superblock,
|
|
509
|
-
@field(superblock.areas, data.area).offset(data.copy),
|
|
510
|
-
@field(superblock.areas, data.area).size_max,
|
|
511
|
-
),
|
|
512
|
-
.wal_headers => |data| SectorRange.from_zone(
|
|
513
|
-
.wal_headers,
|
|
514
|
-
constants.sector_size * data.sector,
|
|
515
|
-
constants.sector_size,
|
|
516
|
-
),
|
|
517
|
-
.wal_prepares => |data| SectorRange.from_zone(
|
|
518
|
-
.wal_prepares,
|
|
519
|
-
constants.message_size_max * data.slot,
|
|
520
|
-
constants.message_size_max,
|
|
521
|
-
),
|
|
522
|
-
.grid => |data| SectorRange.from_zone(
|
|
523
|
-
.grid,
|
|
524
|
-
constants.block_size * (data.address - 1),
|
|
525
|
-
constants.block_size,
|
|
526
|
-
),
|
|
527
|
-
}
|
|
528
|
-
}
|
|
529
|
-
};
|
|
530
|
-
|
|
531
|
-
/// A half-open interval of sector indices within storage.
const SectorRange = struct {
    min: usize, // inclusive sector index
    max: usize, // exclusive sector index

    /// The sectors spanned by `size` bytes at `offset_in_zone` within `zone`.
    fn from_zone(
        zone: vsr.Zone,
        offset_in_zone: u64,
        size: usize,
    ) SectorRange {
        return from_offset(zone.offset(offset_in_zone), size);
    }

    /// The sectors spanned by `size` bytes at an absolute storage offset.
    /// Both offset and size must be sector multiples (`@divExact` asserts this).
    fn from_offset(offset_in_storage: u64, size: usize) SectorRange {
        const sector_first = @divExact(offset_in_storage, constants.sector_size);
        const sector_past_last = @divExact(offset_in_storage + size, constants.sector_size);
        return .{
            .min = sector_first,
            .max = sector_past_last,
        };
    }

    /// A uniformly-random sector index drawn from within the range.
    fn random(range: SectorRange, rand: std.rand.Random) usize {
        const sector_count = range.max - range.min;
        return range.min + rand.uintLessThan(usize, sector_count);
    }

    /// Iterator: yields each sector index in ascending order, then null.
    /// Consumes the range as it advances.
    fn next(range: *SectorRange) ?usize {
        if (range.min == range.max) return null;
        const sector = range.min;
        range.min += 1;
        return sector;
    }

    /// The overlap of two ranges, or null when they are disjoint.
    fn intersect(a: SectorRange, b: SectorRange) ?SectorRange {
        const disjoint = a.max <= b.min or b.max <= a.min;
        if (disjoint) return null;
        return SectorRange{
            .min = std.math.max(a.min, b.min),
            .max = std.math.min(a.max, b.max),
        };
    }
};
|
|
569
|
-
|
|
570
|
-
/// To ensure the cluster can recover, each header/prepare/block must be valid (not faulty) at
/// a majority of replicas.
///
/// We can't allow WAL storage faults for the same message in a majority of
/// the replicas as that would make recovery impossible. Instead, we only
/// allow faults in certain areas which differ between replicas.
// TODO Support total superblock corruption, forcing a full state transfer.
pub const ClusterFaultAtlas = struct {
    /// Per-zone toggles: a zone whose flag is false never has faults injected.
    pub const Options = struct {
        faulty_superblock: bool,
        faulty_wal_headers: bool,
        faulty_wal_prepares: bool,
        // TODO grid
    };

    /// This is the maximum number of faults per-trailer-area that can be safely injected on a read
    /// or write to the superblock zone.
    ///
    /// It does not include the additional "torn write" fault injected upon a crash.
    ///
    /// For SuperBlockHeader, checkpoint() and view_change() require 3/4 valid headers (1
    /// fault). Trailers are likewise 3/4 + 1 fault — consider if two faults were injected:
    /// 1. `SuperBlock.checkpoint()` for sequence=6.
    ///    - write copy 0, corrupt manifest (fault_count=1)
    ///    - write copy 1, corrupt manifest (fault_count=2) !
    /// 2. Crash. Recover.
    /// 3. `SuperBlock.open()`. The highest valid quorum is sequence=6, but there is no
    ///    valid manifest.
    const superblock_trailer_faults_max = @divExact(constants.superblock_copies, 2) - 1;

    comptime {
        // At least one trailer fault must be injectable, or the atlas is pointless.
        assert(superblock_trailer_faults_max >= 1);
    }

    // One bit per superblock copy / per replica, respectively.
    const CopySet = std.StaticBitSet(constants.superblock_copies);
    const ReplicaSet = std.StaticBitSet(constants.replicas_max);
    // How many prepare headers fit in one sector of the WAL headers zone.
    const headers_per_sector = @divExact(constants.sector_size, @sizeOf(vsr.Header));
    const header_sectors = @divExact(constants.journal_slot_count, headers_per_sector);

    // For each superblock area, the set of copies whose instance of that area is faulty.
    const FaultySuperBlockAreas = std.enums.EnumArray(superblock.Area, CopySet);
    // One bit per sector of the WAL headers zone.
    const FaultyWALHeaders = std.StaticBitSet(@divExact(
        constants.journal_size_headers,
        constants.sector_size,
    ));

    options: Options,
    // Shared by all replicas: which (area, copy) pairs are faulty.
    faulty_superblock_areas: FaultySuperBlockAreas =
        FaultySuperBlockAreas.initFill(CopySet.initEmpty()),
    // Per replica: which WAL header sectors are faulty. The same bitset also
    // selects the faulty WAL prepare chunks (see faulty_wal_prepares below).
    faulty_wal_header_sectors: [constants.replicas_max]FaultyWALHeaders =
        [_]FaultyWALHeaders{FaultyWALHeaders.initEmpty()} ** constants.replicas_max,

    /// Randomly assigns the faulty superblock copies and faulty WAL header
    /// sectors for a cluster of `replica_count` replicas, respecting `options`.
    pub fn init(replica_count: u8, random: std.rand.Random, options: Options) ClusterFaultAtlas {
        // If there is only one replica in the cluster, WAL/Grid faults are not recoverable.
        assert(replica_count > 1 or options.faulty_wal_headers == false);
        assert(replica_count > 1 or options.faulty_wal_prepares == false);

        var atlas = ClusterFaultAtlas{ .options = options };

        for (&atlas.faulty_superblock_areas.values) |*copies, area| {
            if (area == @enumToInt(superblock.Area.header)) {
                // Only inject read/write faults into trailers, not the header.
                // This prevents the quorum from being lost like so:
                // - copy₀: B (ok)
                // - copy₁: B (torn write)
                // - copy₂: A (corrupt)
                // - copy₃: A (ok)
            } else {
                // Mark up to superblock_trailer_faults_max copies of this trailer
                // area as faulty. (Duplicate draws may hit the same copy, so the
                // bound is an upper bound, not an exact count.)
                var area_faults: usize = 0;
                while (area_faults < superblock_trailer_faults_max) : (area_faults += 1) {
                    copies.set(random.uintLessThan(usize, constants.superblock_copies));
                }
            }
        }

        // A cluster-of-2 is special-cased to mirror the special case in replica.zig.
        // See repair_prepare()/on_nack_prepare().
        const quorums = vsr.quorums(replica_count);
        const faults_max = if (replica_count == 2) 1 else replica_count - quorums.replication;
        assert(faults_max < replica_count);
        assert(faults_max > 0 or replica_count == 1);

        // For each WAL header sector, choose exactly faults_max distinct replicas
        // to be faulty there (re-setting an already-set bit does not grow count()).
        var wal_header_sectors = [_]ReplicaSet{ReplicaSet.initEmpty()} ** header_sectors;
        for (wal_header_sectors) |*wal_header_sector, sector| {
            while (wal_header_sector.count() < faults_max) {
                const replica_index = random.uintLessThan(u8, replica_count);
                wal_header_sector.set(replica_index);
                atlas.faulty_wal_header_sectors[replica_index].set(sector);
            }
        }

        return atlas;
    }

    /// Returns a range of faulty sectors which intersect the specified range.
    fn faulty_superblock(
        atlas: ClusterFaultAtlas,
        replica_index: usize,
        offset_in_zone: u64,
        size: u64,
    ) ?SectorRange {
        // Superblock faults are shared by all replicas (faulty_superblock_areas
        // is not per-replica), so the replica index is unused.
        _ = replica_index;
        if (!atlas.options.faulty_superblock) return null;

        // Locate which copy, and which area within that copy, the access touches.
        const copy = @divFloor(offset_in_zone, superblock.superblock_copy_size);
        const offset_in_copy = offset_in_zone % superblock.superblock_copy_size;
        // NOTE(review): this assumes every access begins exactly at an area's
        // base offset — any other offset hits `unreachable`. Confirm callers
        // only ever read/write whole areas.
        const area: superblock.Area = switch (offset_in_copy) {
            superblock.areas.header.base => .header,
            superblock.areas.manifest.base => .manifest,
            superblock.areas.free_set.base => .free_set,
            superblock.areas.client_table.base => .client_table,
            else => unreachable,
        };

        if (atlas.faulty_superblock_areas.get(area).isSet(copy)) {
            // The whole accessed range is reported as faulty.
            return SectorRange.from_zone(.superblock, offset_in_zone, size);
        } else {
            return null;
        }
    }

    /// Returns a range of faulty sectors which intersect the specified range.
    fn faulty_wal_headers(
        atlas: ClusterFaultAtlas,
        replica_index: usize,
        offset_in_zone: u64,
        size: u64,
    ) ?SectorRange {
        if (!atlas.options.faulty_wal_headers) return null;
        // Header faults are tracked at sector granularity: one chunk = one sector.
        return faulty_sectors(
            FaultyWALHeaders.bit_length,
            constants.sector_size,
            .wal_headers,
            &atlas.faulty_wal_header_sectors[replica_index],
            offset_in_zone,
            size,
        );
    }

    /// Returns a range of faulty sectors which intersect the specified range.
    fn faulty_wal_prepares(
        atlas: ClusterFaultAtlas,
        replica_index: usize,
        offset_in_zone: u64,
        size: u64,
    ) ?SectorRange {
        if (!atlas.options.faulty_wal_prepares) return null;
        // Reuses the header-sector bitset: one header sector covers
        // headers_per_sector slots, and each slot spans message_size_max bytes,
        // hence the chunk size message_size_max * headers_per_sector. This keeps
        // a replica's header faults and prepare faults co-located.
        return faulty_sectors(
            FaultyWALHeaders.bit_length,
            constants.message_size_max * headers_per_sector,
            .wal_prepares,
            &atlas.faulty_wal_header_sectors[replica_index],
            offset_in_zone,
            size,
        );
    }

    /// Returns the sectors of the first contiguous run of faulty chunks that
    /// overlaps [offset_in_zone, offset_in_zone + size), clipped to that range,
    /// or null if the range touches no faulty chunk.
    fn faulty_sectors(
        comptime chunk_count: usize,
        comptime chunk_size: usize,
        comptime zone: vsr.Zone,
        faulty_chunks: *const std.StaticBitSet(chunk_count),
        offset_in_zone: u64,
        size: u64,
    ) ?SectorRange {
        var fault_start: ?usize = null;
        var fault_count: usize = 0;

        // Scan only the chunks that the queried byte range overlaps.
        var chunk: usize = @divFloor(offset_in_zone, chunk_size);
        while (chunk * chunk_size < offset_in_zone + size) : (chunk += 1) {
            if (faulty_chunks.isSet(chunk)) {
                if (fault_start == null) fault_start = chunk;
                fault_count += 1;
            } else {
                // Stop once the first contiguous faulty run ends.
                if (fault_start != null) break;
            }
        }

        if (fault_start) |start| {
            // Clip the faulty run to the queried range. The intersection is
            // non-empty by construction (the run overlaps the scanned range),
            // so unwrapping with `.?` cannot fail here.
            return SectorRange.from_zone(
                zone,
                chunk_size * start,
                chunk_size * fault_count,
            ).intersect(SectorRange.from_zone(zone, offset_in_zone, size)).?;
        } else {
            return null;
        }
    }
};
|