tigerbeetle-node 0.10.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +302 -101
- package/dist/index.d.ts +70 -72
- package/dist/index.js +70 -72
- package/dist/index.js.map +1 -1
- package/package.json +9 -8
- package/scripts/download_node_headers.sh +14 -7
- package/src/index.ts +6 -10
- package/src/node.zig +6 -3
- package/src/tigerbeetle/scripts/benchmark.sh +4 -4
- package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
- package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
- package/src/tigerbeetle/scripts/install.sh +19 -4
- package/src/tigerbeetle/scripts/install_zig.bat +5 -1
- package/src/tigerbeetle/scripts/install_zig.sh +24 -14
- package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
- package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
- package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
- package/src/tigerbeetle/src/benchmark.zig +29 -13
- package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
- package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
- package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -257
- package/src/tigerbeetle/src/c/tb_client.h +118 -84
- package/src/tigerbeetle/src/c/tb_client.zig +88 -23
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
- package/src/tigerbeetle/src/c/test.zig +371 -1
- package/src/tigerbeetle/src/cli.zig +37 -7
- package/src/tigerbeetle/src/config.zig +58 -17
- package/src/tigerbeetle/src/demo.zig +5 -2
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
- package/src/tigerbeetle/src/ewah.zig +11 -33
- package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
- package/src/tigerbeetle/src/io/linux.zig +1 -1
- package/src/tigerbeetle/src/lsm/README.md +308 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +376 -397
- package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
- package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
- package/src/tigerbeetle/src/lsm/forest.zig +21 -447
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +414 -0
- package/src/tigerbeetle/src/lsm/grid.zig +170 -76
- package/src/tigerbeetle/src/lsm/groove.zig +197 -133
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
- package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
- package/src/tigerbeetle/src/lsm/manifest.zig +93 -180
- package/src/tigerbeetle/src/lsm/manifest_level.zig +161 -454
- package/src/tigerbeetle/src/lsm/manifest_log.zig +243 -356
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +65 -76
- package/src/tigerbeetle/src/lsm/segmented_array.zig +580 -251
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
- package/src/tigerbeetle/src/lsm/table.zig +115 -68
- package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
- package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -17
- package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
- package/src/tigerbeetle/src/lsm/test.zig +61 -56
- package/src/tigerbeetle/src/lsm/tree.zig +450 -407
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +461 -0
- package/src/tigerbeetle/src/main.zig +83 -8
- package/src/tigerbeetle/src/message_bus.zig +20 -9
- package/src/tigerbeetle/src/message_pool.zig +22 -19
- package/src/tigerbeetle/src/ring_buffer.zig +7 -3
- package/src/tigerbeetle/src/simulator.zig +179 -119
- package/src/tigerbeetle/src/state_machine.zig +381 -246
- package/src/tigerbeetle/src/static_allocator.zig +65 -0
- package/src/tigerbeetle/src/storage.zig +3 -7
- package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
- package/src/tigerbeetle/src/test/accounting/workload.zig +823 -0
- package/src/tigerbeetle/src/test/cluster.zig +33 -81
- package/src/tigerbeetle/src/test/conductor.zig +366 -0
- package/src/tigerbeetle/src/test/fuzz.zig +121 -0
- package/src/tigerbeetle/src/test/id.zig +89 -0
- package/src/tigerbeetle/src/test/network.zig +45 -19
- package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
- package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
- package/src/tigerbeetle/src/test/state_checker.zig +91 -69
- package/src/tigerbeetle/src/test/state_machine.zig +11 -35
- package/src/tigerbeetle/src/test/storage.zig +470 -106
- package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
- package/src/tigerbeetle/src/tigerbeetle.zig +15 -16
- package/src/tigerbeetle/src/unit_tests.zig +13 -1
- package/src/tigerbeetle/src/util.zig +97 -11
- package/src/tigerbeetle/src/vopr.zig +495 -0
- package/src/tigerbeetle/src/vsr/client.zig +21 -3
- package/src/tigerbeetle/src/vsr/journal.zig +293 -212
- package/src/tigerbeetle/src/vsr/replica.zig +1086 -515
- package/src/tigerbeetle/src/vsr/superblock.zig +382 -637
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +14 -16
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +416 -153
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +62 -12
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
- package/src/tigerbeetle/src/vsr.zig +94 -60
- package/src/tigerbeetle/scripts/vopr.bat +0 -48
- package/src/tigerbeetle/scripts/vopr.sh +0 -33
- package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
- package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
//! SuperBlock invariants:
|
|
2
|
+
//!
|
|
3
|
+
//! * vsr_state
|
|
4
|
+
//! - vsr_state.commit_min is initially 0 (for a newly-formatted replica).
|
|
5
|
+
//! - vsr_state.commit_min ≤ vsr_state.commit_max
|
|
6
|
+
//! - vsr_state.view_normal ≤ vsr_state.view
|
|
7
|
+
//! - checkpoint() must advance the superblock's vsr_state.commit_min.
|
|
8
|
+
//! - view_change() must not advance the superblock's vsr_state.commit_min.
|
|
9
|
+
//! - All fields of vsr_state except commit_min_checksum are monotonically increasing over
|
|
10
|
+
//! view_change()/checkpoint().
|
|
11
|
+
//!
|
|
1
12
|
const std = @import("std");
|
|
2
13
|
const assert = std.debug.assert;
|
|
3
14
|
const crypto = std.crypto;
|
|
@@ -15,18 +26,11 @@ const MessagePool = @import("../message_pool.zig").MessagePool;
|
|
|
15
26
|
pub const SuperBlockManifest = @import("superblock_manifest.zig").Manifest;
|
|
16
27
|
pub const SuperBlockFreeSet = @import("superblock_free_set.zig").FreeSet;
|
|
17
28
|
pub const SuperBlockClientTable = @import("superblock_client_table.zig").ClientTable;
|
|
29
|
+
pub const Quorums = @import("superblock_quorums.zig").QuorumsType(.{
|
|
30
|
+
.superblock_copies = config.superblock_copies,
|
|
31
|
+
});
|
|
18
32
|
|
|
19
|
-
|
|
20
|
-
pub const Magic = enum(u8) {
|
|
21
|
-
superblock,
|
|
22
|
-
manifest,
|
|
23
|
-
prepare,
|
|
24
|
-
index,
|
|
25
|
-
filter,
|
|
26
|
-
data,
|
|
27
|
-
};
|
|
28
|
-
|
|
29
|
-
pub const SuperBlockVersion: u8 = 0;
|
|
33
|
+
pub const SuperBlockVersion: u16 = 0;
|
|
30
34
|
|
|
31
35
|
// Fields are aligned to work as an extern or packed struct.
|
|
32
36
|
pub const SuperBlockSector = extern struct {
|
|
@@ -38,20 +42,20 @@ pub const SuperBlockSector = extern struct {
|
|
|
38
42
|
/// This simplifies writing and comparing multiple copies.
|
|
39
43
|
copy: u8 = 0,
|
|
40
44
|
|
|
41
|
-
/// Protects against
|
|
42
|
-
|
|
45
|
+
/// Protects against writing to or reading from the wrong data file.
|
|
46
|
+
replica: u8,
|
|
43
47
|
|
|
44
48
|
/// The version of the superblock format in use, reserved for major breaking changes.
|
|
45
|
-
version:
|
|
49
|
+
version: u16,
|
|
46
50
|
|
|
47
51
|
/// Protects against writing to or reading from the wrong data file.
|
|
48
|
-
replica: u8,
|
|
49
52
|
cluster: u32,
|
|
50
53
|
|
|
51
54
|
/// The current size of the data file.
|
|
52
55
|
size: u64,
|
|
53
56
|
|
|
54
57
|
/// The maximum size of the data file.
|
|
58
|
+
// TODO Actually limit the file to this size.
|
|
55
59
|
size_max: u64,
|
|
56
60
|
|
|
57
61
|
/// A monotonically increasing counter to locate the latest superblock at startup.
|
|
@@ -92,9 +96,12 @@ pub const SuperBlockSector = extern struct {
|
|
|
92
96
|
/// The size of the client table entries stored in the superblock trailer.
|
|
93
97
|
client_table_size: u32,
|
|
94
98
|
|
|
95
|
-
reserved: [
|
|
99
|
+
reserved: [3148]u8 = [_]u8{0} ** 3148,
|
|
96
100
|
|
|
97
101
|
pub const VSRState = extern struct {
|
|
102
|
+
/// The vsr.Header.checksum of commit_min's message.
|
|
103
|
+
commit_min_checksum: u128,
|
|
104
|
+
|
|
98
105
|
/// The last operation committed to the state machine. At startup, replay the log hereafter.
|
|
99
106
|
commit_min: u64,
|
|
100
107
|
|
|
@@ -107,8 +114,22 @@ pub const SuperBlockSector = extern struct {
|
|
|
107
114
|
/// The view number of the replica.
|
|
108
115
|
view: u32,
|
|
109
116
|
|
|
117
|
+
reserved: [8]u8 = [_]u8{0} ** 8,
|
|
118
|
+
|
|
110
119
|
comptime {
|
|
111
|
-
assert(@sizeOf(VSRState) ==
|
|
120
|
+
assert(@sizeOf(VSRState) == 48);
|
|
121
|
+
// Assert that there is no implicit padding in the struct.
|
|
122
|
+
assert(@bitSizeOf(VSRState) == @sizeOf(VSRState) * 8);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
pub fn root(cluster: u32) VSRState {
|
|
126
|
+
return .{
|
|
127
|
+
.commit_min_checksum = vsr.Header.root_prepare(cluster).checksum,
|
|
128
|
+
.commit_min = 0,
|
|
129
|
+
.commit_max = 0,
|
|
130
|
+
.view_normal = 0,
|
|
131
|
+
.view = 0,
|
|
132
|
+
};
|
|
112
133
|
}
|
|
113
134
|
|
|
114
135
|
pub fn internally_consistent(state: VSRState) bool {
|
|
@@ -118,6 +139,10 @@ pub const SuperBlockSector = extern struct {
|
|
|
118
139
|
pub fn monotonic(old: VSRState, new: VSRState) bool {
|
|
119
140
|
assert(old.internally_consistent());
|
|
120
141
|
assert(new.internally_consistent());
|
|
142
|
+
// The last case is for when checking monotonic() from the sequence=0 sector.
|
|
143
|
+
assert(old.commit_min != new.commit_min or
|
|
144
|
+
old.commit_min_checksum == new.commit_min_checksum or
|
|
145
|
+
(old.commit_min_checksum == 0 and old.commit_min == 0));
|
|
121
146
|
|
|
122
147
|
if (old.view > new.view) return false;
|
|
123
148
|
if (old.view_normal > new.view_normal) return false;
|
|
@@ -137,6 +162,16 @@ pub const SuperBlockSector = extern struct {
|
|
|
137
162
|
assert(state.would_be_updated_by(new));
|
|
138
163
|
state.* = new;
|
|
139
164
|
}
|
|
165
|
+
|
|
166
|
+
/// Compaction is one bar ahead of superblock's commit_min.
|
|
167
|
+
/// The commits from the bar following commit_min were in the mutable table, and
|
|
168
|
+
/// thus not preserved in the checkpoint.
|
|
169
|
+
/// But the corresponding `compact()` updates were preserved, and must not be repeated
|
|
170
|
+
/// to ensure deterministic storage.
|
|
171
|
+
pub fn op_compacted(state: VSRState, op: u64) bool {
|
|
172
|
+
// If commit_min is 0, we have never checkpointed, so no compactions are checkpointed.
|
|
173
|
+
return state.commit_min > 0 and op <= state.commit_min + config.lsm_batch_multiple;
|
|
174
|
+
}
|
|
140
175
|
};
|
|
141
176
|
|
|
142
177
|
pub const Snapshot = extern struct {
|
|
@@ -163,11 +198,15 @@ pub const SuperBlockSector = extern struct {
|
|
|
163
198
|
|
|
164
199
|
comptime {
|
|
165
200
|
assert(@sizeOf(Snapshot) == 24);
|
|
201
|
+
// Assert that there is no implicit padding in the struct.
|
|
202
|
+
assert(@bitSizeOf(Snapshot) == @sizeOf(Snapshot) * 8);
|
|
166
203
|
}
|
|
167
204
|
};
|
|
168
205
|
|
|
169
206
|
comptime {
|
|
170
207
|
assert(@sizeOf(SuperBlockSector) == config.sector_size);
|
|
208
|
+
// Assert that there is no implicit padding in the struct.
|
|
209
|
+
assert(@bitSizeOf(SuperBlockSector) == @sizeOf(SuperBlockSector) * 8);
|
|
171
210
|
}
|
|
172
211
|
|
|
173
212
|
pub fn calculate_checksum(superblock: *const SuperBlockSector) u128 {
|
|
@@ -186,12 +225,13 @@ pub const SuperBlockSector = extern struct {
|
|
|
186
225
|
}
|
|
187
226
|
|
|
188
227
|
pub fn set_checksum(superblock: *SuperBlockSector) void {
|
|
189
|
-
assert(superblock.copy <
|
|
190
|
-
assert(superblock.magic == .superblock);
|
|
228
|
+
assert(superblock.copy < config.superblock_copies);
|
|
191
229
|
assert(superblock.version == SuperBlockVersion);
|
|
192
230
|
assert(superblock.flags == 0);
|
|
193
231
|
|
|
194
|
-
|
|
232
|
+
assert(@bitCast(u32, superblock.reserved[0..4].*) == 0);
|
|
233
|
+
for (mem.bytesAsSlice(u64, superblock.reserved[4..])) |word| assert(word == 0);
|
|
234
|
+
for (mem.bytesAsSlice(u64, &superblock.vsr_state.reserved)) |word| assert(word == 0);
|
|
195
235
|
|
|
196
236
|
superblock.checksum = superblock.calculate_checksum();
|
|
197
237
|
}
|
|
@@ -202,9 +242,6 @@ pub const SuperBlockSector = extern struct {
|
|
|
202
242
|
|
|
203
243
|
/// Does not consider { checksum, copy } when comparing equality.
|
|
204
244
|
pub fn equal(a: *const SuperBlockSector, b: *const SuperBlockSector) bool {
|
|
205
|
-
assert(a.magic == .superblock);
|
|
206
|
-
assert(b.magic == .superblock);
|
|
207
|
-
|
|
208
245
|
if (a.version != b.version) return false;
|
|
209
246
|
if (a.replica != b.replica) return false;
|
|
210
247
|
if (a.cluster != b.cluster) return false;
|
|
@@ -221,8 +258,13 @@ pub const SuperBlockSector = extern struct {
|
|
|
221
258
|
if (a.manifest_size != b.manifest_size) return false;
|
|
222
259
|
if (a.free_set_size != b.free_set_size) return false;
|
|
223
260
|
|
|
224
|
-
|
|
225
|
-
|
|
261
|
+
assert(@bitCast(u32, a.reserved[0..4].*) == 0);
|
|
262
|
+
assert(@bitCast(u32, b.reserved[0..4].*) == 0);
|
|
263
|
+
for (mem.bytesAsSlice(u64, a.reserved[4..])) |word| assert(word == 0);
|
|
264
|
+
for (mem.bytesAsSlice(u64, b.reserved[4..])) |word| assert(word == 0);
|
|
265
|
+
|
|
266
|
+
for (mem.bytesAsSlice(u64, &a.vsr_state.reserved)) |word| assert(word == 0);
|
|
267
|
+
for (mem.bytesAsSlice(u64, &b.vsr_state.reserved)) |word| assert(word == 0);
|
|
226
268
|
|
|
227
269
|
return true;
|
|
228
270
|
}
|
|
@@ -236,20 +278,12 @@ comptime {
|
|
|
236
278
|
}
|
|
237
279
|
|
|
238
280
|
/// The size of the entire superblock storage zone.
|
|
239
|
-
pub const superblock_zone_size =
|
|
240
|
-
|
|
241
|
-
/// A single set of copies (a copy set) consists of config.superblock_copies of a superblock.
|
|
242
|
-
/// At least two copy sets are required for copy-on-write in order not to impair existing copies.
|
|
243
|
-
///
|
|
244
|
-
/// However, when writing only the superblock sector for a view change, we do update-in-place,
|
|
245
|
-
/// which is necessary as we need to continue to reference the existing superblock trailer to
|
|
246
|
-
/// decouple view changes from checkpoints, to not force an untimely checkpoint ahead of schedule.
|
|
247
|
-
pub const superblock_copies_max = config.superblock_copies * 2;
|
|
281
|
+
pub const superblock_zone_size = superblock_copy_size * config.superblock_copies;
|
|
248
282
|
|
|
249
283
|
/// The size of an individual superblock including trailer.
|
|
250
|
-
pub const
|
|
284
|
+
pub const superblock_copy_size = @sizeOf(SuperBlockSector) + superblock_trailer_size_max;
|
|
251
285
|
comptime {
|
|
252
|
-
assert(
|
|
286
|
+
assert(superblock_copy_size % config.sector_size == 0);
|
|
253
287
|
}
|
|
254
288
|
|
|
255
289
|
/// The maximum possible size of the superblock trailer, following the superblock sector.
|
|
@@ -270,7 +304,9 @@ pub const superblock_trailer_size_max = blk: {
|
|
|
270
304
|
|
|
271
305
|
// We order the smaller manifest section ahead of the block free set for better access locality.
|
|
272
306
|
// For example, it's cheaper to skip over 1 MiB when reading from disk than to skip over 32 MiB.
|
|
273
|
-
break :blk superblock_trailer_manifest_size_max +
|
|
307
|
+
break :blk superblock_trailer_manifest_size_max +
|
|
308
|
+
superblock_trailer_free_set_size_max +
|
|
309
|
+
superblock_trailer_client_table_size_max;
|
|
274
310
|
};
|
|
275
311
|
|
|
276
312
|
// A manifest block reference of 40 bytes contains a tree hash, checksum, and address.
|
|
@@ -280,6 +316,7 @@ pub const superblock_trailer_manifest_size_max = blk: {
|
|
|
280
316
|
|
|
281
317
|
// Use a multiple of sector * reference so that the size is exactly divisible without padding:
|
|
282
318
|
// For example, this 2.5 MiB manifest trailer == 65536 references == 65536 * 511 or 34m tables.
|
|
319
|
+
// TODO Size this relative to the expected number of tables & fragmentation.
|
|
283
320
|
break :blk 16 * config.sector_size * SuperBlockManifest.BlockReferenceSize;
|
|
284
321
|
};
|
|
285
322
|
|
|
@@ -303,6 +340,33 @@ pub const data_file_size_min = blk: {
|
|
|
303
340
|
break :blk superblock_zone_size + config.journal_size_max;
|
|
304
341
|
};
|
|
305
342
|
|
|
343
|
+
/// This table shows the sequence number progression of the SuperBlock's sectors.
|
|
344
|
+
///
|
|
345
|
+
/// action working staging disk
|
|
346
|
+
/// format seq seq seq
|
|
347
|
+
/// 0 - Initially the file has no sectors.
|
|
348
|
+
/// 0 1 -
|
|
349
|
+
/// 0 1 1 Write a copyset for the first sequence.
|
|
350
|
+
/// 1 1 1 Read quorum; verify 3/4 are valid.
|
|
351
|
+
///
|
|
352
|
+
/// open seq seq seq
|
|
353
|
+
/// a
|
|
354
|
+
/// a a Read quorum; verify 2/4 are valid.
|
|
355
|
+
/// a (a) a Repair any broken copies of `a`.
|
|
356
|
+
///
|
|
357
|
+
/// checkpoint seq seq seq
|
|
358
|
+
/// a a a
|
|
359
|
+
/// a a+1
|
|
360
|
+
/// a a+1 a+1
|
|
361
|
+
/// a+1 a+1 a+1 Read quorum; verify 3/4 are valid.
|
|
362
|
+
///
|
|
363
|
+
/// view_change seq seq seq
|
|
364
|
+
/// a a
|
|
365
|
+
/// a a+1 a The new sequence reuses the original parent.
|
|
366
|
+
/// a a+1 a+1
|
|
367
|
+
/// a+1 a+1 a+1 Read quorum; verify 3/4 are valid.
|
|
368
|
+
/// working staging disk
|
|
369
|
+
///
|
|
306
370
|
pub fn SuperBlockType(comptime Storage: type) type {
|
|
307
371
|
return struct {
|
|
308
372
|
const SuperBlock = @This();
|
|
@@ -325,8 +389,11 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
325
389
|
|
|
326
390
|
write: Storage.Write = undefined,
|
|
327
391
|
read: Storage.Read = undefined,
|
|
328
|
-
|
|
329
|
-
|
|
392
|
+
read_threshold: ?Quorums.Threshold = null,
|
|
393
|
+
copy: ?u8 = null,
|
|
394
|
+
/// Used by format(), checkpoint(), and view_change().
|
|
395
|
+
vsr_state: ?SuperBlockSector.VSRState = null,
|
|
396
|
+
repairs: ?Quorums.RepairIterator = null, // Used by open().
|
|
330
397
|
};
|
|
331
398
|
|
|
332
399
|
storage: *Storage,
|
|
@@ -341,14 +408,8 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
341
408
|
working: *align(config.sector_size) SuperBlockSector,
|
|
342
409
|
|
|
343
410
|
/// The superblock that will replace the current working superblock once written.
|
|
344
|
-
/// This is used when writing the staging superblock, or when changing views before then.
|
|
345
411
|
/// We cannot mutate any working state directly until it is safely on stable storage.
|
|
346
412
|
/// Otherwise, we may accidentally externalize guarantees that are not yet durable.
|
|
347
|
-
writing: *align(config.sector_size) SuperBlockSector,
|
|
348
|
-
|
|
349
|
-
/// The superblock that will be checkpointed next.
|
|
350
|
-
/// This may be updated incrementally several times before the next checkpoint.
|
|
351
|
-
/// For example, to track new snapshots as they are registered.
|
|
352
413
|
staging: *align(config.sector_size) SuperBlockSector,
|
|
353
414
|
|
|
354
415
|
/// The copies that we read into at startup or when verifying the written superblock.
|
|
@@ -402,11 +463,8 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
402
463
|
const b = try allocator.allocAdvanced(SuperBlockSector, config.sector_size, 1, .exact);
|
|
403
464
|
errdefer allocator.free(b);
|
|
404
465
|
|
|
405
|
-
const c = try allocator.allocAdvanced(SuperBlockSector, config.sector_size, 1, .exact);
|
|
406
|
-
errdefer allocator.free(c);
|
|
407
|
-
|
|
408
466
|
const reading = try allocator.allocAdvanced(
|
|
409
|
-
[config.superblock_copies
|
|
467
|
+
[config.superblock_copies]SuperBlockSector,
|
|
410
468
|
config.sector_size,
|
|
411
469
|
1,
|
|
412
470
|
.exact,
|
|
@@ -456,8 +514,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
456
514
|
return SuperBlock{
|
|
457
515
|
.storage = storage,
|
|
458
516
|
.working = &a[0],
|
|
459
|
-
.
|
|
460
|
-
.staging = &c[0],
|
|
517
|
+
.staging = &b[0],
|
|
461
518
|
.reading = &reading[0],
|
|
462
519
|
.manifest = manifest,
|
|
463
520
|
.free_set = free_set,
|
|
@@ -469,11 +526,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
469
526
|
}
|
|
470
527
|
|
|
471
528
|
pub fn deinit(superblock: *SuperBlock, allocator: mem.Allocator) void {
|
|
472
|
-
assert(superblock.queue_head == null);
|
|
473
|
-
assert(superblock.queue_tail == null);
|
|
474
|
-
|
|
475
529
|
allocator.destroy(superblock.working);
|
|
476
|
-
allocator.destroy(superblock.writing);
|
|
477
530
|
allocator.destroy(superblock.staging);
|
|
478
531
|
allocator.free(superblock.reading);
|
|
479
532
|
|
|
@@ -503,15 +556,13 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
503
556
|
assert(!superblock.opened);
|
|
504
557
|
|
|
505
558
|
assert(options.replica < config.replicas_max);
|
|
506
|
-
|
|
507
|
-
assert(options.size_max > superblock_zone_size);
|
|
559
|
+
assert(options.size_max >= data_file_size_min);
|
|
508
560
|
assert(options.size_max % config.sector_size == 0);
|
|
509
561
|
|
|
510
562
|
// This working copy provides the parent checksum, and will not be written to disk.
|
|
511
563
|
// We therefore use zero values to make this parent checksum as stable as possible.
|
|
512
564
|
superblock.working.* = .{
|
|
513
565
|
.copy = 0,
|
|
514
|
-
.magic = .superblock,
|
|
515
566
|
.version = SuperBlockVersion,
|
|
516
567
|
.sequence = 0,
|
|
517
568
|
.replica = options.replica,
|
|
@@ -523,6 +574,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
523
574
|
.free_set_checksum = 0,
|
|
524
575
|
.client_table_checksum = 0,
|
|
525
576
|
.vsr_state = .{
|
|
577
|
+
.commit_min_checksum = 0,
|
|
526
578
|
.commit_min = 0,
|
|
527
579
|
.commit_max = 0,
|
|
528
580
|
.view_normal = 0,
|
|
@@ -542,15 +594,11 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
542
594
|
|
|
543
595
|
superblock.working.set_checksum();
|
|
544
596
|
|
|
545
|
-
superblock.staging.* = superblock.working.*;
|
|
546
|
-
superblock.staging.sequence = superblock.working.sequence + 1;
|
|
547
|
-
superblock.staging.parent = superblock.working.checksum;
|
|
548
|
-
|
|
549
597
|
context.* = .{
|
|
550
598
|
.superblock = superblock,
|
|
551
599
|
.callback = callback,
|
|
552
600
|
.caller = .format,
|
|
553
|
-
.
|
|
601
|
+
.vsr_state = SuperBlockSector.VSRState.root(options.cluster),
|
|
554
602
|
};
|
|
555
603
|
|
|
556
604
|
// TODO At a higher layer, we must:
|
|
@@ -576,23 +624,33 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
576
624
|
superblock.acquire(context);
|
|
577
625
|
}
|
|
578
626
|
|
|
627
|
+
/// The vsr_state must update the commit_min and commit_min_checksum.
|
|
628
|
+
// TODO Will the replica ever update view/view_normal by calling checkpoint() during a view
|
|
629
|
+
// change? If not, forbid it.
|
|
579
630
|
pub fn checkpoint(
|
|
580
631
|
superblock: *SuperBlock,
|
|
581
632
|
callback: fn (context: *Context) void,
|
|
582
633
|
context: *Context,
|
|
634
|
+
vsr_state: SuperBlockSector.VSRState,
|
|
583
635
|
) void {
|
|
584
636
|
assert(superblock.opened);
|
|
637
|
+
// Checkpoint must advance commit_min, but never the view.
|
|
638
|
+
assert(superblock.staging.vsr_state.would_be_updated_by(vsr_state));
|
|
639
|
+
assert(superblock.staging.vsr_state.commit_min < vsr_state.commit_min);
|
|
640
|
+
assert(superblock.staging.vsr_state.commit_min_checksum !=
|
|
641
|
+
vsr_state.commit_min_checksum);
|
|
585
642
|
|
|
586
643
|
context.* = .{
|
|
587
644
|
.superblock = superblock,
|
|
588
645
|
.callback = callback,
|
|
589
646
|
.caller = .checkpoint,
|
|
590
|
-
.
|
|
647
|
+
.vsr_state = vsr_state,
|
|
591
648
|
};
|
|
592
649
|
|
|
593
650
|
superblock.acquire(context);
|
|
594
651
|
}
|
|
595
652
|
|
|
653
|
+
/// The vsr_state must not update the `commit_min` or `commit_min_checksum`.
|
|
596
654
|
pub fn view_change(
|
|
597
655
|
superblock: *SuperBlock,
|
|
598
656
|
callback: fn (context: *Context) void,
|
|
@@ -600,20 +658,28 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
600
658
|
vsr_state: SuperBlockSector.VSRState,
|
|
601
659
|
) void {
|
|
602
660
|
assert(superblock.opened);
|
|
661
|
+
assert(vsr_state.commit_min == superblock.staging.vsr_state.commit_min);
|
|
662
|
+
assert(vsr_state.commit_min_checksum ==
|
|
663
|
+
superblock.staging.vsr_state.commit_min_checksum);
|
|
664
|
+
assert(superblock.staging.vsr_state.monotonic(vsr_state));
|
|
603
665
|
|
|
604
666
|
log.debug(
|
|
605
|
-
"view_change:
|
|
667
|
+
"view_change: commit_min_checksum={}..{} commit_min={}..{} commit_max={}..{} " ++
|
|
668
|
+
"view_normal={}..{} view={}..{}",
|
|
606
669
|
.{
|
|
607
|
-
superblock.
|
|
670
|
+
superblock.staging.vsr_state.commit_min_checksum,
|
|
671
|
+
vsr_state.commit_min_checksum,
|
|
672
|
+
|
|
673
|
+
superblock.staging.vsr_state.commit_min,
|
|
608
674
|
vsr_state.commit_min,
|
|
609
675
|
|
|
610
|
-
superblock.
|
|
676
|
+
superblock.staging.vsr_state.commit_max,
|
|
611
677
|
vsr_state.commit_max,
|
|
612
678
|
|
|
613
|
-
superblock.
|
|
679
|
+
superblock.staging.vsr_state.view_normal,
|
|
614
680
|
vsr_state.view_normal,
|
|
615
681
|
|
|
616
|
-
superblock.
|
|
682
|
+
superblock.staging.vsr_state.view,
|
|
617
683
|
vsr_state.view,
|
|
618
684
|
},
|
|
619
685
|
);
|
|
@@ -624,14 +690,10 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
624
690
|
.superblock = superblock,
|
|
625
691
|
.callback = callback,
|
|
626
692
|
.caller = .view_change,
|
|
627
|
-
.copy = undefined,
|
|
628
693
|
.vsr_state = vsr_state,
|
|
629
694
|
};
|
|
630
695
|
|
|
631
|
-
|
|
632
|
-
assert(meta.eql(superblock.working.vsr_state, superblock.staging.vsr_state));
|
|
633
|
-
|
|
634
|
-
if (!superblock.working.vsr_state.would_be_updated_by(context.vsr_state)) {
|
|
696
|
+
if (!superblock.staging.vsr_state.would_be_updated_by(context.vsr_state.?)) {
|
|
635
697
|
log.debug("view_change: no change", .{});
|
|
636
698
|
callback(context);
|
|
637
699
|
return;
|
|
@@ -657,34 +719,32 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
657
719
|
}
|
|
658
720
|
|
|
659
721
|
fn write_staging(superblock: *SuperBlock, context: *Context) void {
|
|
660
|
-
assert(context.caller
|
|
722
|
+
assert(context.caller != .open);
|
|
661
723
|
assert(context.caller == .format or superblock.opened);
|
|
724
|
+
assert(context.copy == null);
|
|
725
|
+
assert(context.vsr_state.?.internally_consistent());
|
|
662
726
|
assert(superblock.queue_head == context);
|
|
663
727
|
assert(superblock.queue_tail == null);
|
|
728
|
+
assert(superblock.working.vsr_state.would_be_updated_by(context.vsr_state.?));
|
|
664
729
|
|
|
665
|
-
superblock.
|
|
666
|
-
superblock.
|
|
667
|
-
superblock.
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
superblock.staging.
|
|
676
|
-
superblock.staging.parent = superblock.writing.checksum;
|
|
677
|
-
|
|
678
|
-
assert(superblock.writing.manifest_checksum == superblock.staging.manifest_checksum);
|
|
679
|
-
assert(superblock.writing.free_set_checksum == superblock.staging.free_set_checksum);
|
|
680
|
-
assert(superblock.writing.client_table_checksum == superblock.staging.client_table_checksum);
|
|
681
|
-
|
|
682
|
-
assert(superblock.writing.manifest_size == superblock.staging.manifest_size);
|
|
683
|
-
assert(superblock.writing.free_set_size == superblock.staging.free_set_size);
|
|
684
|
-
assert(superblock.writing.client_table_size == superblock.staging.client_table_size);
|
|
730
|
+
superblock.staging.* = superblock.working.*;
|
|
731
|
+
superblock.staging.sequence = superblock.staging.sequence + 1;
|
|
732
|
+
superblock.staging.parent = superblock.staging.checksum;
|
|
733
|
+
superblock.staging.vsr_state.update(context.vsr_state.?);
|
|
734
|
+
|
|
735
|
+
if (context.caller != .view_change) {
|
|
736
|
+
superblock.write_staging_encode_manifest();
|
|
737
|
+
superblock.write_staging_encode_free_set();
|
|
738
|
+
superblock.write_staging_encode_client_table();
|
|
739
|
+
}
|
|
740
|
+
superblock.staging.set_checksum();
|
|
685
741
|
|
|
686
|
-
context.copy =
|
|
687
|
-
|
|
742
|
+
context.copy = 0;
|
|
743
|
+
if (context.caller == .view_change) {
|
|
744
|
+
superblock.write_sector(context);
|
|
745
|
+
} else {
|
|
746
|
+
superblock.write_manifest(context);
|
|
747
|
+
}
|
|
688
748
|
}
|
|
689
749
|
|
|
690
750
|
fn write_staging_encode_manifest(superblock: *SuperBlock) void {
|
|
@@ -710,6 +770,8 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
710
770
|
if (superblock.free_set.highest_address_acquired()) |address| {
|
|
711
771
|
staging.size += address * config.block_size;
|
|
712
772
|
}
|
|
773
|
+
assert(staging.size >= data_file_size_min);
|
|
774
|
+
assert(staging.size <= staging.size_max);
|
|
713
775
|
|
|
714
776
|
staging.free_set_size = @intCast(u32, superblock.free_set.encode(target));
|
|
715
777
|
staging.free_set_checksum = vsr.checksum(target[0..staging.free_set_size]);
|
|
@@ -723,54 +785,25 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
723
785
|
staging.client_table_checksum = vsr.checksum(target[0..staging.client_table_size]);
|
|
724
786
|
}
|
|
725
787
|
|
|
726
|
-
fn write_view_change(superblock: *SuperBlock, context: *Context) void {
|
|
727
|
-
assert(context.caller == .view_change);
|
|
728
|
-
assert(superblock.opened);
|
|
729
|
-
assert(superblock.queue_head == context);
|
|
730
|
-
assert(superblock.queue_tail == null);
|
|
731
|
-
assert(context.vsr_state.internally_consistent());
|
|
732
|
-
assert(meta.eql(superblock.working.vsr_state, superblock.staging.vsr_state));
|
|
733
|
-
assert(superblock.working.vsr_state.would_be_updated_by(context.vsr_state));
|
|
734
|
-
|
|
735
|
-
superblock.writing.* = superblock.working.*;
|
|
736
|
-
|
|
737
|
-
// We cannot increment the sequence number when writing only the superblock sector as
|
|
738
|
-
// this would write the sector to another copy set with different superblock trailers.
|
|
739
|
-
// Instead, we increment twice so that the sector remains in the same copy set.
|
|
740
|
-
superblock.writing.sequence += 2;
|
|
741
|
-
assert(superblock.writing.parent == superblock.working.parent);
|
|
742
|
-
|
|
743
|
-
superblock.writing.vsr_state.update(context.vsr_state);
|
|
744
|
-
superblock.staging.vsr_state.update(context.vsr_state);
|
|
745
|
-
|
|
746
|
-
superblock.writing.set_checksum();
|
|
747
|
-
|
|
748
|
-
superblock.staging.sequence = superblock.writing.sequence + 1;
|
|
749
|
-
superblock.staging.parent = superblock.writing.checksum;
|
|
750
|
-
|
|
751
|
-
context.copy = starting_copy_for_sequence(superblock.writing.sequence);
|
|
752
|
-
superblock.write_sector(context);
|
|
753
|
-
}
|
|
754
|
-
|
|
755
788
|
fn write_manifest(superblock: *SuperBlock, context: *Context) void {
|
|
756
789
|
assert(superblock.queue_head == context);
|
|
757
790
|
|
|
758
|
-
const size = vsr.sector_ceil(superblock.
|
|
791
|
+
const size = vsr.sector_ceil(superblock.staging.manifest_size);
|
|
759
792
|
assert(size <= superblock_trailer_manifest_size_max);
|
|
760
793
|
|
|
761
794
|
const buffer = superblock.manifest_buffer[0..size];
|
|
762
|
-
const offset = offset_manifest(context.copy
|
|
795
|
+
const offset = Layout.offset_manifest(context.copy.?);
|
|
763
796
|
|
|
764
|
-
mem.set(u8, buffer[superblock.
|
|
797
|
+
mem.set(u8, buffer[superblock.staging.manifest_size..], 0); // Zero sector padding.
|
|
765
798
|
|
|
766
|
-
assert(superblock.
|
|
767
|
-
superblock.manifest_buffer[0..superblock.
|
|
799
|
+
assert(superblock.staging.manifest_checksum == vsr.checksum(
|
|
800
|
+
superblock.manifest_buffer[0..superblock.staging.manifest_size],
|
|
768
801
|
));
|
|
769
802
|
|
|
770
803
|
log.debug("{s}: write_manifest: checksum={x} size={} offset={}", .{
|
|
771
804
|
@tagName(context.caller),
|
|
772
|
-
superblock.
|
|
773
|
-
superblock.
|
|
805
|
+
superblock.staging.manifest_checksum,
|
|
806
|
+
superblock.staging.manifest_size,
|
|
774
807
|
offset,
|
|
775
808
|
});
|
|
776
809
|
|
|
@@ -798,22 +831,22 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
798
831
|
fn write_free_set(superblock: *SuperBlock, context: *Context) void {
|
|
799
832
|
assert(superblock.queue_head == context);
|
|
800
833
|
|
|
801
|
-
const size = vsr.sector_ceil(superblock.
|
|
834
|
+
const size = vsr.sector_ceil(superblock.staging.free_set_size);
|
|
802
835
|
assert(size <= superblock_trailer_free_set_size_max);
|
|
803
836
|
|
|
804
837
|
const buffer = superblock.free_set_buffer[0..size];
|
|
805
|
-
const offset = offset_free_set(context.copy
|
|
838
|
+
const offset = Layout.offset_free_set(context.copy.?);
|
|
806
839
|
|
|
807
|
-
mem.set(u8, buffer[superblock.
|
|
840
|
+
mem.set(u8, buffer[superblock.staging.free_set_size..], 0); // Zero sector padding.
|
|
808
841
|
|
|
809
|
-
assert(superblock.
|
|
810
|
-
superblock.free_set_buffer[0..superblock.
|
|
842
|
+
assert(superblock.staging.free_set_checksum == vsr.checksum(
|
|
843
|
+
superblock.free_set_buffer[0..superblock.staging.free_set_size],
|
|
811
844
|
));
|
|
812
845
|
|
|
813
846
|
log.debug("{s}: write_free_set: checksum={x} size={} offset={}", .{
|
|
814
847
|
@tagName(context.caller),
|
|
815
|
-
superblock.
|
|
816
|
-
superblock.
|
|
848
|
+
superblock.staging.free_set_checksum,
|
|
849
|
+
superblock.staging.free_set_size,
|
|
817
850
|
offset,
|
|
818
851
|
});
|
|
819
852
|
|
|
@@ -841,22 +874,22 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
841
874
|
fn write_client_table(superblock: *SuperBlock, context: *Context) void {
|
|
842
875
|
assert(superblock.queue_head == context);
|
|
843
876
|
|
|
844
|
-
const size = vsr.sector_ceil(superblock.
|
|
877
|
+
const size = vsr.sector_ceil(superblock.staging.client_table_size);
|
|
845
878
|
assert(size <= superblock_trailer_client_table_size_max);
|
|
846
879
|
|
|
847
880
|
const buffer = superblock.client_table_buffer[0..size];
|
|
848
|
-
const offset = offset_client_table(context.copy
|
|
881
|
+
const offset = Layout.offset_client_table(context.copy.?);
|
|
849
882
|
|
|
850
|
-
mem.set(u8, buffer[superblock.
|
|
883
|
+
mem.set(u8, buffer[superblock.staging.client_table_size..], 0); // Zero sector padding.
|
|
851
884
|
|
|
852
|
-
assert(superblock.
|
|
853
|
-
superblock.client_table_buffer[0..superblock.
|
|
885
|
+
assert(superblock.staging.client_table_checksum == vsr.checksum(
|
|
886
|
+
superblock.client_table_buffer[0..superblock.staging.client_table_size],
|
|
854
887
|
));
|
|
855
888
|
|
|
856
889
|
log.debug("{s}: write_client_table: checksum={x} size={} offset={}", .{
|
|
857
890
|
@tagName(context.caller),
|
|
858
|
-
superblock.
|
|
859
|
-
superblock.
|
|
891
|
+
superblock.staging.client_table_checksum,
|
|
892
|
+
superblock.staging.client_table_size,
|
|
860
893
|
offset,
|
|
861
894
|
});
|
|
862
895
|
|
|
@@ -884,42 +917,42 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
884
917
|
fn write_sector(superblock: *SuperBlock, context: *Context) void {
|
|
885
918
|
assert(superblock.queue_head == context);
|
|
886
919
|
|
|
887
|
-
// We
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
920
|
+
// We update the working superblock for a checkpoint/format/view_change:
|
|
921
|
+
// open() does not update the working superblock, since it only writes to repair.
|
|
922
|
+
if (context.caller == .open) {
|
|
923
|
+
assert(superblock.staging.sequence == superblock.working.sequence);
|
|
924
|
+
} else {
|
|
925
|
+
assert(superblock.staging.sequence == superblock.working.sequence + 1);
|
|
926
|
+
assert(superblock.staging.parent == superblock.working.checksum);
|
|
927
|
+
}
|
|
895
928
|
|
|
896
929
|
// The superblock cluster and replica should never change once formatted:
|
|
897
|
-
assert(superblock.
|
|
898
|
-
assert(superblock.
|
|
899
|
-
|
|
900
|
-
assert(superblock.
|
|
930
|
+
assert(superblock.staging.cluster == superblock.working.cluster);
|
|
931
|
+
assert(superblock.staging.replica == superblock.working.replica);
|
|
932
|
+
|
|
933
|
+
assert(superblock.staging.size >= data_file_size_min);
|
|
934
|
+
assert(superblock.staging.size <= superblock.staging.size_max);
|
|
901
935
|
|
|
902
|
-
assert(context.copy <
|
|
903
|
-
|
|
904
|
-
assert(context.copy <= stopping_copy_for_sequence(superblock.writing.sequence));
|
|
905
|
-
superblock.writing.copy = context.copy;
|
|
936
|
+
assert(context.copy.? < config.superblock_copies);
|
|
937
|
+
superblock.staging.copy = context.copy.?;
|
|
906
938
|
|
|
907
939
|
// Updating the copy number should not affect the checksum, which was previously set:
|
|
908
|
-
assert(superblock.
|
|
940
|
+
assert(superblock.staging.valid_checksum());
|
|
909
941
|
|
|
910
|
-
const buffer = mem.asBytes(superblock.
|
|
911
|
-
const offset =
|
|
942
|
+
const buffer = mem.asBytes(superblock.staging);
|
|
943
|
+
const offset = Layout.offset_sector(context.copy.?);
|
|
912
944
|
|
|
913
|
-
log.debug("{s}: write_sector: checksum={x} sequence={} copy={} size={} offset={}", .{
|
|
945
|
+
log.debug("{}: {s}: write_sector: checksum={x} sequence={} copy={} size={} offset={}", .{
|
|
946
|
+
superblock.staging.replica,
|
|
914
947
|
@tagName(context.caller),
|
|
915
|
-
superblock.
|
|
916
|
-
superblock.
|
|
917
|
-
context.copy
|
|
948
|
+
superblock.staging.checksum,
|
|
949
|
+
superblock.staging.sequence,
|
|
950
|
+
context.copy.?,
|
|
918
951
|
buffer.len,
|
|
919
952
|
offset,
|
|
920
953
|
});
|
|
921
954
|
|
|
922
|
-
superblock.assert_bounds(offset, buffer.len
|
|
955
|
+
superblock.assert_bounds(offset, buffer.len);
|
|
923
956
|
|
|
924
957
|
superblock.storage.write_sectors(
|
|
925
958
|
write_sector_callback,
|
|
@@ -933,25 +966,24 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
933
966
|
fn write_sector_callback(write: *Storage.Write) void {
|
|
934
967
|
const context = @fieldParentPtr(Context, "write", write);
|
|
935
968
|
const superblock = context.superblock;
|
|
969
|
+
const copy = context.copy.?;
|
|
936
970
|
|
|
937
971
|
assert(superblock.queue_head == context);
|
|
938
972
|
|
|
939
|
-
assert(
|
|
940
|
-
assert(
|
|
941
|
-
assert(context.copy <= stopping_copy_for_sequence(superblock.writing.sequence));
|
|
942
|
-
assert(context.copy == superblock.writing.copy);
|
|
973
|
+
assert(copy < config.superblock_copies);
|
|
974
|
+
assert(copy == superblock.staging.copy);
|
|
943
975
|
|
|
944
|
-
if (context.
|
|
945
|
-
|
|
946
|
-
|
|
976
|
+
if (context.caller == .open) {
|
|
977
|
+
context.copy = null;
|
|
978
|
+
superblock.repair(context);
|
|
979
|
+
return;
|
|
980
|
+
}
|
|
947
981
|
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
superblock.read_working(context);
|
|
952
|
-
}
|
|
982
|
+
if (copy + 1 == config.superblock_copies) {
|
|
983
|
+
context.copy = null;
|
|
984
|
+
superblock.read_working(context, .verify);
|
|
953
985
|
} else {
|
|
954
|
-
context.copy
|
|
986
|
+
context.copy = copy + 1;
|
|
955
987
|
|
|
956
988
|
switch (context.caller) {
|
|
957
989
|
.open => unreachable,
|
|
@@ -961,34 +993,42 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
961
993
|
}
|
|
962
994
|
}
|
|
963
995
|
|
|
964
|
-
fn read_working(
|
|
996
|
+
fn read_working(
|
|
997
|
+
superblock: *SuperBlock,
|
|
998
|
+
context: *Context,
|
|
999
|
+
threshold: Quorums.Threshold,
|
|
1000
|
+
) void {
|
|
965
1001
|
assert(superblock.queue_head == context);
|
|
1002
|
+
assert(context.copy == null);
|
|
1003
|
+
assert(context.read_threshold == null);
|
|
966
1004
|
|
|
967
1005
|
// We do not submit reads in parallel, as while this would shave off 1ms, it would also
|
|
968
1006
|
// increase the risk that a single fault applies to more reads due to temporal locality.
|
|
969
1007
|
// This would make verification reads more flaky when we do experience a read fault.
|
|
970
1008
|
// See "An Analysis of Data Corruption in the Storage Stack".
|
|
971
1009
|
|
|
972
|
-
context.copy = 0;
|
|
1010
|
+
context.copy = 0;
|
|
1011
|
+
context.read_threshold = threshold;
|
|
973
1012
|
for (superblock.reading) |*copy| copy.* = undefined;
|
|
974
1013
|
superblock.read_sector(context);
|
|
975
1014
|
}
|
|
976
1015
|
|
|
977
1016
|
fn read_sector(superblock: *SuperBlock, context: *Context) void {
|
|
978
1017
|
assert(superblock.queue_head == context);
|
|
979
|
-
assert(context.copy <
|
|
1018
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1019
|
+
assert(context.read_threshold != null);
|
|
980
1020
|
|
|
981
|
-
const buffer = mem.asBytes(&superblock.reading[context.copy]);
|
|
982
|
-
const offset =
|
|
1021
|
+
const buffer = mem.asBytes(&superblock.reading[context.copy.?]);
|
|
1022
|
+
const offset = Layout.offset_sector(context.copy.?);
|
|
983
1023
|
|
|
984
1024
|
log.debug("{s}: read_sector: copy={} size={} offset={}", .{
|
|
985
1025
|
@tagName(context.caller),
|
|
986
|
-
context.copy
|
|
1026
|
+
context.copy.?,
|
|
987
1027
|
buffer.len,
|
|
988
1028
|
offset,
|
|
989
1029
|
});
|
|
990
1030
|
|
|
991
|
-
superblock.assert_bounds(offset, buffer.len
|
|
1031
|
+
superblock.assert_bounds(offset, buffer.len);
|
|
992
1032
|
|
|
993
1033
|
superblock.storage.read_sectors(
|
|
994
1034
|
read_sector_callback,
|
|
@@ -1002,96 +1042,109 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1002
1042
|
fn read_sector_callback(read: *Storage.Read) void {
|
|
1003
1043
|
const context = @fieldParentPtr(Context, "read", read);
|
|
1004
1044
|
const superblock = context.superblock;
|
|
1045
|
+
const threshold = context.read_threshold.?;
|
|
1005
1046
|
|
|
1006
1047
|
assert(superblock.queue_head == context);
|
|
1007
1048
|
|
|
1008
|
-
assert(context.copy <
|
|
1009
|
-
if (context.copy
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
.format, .checkpoint, .view_change => {
|
|
1015
|
-
if (working.checksum != superblock.writing.checksum) {
|
|
1016
|
-
@panic("superblock failed verification after writing");
|
|
1017
|
-
}
|
|
1018
|
-
assert(working.equal(superblock.writing));
|
|
1019
|
-
assert(superblock.staging.sequence == working.sequence + 1);
|
|
1020
|
-
assert(superblock.staging.parent == working.checksum);
|
|
1021
|
-
},
|
|
1022
|
-
.open => {
|
|
1023
|
-
superblock.staging.* = working.*;
|
|
1024
|
-
superblock.staging.sequence = working.sequence + 1;
|
|
1025
|
-
superblock.staging.parent = working.checksum;
|
|
1026
|
-
},
|
|
1027
|
-
}
|
|
1049
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1050
|
+
if (context.copy.? + 1 != config.superblock_copies) {
|
|
1051
|
+
context.copy = context.copy.? + 1;
|
|
1052
|
+
superblock.read_sector(context);
|
|
1053
|
+
return;
|
|
1054
|
+
}
|
|
1028
1055
|
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
superblock
|
|
1056
|
+
context.read_threshold = null;
|
|
1057
|
+
context.copy = null;
|
|
1058
|
+
|
|
1059
|
+
if (superblock.quorums.working(superblock.reading, threshold)) |quorum| {
|
|
1060
|
+
assert(quorum.valid);
|
|
1061
|
+
assert(quorum.copies.count() >= threshold.count());
|
|
1062
|
+
|
|
1063
|
+
const working = quorum.sector;
|
|
1064
|
+
if (threshold == .verify) {
|
|
1065
|
+
if (working.checksum != superblock.staging.checksum) {
|
|
1066
|
+
@panic("superblock failed verification after writing");
|
|
1040
1067
|
}
|
|
1068
|
+
assert(working.equal(superblock.staging));
|
|
1069
|
+
}
|
|
1041
1070
|
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1071
|
+
if (context.caller == .format) {
|
|
1072
|
+
assert(working.sequence == 1);
|
|
1073
|
+
assert(working.size == data_file_size_min);
|
|
1074
|
+
assert(working.manifest_size == 0);
|
|
1075
|
+
assert(working.free_set_size == 8);
|
|
1076
|
+
assert(working.client_table_size == 4);
|
|
1077
|
+
assert(working.vsr_state.commit_min_checksum ==
|
|
1078
|
+
vsr.Header.root_prepare(working.cluster).checksum);
|
|
1079
|
+
assert(working.vsr_state.commit_min == 0);
|
|
1080
|
+
assert(working.vsr_state.commit_max == 0);
|
|
1081
|
+
assert(working.vsr_state.view_normal == 0);
|
|
1082
|
+
assert(working.vsr_state.view == 0);
|
|
1083
|
+
} else if (context.caller == .checkpoint) {
|
|
1084
|
+
superblock.free_set.checkpoint();
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
superblock.working.* = working.*;
|
|
1088
|
+
superblock.staging.* = working.*;
|
|
1089
|
+
log.debug(
|
|
1090
|
+
"{s}: installed working superblock: checksum={x} sequence={} cluster={} " ++
|
|
1091
|
+
"replica={} size={} " ++
|
|
1092
|
+
"commit_min_checksum={} commit_min={} commit_max={} " ++
|
|
1093
|
+
"view_normal={} view={}",
|
|
1094
|
+
.{
|
|
1095
|
+
@tagName(context.caller),
|
|
1096
|
+
superblock.working.checksum,
|
|
1097
|
+
superblock.working.sequence,
|
|
1098
|
+
superblock.working.cluster,
|
|
1099
|
+
superblock.working.replica,
|
|
1100
|
+
superblock.working.size,
|
|
1101
|
+
superblock.working.vsr_state.commit_min_checksum,
|
|
1102
|
+
superblock.working.vsr_state.commit_min,
|
|
1103
|
+
superblock.working.vsr_state.commit_max,
|
|
1104
|
+
superblock.working.vsr_state.view_normal,
|
|
1105
|
+
superblock.working.vsr_state.view,
|
|
1106
|
+
},
|
|
1107
|
+
);
|
|
1108
|
+
|
|
1109
|
+
if (context.caller == .open) {
|
|
1110
|
+
if (context.repairs) |_| {
|
|
1111
|
+
// We just verified that the repair completed.
|
|
1112
|
+
assert(threshold == .verify);
|
|
1065
1113
|
superblock.release(context);
|
|
1114
|
+
} else {
|
|
1115
|
+
assert(threshold == .open);
|
|
1116
|
+
context.copy = 0;
|
|
1117
|
+
context.repairs = quorum.repairs();
|
|
1118
|
+
superblock.read_manifest(context);
|
|
1066
1119
|
}
|
|
1067
|
-
} else
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
error.ParentNotFound => @panic("superblock parent not found"),
|
|
1071
|
-
error.ParentQuorumLost => @panic("superblock parent quorum lost"),
|
|
1072
|
-
error.VSRStateNotMonotonic => @panic("superblock vsr state not monotonic"),
|
|
1073
|
-
error.SequenceNotMonotonic => @panic("superblock sequence not monotonic"),
|
|
1120
|
+
} else {
|
|
1121
|
+
// TODO Consider calling TRIM() on Grid's free suffix after checkpointing.
|
|
1122
|
+
superblock.release(context);
|
|
1074
1123
|
}
|
|
1075
|
-
} else {
|
|
1076
|
-
|
|
1077
|
-
|
|
1124
|
+
} else |err| switch (err) {
|
|
1125
|
+
error.Fork => @panic("superblock forked"),
|
|
1126
|
+
error.NotFound => @panic("superblock not found"),
|
|
1127
|
+
error.QuorumLost => @panic("superblock quorum lost"),
|
|
1128
|
+
error.ParentNotConnected => @panic("superblock parent not connected"),
|
|
1129
|
+
error.ParentSkipped => @panic("superblock parent superseded"),
|
|
1130
|
+
error.VSRStateNotMonotonic => @panic("superblock vsr state not monotonic"),
|
|
1078
1131
|
}
|
|
1079
1132
|
}
|
|
1080
1133
|
|
|
1081
1134
|
fn read_manifest(superblock: *SuperBlock, context: *Context) void {
|
|
1082
1135
|
assert(context.caller == .open);
|
|
1083
1136
|
assert(superblock.queue_head == context);
|
|
1084
|
-
assert(context.copy <
|
|
1137
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1085
1138
|
|
|
1086
1139
|
const size = vsr.sector_ceil(superblock.working.manifest_size);
|
|
1087
1140
|
assert(size <= superblock_trailer_manifest_size_max);
|
|
1088
1141
|
|
|
1089
1142
|
const buffer = superblock.manifest_buffer[0..size];
|
|
1090
|
-
const offset = offset_manifest(context.copy
|
|
1143
|
+
const offset = Layout.offset_manifest(context.copy.?);
|
|
1091
1144
|
|
|
1092
1145
|
log.debug("{s}: read_manifest: copy={} size={} offset={}", .{
|
|
1093
1146
|
@tagName(context.caller),
|
|
1094
|
-
context.copy
|
|
1147
|
+
context.copy.?,
|
|
1095
1148
|
buffer.len,
|
|
1096
1149
|
offset,
|
|
1097
1150
|
});
|
|
@@ -1115,6 +1168,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1115
1168
|
fn read_manifest_callback(read: *Storage.Read) void {
|
|
1116
1169
|
const context = @fieldParentPtr(Context, "read", read);
|
|
1117
1170
|
const superblock = context.superblock;
|
|
1171
|
+
const copy = context.copy.?;
|
|
1118
1172
|
|
|
1119
1173
|
assert(context.caller == .open);
|
|
1120
1174
|
assert(superblock.queue_head == context);
|
|
@@ -1133,12 +1187,13 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1133
1187
|
// TODO Repair any impaired copies before we continue.
|
|
1134
1188
|
// At present, we repair at the next checkpoint.
|
|
1135
1189
|
// We do not repair padding.
|
|
1136
|
-
context.copy =
|
|
1190
|
+
context.copy = 0;
|
|
1137
1191
|
superblock.read_free_set(context);
|
|
1138
|
-
} else if (
|
|
1192
|
+
} else if (copy + 1 == config.superblock_copies) {
|
|
1139
1193
|
@panic("superblock manifest lost");
|
|
1140
1194
|
} else {
|
|
1141
|
-
|
|
1195
|
+
log.debug("open: read_manifest: corrupt copy={}", .{copy});
|
|
1196
|
+
context.copy = copy + 1;
|
|
1142
1197
|
superblock.read_manifest(context);
|
|
1143
1198
|
}
|
|
1144
1199
|
}
|
|
@@ -1146,17 +1201,17 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1146
1201
|
fn read_free_set(superblock: *SuperBlock, context: *Context) void {
|
|
1147
1202
|
assert(context.caller == .open);
|
|
1148
1203
|
assert(superblock.queue_head == context);
|
|
1149
|
-
assert(context.copy <
|
|
1204
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1150
1205
|
|
|
1151
1206
|
const size = vsr.sector_ceil(superblock.working.free_set_size);
|
|
1152
1207
|
assert(size <= superblock_trailer_free_set_size_max);
|
|
1153
1208
|
|
|
1154
1209
|
const buffer = superblock.free_set_buffer[0..size];
|
|
1155
|
-
const offset = offset_free_set(context.copy
|
|
1210
|
+
const offset = Layout.offset_free_set(context.copy.?);
|
|
1156
1211
|
|
|
1157
1212
|
log.debug("{s}: read_free_set: copy={} size={} offset={}", .{
|
|
1158
1213
|
@tagName(context.caller),
|
|
1159
|
-
context.copy
|
|
1214
|
+
context.copy.?,
|
|
1160
1215
|
buffer.len,
|
|
1161
1216
|
offset,
|
|
1162
1217
|
});
|
|
@@ -1180,6 +1235,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1180
1235
|
fn read_free_set_callback(read: *Storage.Read) void {
|
|
1181
1236
|
const context = @fieldParentPtr(Context, "read", read);
|
|
1182
1237
|
const superblock = context.superblock;
|
|
1238
|
+
const copy = context.copy.?;
|
|
1183
1239
|
|
|
1184
1240
|
assert(context.caller == .open);
|
|
1185
1241
|
assert(superblock.queue_head == context);
|
|
@@ -1199,10 +1255,11 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1199
1255
|
|
|
1200
1256
|
// TODO Repair any impaired copies before we continue.
|
|
1201
1257
|
superblock.read_client_table(context);
|
|
1202
|
-
} else if (
|
|
1258
|
+
} else if (copy + 1 == config.superblock_copies) {
|
|
1203
1259
|
@panic("superblock free set lost");
|
|
1204
1260
|
} else {
|
|
1205
|
-
|
|
1261
|
+
log.debug("open: read_free_set: corrupt copy={}", .{copy});
|
|
1262
|
+
context.copy = copy + 1;
|
|
1206
1263
|
superblock.read_free_set(context);
|
|
1207
1264
|
}
|
|
1208
1265
|
}
|
|
@@ -1217,17 +1274,17 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1217
1274
|
fn read_client_table(superblock: *SuperBlock, context: *Context) void {
|
|
1218
1275
|
assert(context.caller == .open);
|
|
1219
1276
|
assert(superblock.queue_head == context);
|
|
1220
|
-
assert(context.copy <
|
|
1277
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1221
1278
|
|
|
1222
1279
|
const size = vsr.sector_ceil(superblock.working.client_table_size);
|
|
1223
1280
|
assert(size <= superblock_trailer_client_table_size_max);
|
|
1224
1281
|
|
|
1225
1282
|
const buffer = superblock.client_table_buffer[0..size];
|
|
1226
|
-
const offset = offset_client_table(context.copy
|
|
1283
|
+
const offset = Layout.offset_client_table(context.copy.?);
|
|
1227
1284
|
|
|
1228
1285
|
log.debug("{s}: read_client_table: copy={} size={} offset={}", .{
|
|
1229
1286
|
@tagName(context.caller),
|
|
1230
|
-
context.copy
|
|
1287
|
+
context.copy.?,
|
|
1231
1288
|
buffer.len,
|
|
1232
1289
|
offset,
|
|
1233
1290
|
});
|
|
@@ -1251,6 +1308,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1251
1308
|
fn read_client_table_callback(read: *Storage.Read) void {
|
|
1252
1309
|
const context = @fieldParentPtr(Context, "read", read);
|
|
1253
1310
|
const superblock = context.superblock;
|
|
1311
|
+
const copy = context.copy.?;
|
|
1254
1312
|
|
|
1255
1313
|
assert(context.caller == .open);
|
|
1256
1314
|
assert(superblock.queue_head == context);
|
|
@@ -1266,16 +1324,33 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1266
1324
|
config.clients_max,
|
|
1267
1325
|
});
|
|
1268
1326
|
|
|
1269
|
-
|
|
1270
|
-
superblock.
|
|
1271
|
-
} else if (
|
|
1327
|
+
context.copy = null;
|
|
1328
|
+
superblock.repair(context);
|
|
1329
|
+
} else if (copy + 1 == config.superblock_copies) {
|
|
1272
1330
|
@panic("superblock client table lost");
|
|
1273
1331
|
} else {
|
|
1274
|
-
|
|
1332
|
+
log.debug("open: read_client_table: corrupt copy={}", .{copy});
|
|
1333
|
+
context.copy = copy + 1;
|
|
1275
1334
|
superblock.read_client_table(context);
|
|
1276
1335
|
}
|
|
1277
1336
|
}
|
|
1278
1337
|
|
|
1338
|
+
fn repair(superblock: *SuperBlock, context: *Context) void {
|
|
1339
|
+
assert(context.caller == .open);
|
|
1340
|
+
assert(context.copy == null);
|
|
1341
|
+
assert(superblock.queue_head == context);
|
|
1342
|
+
|
|
1343
|
+
if (context.repairs.?.next()) |repair_copy| {
|
|
1344
|
+
context.copy = repair_copy;
|
|
1345
|
+
log.warn("repair: copy={}", .{repair_copy});
|
|
1346
|
+
|
|
1347
|
+
superblock.staging.* = superblock.working.*;
|
|
1348
|
+
superblock.write_manifest(context);
|
|
1349
|
+
} else {
|
|
1350
|
+
superblock.release(context);
|
|
1351
|
+
}
|
|
1352
|
+
}
|
|
1353
|
+
|
|
1279
1354
|
fn acquire(superblock: *SuperBlock, context: *Context) void {
|
|
1280
1355
|
if (superblock.queue_head) |head| {
|
|
1281
1356
|
// There should be nothing else happening when we format() or open():
|
|
@@ -1298,11 +1373,10 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1298
1373
|
superblock.queue_head = context;
|
|
1299
1374
|
log.debug("{s}: started", .{@tagName(context.caller)});
|
|
1300
1375
|
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
.view_change => superblock.write_view_change(context),
|
|
1376
|
+
if (context.caller == .open) {
|
|
1377
|
+
superblock.read_working(context, .open);
|
|
1378
|
+
} else {
|
|
1379
|
+
superblock.write_staging(context);
|
|
1306
1380
|
}
|
|
1307
1381
|
}
|
|
1308
1382
|
}
|
|
@@ -1312,19 +1386,24 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1312
1386
|
|
|
1313
1387
|
log.debug("{s}: complete", .{@tagName(context.caller)});
|
|
1314
1388
|
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1389
|
+
switch (context.caller) {
|
|
1390
|
+
.format => {},
|
|
1391
|
+
.open => {
|
|
1392
|
+
assert(!superblock.opened);
|
|
1393
|
+
superblock.opened = true;
|
|
1318
1394
|
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1395
|
+
if (superblock.working.manifest_size > 0) {
|
|
1396
|
+
assert(superblock.manifest.count > 0);
|
|
1397
|
+
}
|
|
1398
|
+
// TODO Make the FreeSet encoding format not dependant on the word size.
|
|
1399
|
+
if (superblock.working.free_set_size > @sizeOf(usize)) {
|
|
1400
|
+
assert(superblock.free_set.count_acquired() > 0);
|
|
1401
|
+
}
|
|
1402
|
+
},
|
|
1403
|
+
.checkpoint, .view_change => {
|
|
1404
|
+
assert(meta.eql(superblock.staging.vsr_state, context.vsr_state.?));
|
|
1405
|
+
assert(meta.eql(superblock.working.vsr_state, context.vsr_state.?));
|
|
1406
|
+
},
|
|
1328
1407
|
}
|
|
1329
1408
|
|
|
1330
1409
|
const queue_tail = superblock.queue_tail;
|
|
@@ -1340,40 +1419,6 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1340
1419
|
assert(offset + size <= superblock.storage_offset + superblock.storage_size);
|
|
1341
1420
|
}
|
|
1342
1421
|
|
|
1343
|
-
fn offset_manifest(copy: u8, sequence: u64) u64 {
|
|
1344
|
-
assert(copy >= starting_copy_for_sequence(sequence));
|
|
1345
|
-
assert(copy <= stopping_copy_for_sequence(sequence));
|
|
1346
|
-
|
|
1347
|
-
return superblock_size * copy + @sizeOf(SuperBlockSector);
|
|
1348
|
-
}
|
|
1349
|
-
|
|
1350
|
-
fn offset_free_set(copy: u8, sequence: u64) u64 {
|
|
1351
|
-
assert(copy >= starting_copy_for_sequence(sequence));
|
|
1352
|
-
assert(copy <= stopping_copy_for_sequence(sequence));
|
|
1353
|
-
|
|
1354
|
-
return superblock_size * copy + @sizeOf(SuperBlockSector) +
|
|
1355
|
-
superblock_trailer_manifest_size_max;
|
|
1356
|
-
}
|
|
1357
|
-
|
|
1358
|
-
fn offset_client_table(copy: u8, sequence: u64) u64 {
|
|
1359
|
-
assert(copy >= starting_copy_for_sequence(sequence));
|
|
1360
|
-
assert(copy <= stopping_copy_for_sequence(sequence));
|
|
1361
|
-
|
|
1362
|
-
return superblock_size * copy + @sizeOf(SuperBlockSector) +
|
|
1363
|
-
superblock_trailer_manifest_size_max +
|
|
1364
|
-
superblock_trailer_free_set_size_max;
|
|
1365
|
-
}
|
|
1366
|
-
|
|
1367
|
-
/// Returns the first copy index (inclusive) to be written for a sequence number.
|
|
1368
|
-
fn starting_copy_for_sequence(sequence: u64) u8 {
|
|
1369
|
-
return config.superblock_copies * @intCast(u8, sequence % 2);
|
|
1370
|
-
}
|
|
1371
|
-
|
|
1372
|
-
/// Returns the last copy index (inclusive) to be written for a sequence number.
|
|
1373
|
-
fn stopping_copy_for_sequence(sequence: u64) u8 {
|
|
1374
|
-
return starting_copy_for_sequence(sequence) + config.superblock_copies - 1;
|
|
1375
|
-
}
|
|
1376
|
-
|
|
1377
1422
|
/// We use flexible quorums for even quorums with write quorum > read quorum, for example:
|
|
1378
1423
|
/// * When writing, we must verify that at least 3/4 copies were written.
|
|
1379
1424
|
/// * At startup, we must verify that at least 2/4 copies were read.
|
|
@@ -1404,217 +1449,25 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1404
1449
|
};
|
|
1405
1450
|
}
|
|
1406
1451
|
|
|
1407
|
-
const
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
valid: bool = false,
|
|
1412
|
-
};
|
|
1413
|
-
|
|
1414
|
-
const QuorumCount = std.StaticBitSet(superblock_copies_max);
|
|
1415
|
-
|
|
1416
|
-
array: [superblock_copies_max]Quorum = undefined,
|
|
1417
|
-
count: u8 = 0,
|
|
1418
|
-
|
|
1419
|
-
pub const Error = error{
|
|
1420
|
-
NotFound,
|
|
1421
|
-
QuorumLost,
|
|
1422
|
-
ParentNotFound,
|
|
1423
|
-
ParentQuorumLost,
|
|
1424
|
-
SequenceNotMonotonic,
|
|
1425
|
-
VSRStateNotMonotonic,
|
|
1426
|
-
};
|
|
1427
|
-
|
|
1428
|
-
/// Returns the working superblock according to the quorum with the highest sequence number.
|
|
1429
|
-
/// Verifies that the highest quorum is connected, that the previous quorum was not lost.
|
|
1430
|
-
/// i.e. Both the working and previous quorum must be valid and intact and connected.
|
|
1431
|
-
/// Otherwise, we might regress to a previous working superblock.
|
|
1432
|
-
pub fn working(
|
|
1433
|
-
quorums: *Quorums,
|
|
1434
|
-
copies: []SuperBlockSector,
|
|
1435
|
-
threshold: u8,
|
|
1436
|
-
) Error!*const SuperBlockSector {
|
|
1437
|
-
assert(copies.len == superblock_copies_max);
|
|
1438
|
-
assert(threshold >= 2 and threshold <= 5);
|
|
1439
|
-
|
|
1440
|
-
quorums.array = undefined;
|
|
1441
|
-
quorums.count = 0;
|
|
1442
|
-
|
|
1443
|
-
for (copies) |*copy, index| quorums.count_copy(copy, index, threshold);
|
|
1444
|
-
|
|
1445
|
-
std.sort.sort(Quorum, quorums.slice(), {}, sort_priority_descending);
|
|
1446
|
-
|
|
1447
|
-
for (quorums.slice()) |quorum| {
|
|
1448
|
-
if (quorum.count.count() == config.superblock_copies) {
|
|
1449
|
-
log.debug("quorum: checksum={x} parent={x} sequence={} count={} valid={}", .{
|
|
1450
|
-
quorum.sector.checksum,
|
|
1451
|
-
quorum.sector.parent,
|
|
1452
|
-
quorum.sector.sequence,
|
|
1453
|
-
quorum.count.count(),
|
|
1454
|
-
quorum.valid,
|
|
1455
|
-
});
|
|
1456
|
-
} else {
|
|
1457
|
-
log.err("quorum: checksum={x} parent={x} sequence={} count={} valid={}", .{
|
|
1458
|
-
quorum.sector.checksum,
|
|
1459
|
-
quorum.sector.parent,
|
|
1460
|
-
quorum.sector.sequence,
|
|
1461
|
-
quorum.count.count(),
|
|
1462
|
-
quorum.valid,
|
|
1463
|
-
});
|
|
1464
|
-
}
|
|
1465
|
-
}
|
|
1466
|
-
|
|
1467
|
-
// No working copies of any sequence number exist in the superblock storage zone at all.
|
|
1468
|
-
if (quorums.slice().len == 0) return error.NotFound;
|
|
1469
|
-
|
|
1470
|
-
// At least one copy or quorum exists.
|
|
1471
|
-
const b = quorums.slice()[0];
|
|
1472
|
-
|
|
1473
|
-
// Verify that the remaining quorums are correctly sorted:
|
|
1474
|
-
for (quorums.slice()[1..]) |a| {
|
|
1475
|
-
assert(sort_priority_descending({}, b, a));
|
|
1476
|
-
assert(a.sector.magic == .superblock);
|
|
1477
|
-
assert(a.sector.valid_checksum());
|
|
1478
|
-
}
|
|
1479
|
-
|
|
1480
|
-
// Even the best copy with the most quorum still has inadequate quorum.
|
|
1481
|
-
if (!b.valid) return error.QuorumLost;
|
|
1482
|
-
|
|
1483
|
-
// The superblock is only partially formatted, not all copies were written.
|
|
1484
|
-
if (b.sector.sequence < 2) return error.NotFound;
|
|
1485
|
-
|
|
1486
|
-
// Verify that the parent copy exists:
|
|
1487
|
-
for (quorums.slice()[1..]) |a| {
|
|
1488
|
-
if (a.sector.cluster != b.sector.cluster) {
|
|
1489
|
-
log.err("superblock copy={} has cluster={} instead of {}", .{
|
|
1490
|
-
a.sector.copy,
|
|
1491
|
-
a.sector.cluster,
|
|
1492
|
-
b.sector.cluster,
|
|
1493
|
-
});
|
|
1494
|
-
} else if (a.sector.replica != b.sector.replica) {
|
|
1495
|
-
log.err("superblock copy={} has replica={} instead of {}", .{
|
|
1496
|
-
a.sector.copy,
|
|
1497
|
-
a.sector.replica,
|
|
1498
|
-
b.sector.replica,
|
|
1499
|
-
});
|
|
1500
|
-
} else if (a.sector.checksum == b.sector.parent) {
|
|
1501
|
-
assert(a.sector.checksum != b.sector.checksum);
|
|
1502
|
-
assert(a.sector.cluster == b.sector.cluster);
|
|
1503
|
-
assert(a.sector.replica == b.sector.replica);
|
|
1504
|
-
|
|
1505
|
-
if (!a.valid) {
|
|
1506
|
-
return error.ParentQuorumLost;
|
|
1507
|
-
} else if (a.sector.sequence >= b.sector.sequence) {
|
|
1508
|
-
return error.SequenceNotMonotonic;
|
|
1509
|
-
} else if (a.sector.sequence % 2 == b.sector.sequence % 2) {
|
|
1510
|
-
// The parent must reside in the alternate copy to guarantee that we are able to
|
|
1511
|
-
// detect when the working quorum is lost.
|
|
1512
|
-
return error.SequenceNotMonotonic;
|
|
1513
|
-
} else if (!a.sector.vsr_state.monotonic(b.sector.vsr_state)) {
|
|
1514
|
-
return error.VSRStateNotMonotonic;
|
|
1515
|
-
} else {
|
|
1516
|
-
assert(b.sector.magic == .superblock);
|
|
1517
|
-
assert(b.sector.valid_checksum());
|
|
1518
|
-
|
|
1519
|
-
return b.sector;
|
|
1520
|
-
}
|
|
1521
|
-
}
|
|
1522
|
-
}
|
|
1523
|
-
|
|
1524
|
-
return error.ParentNotFound;
|
|
1452
|
+
pub const Layout = struct {
|
|
1453
|
+
pub fn offset_sector(copy: u8) u64 {
|
|
1454
|
+
assert(copy < config.superblock_copies);
|
|
1455
|
+
return superblock_copy_size * @as(u64, copy);
|
|
1525
1456
|
}
|
|
1526
1457
|
|
|
1527
|
-
fn
|
|
1528
|
-
|
|
1529
|
-
copy
|
|
1530
|
-
index: usize,
|
|
1531
|
-
threshold: u8,
|
|
1532
|
-
) void {
|
|
1533
|
-
assert(index < superblock_copies_max);
|
|
1534
|
-
assert(threshold >= 2 and threshold <= 5);
|
|
1535
|
-
|
|
1536
|
-
if (!copy.valid_checksum()) {
|
|
1537
|
-
log.debug("copy: {}/{}: invalid checksum", .{ index, superblock_copies_max });
|
|
1538
|
-
return;
|
|
1539
|
-
}
|
|
1540
|
-
|
|
1541
|
-
if (copy.magic != .superblock) {
|
|
1542
|
-
log.debug("copy: {}/{}: not a superblock", .{ index, superblock_copies_max });
|
|
1543
|
-
return;
|
|
1544
|
-
}
|
|
1545
|
-
|
|
1546
|
-
if (copy.copy == index) {
|
|
1547
|
-
log.debug("copy: {}/{}: checksum={x} parent={x} sequence={}", .{
|
|
1548
|
-
index,
|
|
1549
|
-
superblock_copies_max,
|
|
1550
|
-
copy.checksum,
|
|
1551
|
-
copy.parent,
|
|
1552
|
-
copy.sequence,
|
|
1553
|
-
});
|
|
1554
|
-
} else {
|
|
1555
|
-
// If our read was misdirected, we definitely still want to count the copy.
|
|
1556
|
-
// We must just be careful to count it idempotently.
|
|
1557
|
-
log.err(
|
|
1558
|
-
"copy: {}/{}: checksum={x} parent={x} sequence={} misdirected from copy={}",
|
|
1559
|
-
.{
|
|
1560
|
-
index,
|
|
1561
|
-
superblock_copies_max,
|
|
1562
|
-
copy.checksum,
|
|
1563
|
-
copy.parent,
|
|
1564
|
-
copy.sequence,
|
|
1565
|
-
copy.copy,
|
|
1566
|
-
},
|
|
1567
|
-
);
|
|
1568
|
-
}
|
|
1569
|
-
|
|
1570
|
-
var quorum = quorums.find_or_insert_quorum_for_copy(copy);
|
|
1571
|
-
assert(quorum.sector.checksum == copy.checksum);
|
|
1572
|
-
assert(quorum.sector.equal(copy));
|
|
1573
|
-
|
|
1574
|
-
quorum.count.set(copy.copy);
|
|
1575
|
-
assert(quorum.count.isSet(copy.copy));
|
|
1576
|
-
|
|
1577
|
-
// In the worst case, all copies may contain divergent forks of the same sequence.
|
|
1578
|
-
// However, this should not happen for the same checksum.
|
|
1579
|
-
assert(quorum.count.count() <= config.superblock_copies);
|
|
1580
|
-
|
|
1581
|
-
quorum.valid = quorum.count.count() >= threshold;
|
|
1582
|
-
}
|
|
1583
|
-
|
|
1584
|
-
fn find_or_insert_quorum_for_copy(quorums: *Quorums, copy: *const SuperBlockSector) *Quorum {
|
|
1585
|
-
assert(copy.magic == .superblock);
|
|
1586
|
-
assert(copy.valid_checksum());
|
|
1587
|
-
|
|
1588
|
-
for (quorums.array[0..quorums.count]) |*quorum| {
|
|
1589
|
-
if (copy.checksum == quorum.sector.checksum) return quorum;
|
|
1590
|
-
} else {
|
|
1591
|
-
quorums.array[quorums.count] = Quorum{ .sector = copy };
|
|
1592
|
-
quorums.count += 1;
|
|
1593
|
-
|
|
1594
|
-
return &quorums.array[quorums.count - 1];
|
|
1595
|
-
}
|
|
1458
|
+
pub fn offset_manifest(copy: u8) u64 {
|
|
1459
|
+
assert(copy < config.superblock_copies);
|
|
1460
|
+
return offset_sector(copy) + @sizeOf(SuperBlockSector);
|
|
1596
1461
|
}
|
|
1597
1462
|
|
|
1598
|
-
fn
|
|
1599
|
-
|
|
1463
|
+
pub fn offset_free_set(copy: u8) u64 {
|
|
1464
|
+
assert(copy < config.superblock_copies);
|
|
1465
|
+
return offset_manifest(copy) + superblock_trailer_manifest_size_max;
|
|
1600
1466
|
}
|
|
1601
1467
|
|
|
1602
|
-
fn
|
|
1603
|
-
assert(
|
|
1604
|
-
|
|
1605
|
-
assert(b.sector.magic == .superblock);
|
|
1606
|
-
|
|
1607
|
-
if (a.valid and !b.valid) return true;
|
|
1608
|
-
if (b.valid and !a.valid) return false;
|
|
1609
|
-
|
|
1610
|
-
if (a.sector.sequence > b.sector.sequence) return true;
|
|
1611
|
-
if (b.sector.sequence > a.sector.sequence) return false;
|
|
1612
|
-
|
|
1613
|
-
if (a.count.count() > b.count.count()) return true;
|
|
1614
|
-
if (b.count.count() > a.count.count()) return false;
|
|
1615
|
-
|
|
1616
|
-
// The sort order must be stable and deterministic:
|
|
1617
|
-
return a.sector.checksum > b.sector.checksum;
|
|
1468
|
+
pub fn offset_client_table(copy: u8) u64 {
|
|
1469
|
+
assert(copy < config.superblock_copies);
|
|
1470
|
+
return offset_free_set(copy) + superblock_trailer_free_set_size_max;
|
|
1618
1471
|
}
|
|
1619
1472
|
};
|
|
1620
1473
|
|
|
@@ -1633,111 +1486,3 @@ test "SuperBlockSector" {
|
|
|
1633
1486
|
a.replica += 1;
|
|
1634
1487
|
try expect(!a.valid_checksum());
|
|
1635
1488
|
}
|
|
1636
|
-
|
|
1637
|
-
// TODO Add unit tests for Quorums.
|
|
1638
|
-
// TODO Test invariants and transitions across TestRunner functions.
|
|
1639
|
-
// TODO Add a pristine in-memory test storage shim (we currently use real disk).
|
|
1640
|
-
const TestStorage = @import("../storage.zig").Storage;
|
|
1641
|
-
const TestSuperBlock = SuperBlockType(TestStorage);
|
|
1642
|
-
|
|
1643
|
-
const TestRunner = struct {
|
|
1644
|
-
superblock: *TestSuperBlock,
|
|
1645
|
-
context_format: TestSuperBlock.Context = undefined,
|
|
1646
|
-
context_open: TestSuperBlock.Context = undefined,
|
|
1647
|
-
context_checkpoint: TestSuperBlock.Context = undefined,
|
|
1648
|
-
context_view_change: TestSuperBlock.Context = undefined,
|
|
1649
|
-
pending: usize = 0,
|
|
1650
|
-
|
|
1651
|
-
fn format(runner: *TestRunner, options: TestSuperBlock.FormatOptions) void {
|
|
1652
|
-
runner.pending += 1;
|
|
1653
|
-
runner.superblock.format(format_callback, &runner.context_format, options);
|
|
1654
|
-
}
|
|
1655
|
-
|
|
1656
|
-
fn format_callback(context: *TestSuperBlock.Context) void {
|
|
1657
|
-
const runner = @fieldParentPtr(TestRunner, "context_format", context);
|
|
1658
|
-
runner.pending -= 1;
|
|
1659
|
-
runner.open();
|
|
1660
|
-
}
|
|
1661
|
-
|
|
1662
|
-
fn open(runner: *TestRunner) void {
|
|
1663
|
-
runner.pending += 1;
|
|
1664
|
-
runner.superblock.open(open_callback, &runner.context_open);
|
|
1665
|
-
}
|
|
1666
|
-
|
|
1667
|
-
fn open_callback(context: *TestSuperBlock.Context) void {
|
|
1668
|
-
const runner = @fieldParentPtr(TestRunner, "context_open", context);
|
|
1669
|
-
runner.pending -= 1;
|
|
1670
|
-
runner.checkpoint();
|
|
1671
|
-
runner.view_change();
|
|
1672
|
-
}
|
|
1673
|
-
|
|
1674
|
-
fn view_change(runner: *TestRunner) void {
|
|
1675
|
-
runner.pending += 1;
|
|
1676
|
-
runner.superblock.view_change(
|
|
1677
|
-
view_change_callback,
|
|
1678
|
-
&runner.context_view_change,
|
|
1679
|
-
.{
|
|
1680
|
-
.commit_min = runner.superblock.working.vsr_state.commit_min + 1,
|
|
1681
|
-
.commit_max = runner.superblock.working.vsr_state.commit_max + 2,
|
|
1682
|
-
.view_normal = runner.superblock.working.vsr_state.view_normal + 3,
|
|
1683
|
-
.view = runner.superblock.working.vsr_state.view + 4,
|
|
1684
|
-
},
|
|
1685
|
-
);
|
|
1686
|
-
}
|
|
1687
|
-
|
|
1688
|
-
fn view_change_callback(context: *TestSuperBlock.Context) void {
|
|
1689
|
-
const runner = @fieldParentPtr(TestRunner, "context_view_change", context);
|
|
1690
|
-
runner.pending -= 1;
|
|
1691
|
-
runner.checkpoint();
|
|
1692
|
-
}
|
|
1693
|
-
|
|
1694
|
-
fn checkpoint(runner: *TestRunner) void {
|
|
1695
|
-
runner.pending += 1;
|
|
1696
|
-
runner.superblock.checkpoint(checkpoint_callback, &runner.context_checkpoint);
|
|
1697
|
-
}
|
|
1698
|
-
|
|
1699
|
-
fn checkpoint_callback(context: *TestSuperBlock.Context) void {
|
|
1700
|
-
const runner = @fieldParentPtr(TestRunner, "context_checkpoint", context);
|
|
1701
|
-
runner.pending -= 1;
|
|
1702
|
-
}
|
|
1703
|
-
};
|
|
1704
|
-
|
|
1705
|
-
pub fn main() !void {
|
|
1706
|
-
const testing = std.testing;
|
|
1707
|
-
const allocator = testing.allocator;
|
|
1708
|
-
|
|
1709
|
-
const IO = @import("../io.zig").IO;
|
|
1710
|
-
const Storage = @import("../storage.zig").Storage;
|
|
1711
|
-
|
|
1712
|
-
const dir_path = ".";
|
|
1713
|
-
const dir_fd = os.openZ(dir_path, os.O.CLOEXEC | os.O.RDONLY, 0) catch |err| {
|
|
1714
|
-
std.debug.print("failed to open directory '{s}': {}", .{ dir_path, err });
|
|
1715
|
-
return;
|
|
1716
|
-
};
|
|
1717
|
-
|
|
1718
|
-
const cluster = 32;
|
|
1719
|
-
const replica = 4;
|
|
1720
|
-
const size_max = 512 * 1024 * 1024;
|
|
1721
|
-
|
|
1722
|
-
const storage_fd = try Storage.open(dir_fd, "test_superblock", size_max, true);
|
|
1723
|
-
defer std.fs.cwd().deleteFile("test_superblock") catch {};
|
|
1724
|
-
|
|
1725
|
-
var io = try IO.init(128, 0);
|
|
1726
|
-
defer io.deinit();
|
|
1727
|
-
|
|
1728
|
-
var storage = try Storage.init(&io, size_max, storage_fd);
|
|
1729
|
-
defer storage.deinit();
|
|
1730
|
-
|
|
1731
|
-
var superblock = try TestSuperBlock.init(allocator, &storage);
|
|
1732
|
-
defer superblock.deinit(allocator);
|
|
1733
|
-
|
|
1734
|
-
var runner = TestRunner{ .superblock = &superblock };
|
|
1735
|
-
|
|
1736
|
-
runner.format(.{
|
|
1737
|
-
.cluster = cluster,
|
|
1738
|
-
.replica = replica,
|
|
1739
|
-
.size_max = size_max,
|
|
1740
|
-
});
|
|
1741
|
-
|
|
1742
|
-
while (runner.pending > 0) try io.run_for_ns(100);
|
|
1743
|
-
}
|