tigerbeetle-node 0.11.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -3
- package/src/tigerbeetle/scripts/fuzz_loop.sh +1 -1
- package/src/tigerbeetle/scripts/pre-commit.sh +2 -2
- package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
- package/src/tigerbeetle/src/benchmark.zig +25 -11
- package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
- package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
- package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -256
- package/src/tigerbeetle/src/c/tb_client.h +18 -4
- package/src/tigerbeetle/src/c/tb_client.zig +88 -26
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
- package/src/tigerbeetle/src/c/test.zig +371 -1
- package/src/tigerbeetle/src/cli.zig +36 -6
- package/src/tigerbeetle/src/config.zig +10 -1
- package/src/tigerbeetle/src/demo.zig +2 -1
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
- package/src/tigerbeetle/src/ewah.zig +11 -33
- package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
- package/src/tigerbeetle/src/lsm/README.md +97 -3
- package/src/tigerbeetle/src/lsm/compaction.zig +32 -7
- package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +34 -32
- package/src/tigerbeetle/src/lsm/grid.zig +39 -21
- package/src/tigerbeetle/src/lsm/groove.zig +1 -0
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +3 -3
- package/src/tigerbeetle/src/lsm/level_iterator.zig +1 -1
- package/src/tigerbeetle/src/lsm/manifest.zig +13 -0
- package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -49
- package/src/tigerbeetle/src/lsm/manifest_log.zig +173 -335
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +1 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +24 -15
- package/src/tigerbeetle/src/lsm/table.zig +32 -20
- package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
- package/src/tigerbeetle/src/lsm/table_iterator.zig +4 -5
- package/src/tigerbeetle/src/lsm/test.zig +13 -2
- package/src/tigerbeetle/src/lsm/tree.zig +45 -7
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +36 -32
- package/src/tigerbeetle/src/main.zig +55 -2
- package/src/tigerbeetle/src/message_bus.zig +18 -7
- package/src/tigerbeetle/src/message_pool.zig +8 -2
- package/src/tigerbeetle/src/ring_buffer.zig +7 -3
- package/src/tigerbeetle/src/simulator.zig +38 -11
- package/src/tigerbeetle/src/state_machine.zig +47 -22
- package/src/tigerbeetle/src/test/accounting/workload.zig +9 -5
- package/src/tigerbeetle/src/test/cluster.zig +15 -33
- package/src/tigerbeetle/src/test/conductor.zig +2 -1
- package/src/tigerbeetle/src/test/network.zig +45 -19
- package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
- package/src/tigerbeetle/src/test/state_checker.zig +5 -7
- package/src/tigerbeetle/src/test/storage.zig +453 -110
- package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
- package/src/tigerbeetle/src/tigerbeetle.zig +1 -0
- package/src/tigerbeetle/src/unit_tests.zig +6 -1
- package/src/tigerbeetle/src/util.zig +97 -11
- package/src/tigerbeetle/src/vopr.zig +2 -1
- package/src/tigerbeetle/src/vsr/client.zig +8 -3
- package/src/tigerbeetle/src/vsr/journal.zig +280 -202
- package/src/tigerbeetle/src/vsr/replica.zig +169 -31
- package/src/tigerbeetle/src/vsr/superblock.zig +356 -629
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -6
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +414 -151
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +44 -9
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
- package/src/tigerbeetle/src/vsr.zig +19 -5
- package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
- package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
- package/src/tigerbeetle/src/vopr_hub/README.md +0 -58
- package/src/tigerbeetle/src/vopr_hub/SETUP.md +0 -199
- package/src/tigerbeetle/src/vopr_hub/go.mod +0 -3
- package/src/tigerbeetle/src/vopr_hub/main.go +0 -1022
- package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +0 -3
- package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +0 -403
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
//! SuperBlock invariants:
|
|
2
|
+
//!
|
|
3
|
+
//! * vsr_state
|
|
4
|
+
//! - vsr_state.commit_min is initially 0 (for a newly-formatted replica).
|
|
5
|
+
//! - vsr_state.commit_min ≤ vsr_state.commit_max
|
|
6
|
+
//! - vsr_state.view_normal ≤ vsr_state.view
|
|
7
|
+
//! - checkpoint() must advance the superblock's vsr_state.commit_min.
|
|
8
|
+
//! - view_change() must not advance the superblock's vsr_state.commit_min.
|
|
9
|
+
//! - All fields of vsr_state except commit_min_checksum are monotonically increasing over
|
|
10
|
+
//! view_change()/checkpoint().
|
|
11
|
+
//!
|
|
1
12
|
const std = @import("std");
|
|
2
13
|
const assert = std.debug.assert;
|
|
3
14
|
const crypto = std.crypto;
|
|
@@ -15,18 +26,11 @@ const MessagePool = @import("../message_pool.zig").MessagePool;
|
|
|
15
26
|
pub const SuperBlockManifest = @import("superblock_manifest.zig").Manifest;
|
|
16
27
|
pub const SuperBlockFreeSet = @import("superblock_free_set.zig").FreeSet;
|
|
17
28
|
pub const SuperBlockClientTable = @import("superblock_client_table.zig").ClientTable;
|
|
29
|
+
pub const Quorums = @import("superblock_quorums.zig").QuorumsType(.{
|
|
30
|
+
.superblock_copies = config.superblock_copies,
|
|
31
|
+
});
|
|
18
32
|
|
|
19
|
-
|
|
20
|
-
pub const Magic = enum(u8) {
|
|
21
|
-
superblock,
|
|
22
|
-
manifest,
|
|
23
|
-
prepare,
|
|
24
|
-
index,
|
|
25
|
-
filter,
|
|
26
|
-
data,
|
|
27
|
-
};
|
|
28
|
-
|
|
29
|
-
pub const SuperBlockVersion: u8 = 0;
|
|
33
|
+
pub const SuperBlockVersion: u16 = 0;
|
|
30
34
|
|
|
31
35
|
// Fields are aligned to work as an extern or packed struct.
|
|
32
36
|
pub const SuperBlockSector = extern struct {
|
|
@@ -38,14 +42,13 @@ pub const SuperBlockSector = extern struct {
|
|
|
38
42
|
/// This simplifies writing and comparing multiple copies.
|
|
39
43
|
copy: u8 = 0,
|
|
40
44
|
|
|
41
|
-
/// Protects against
|
|
42
|
-
|
|
45
|
+
/// Protects against writing to or reading from the wrong data file.
|
|
46
|
+
replica: u8,
|
|
43
47
|
|
|
44
48
|
/// The version of the superblock format in use, reserved for major breaking changes.
|
|
45
|
-
version:
|
|
49
|
+
version: u16,
|
|
46
50
|
|
|
47
51
|
/// Protects against writing to or reading from the wrong data file.
|
|
48
|
-
replica: u8,
|
|
49
52
|
cluster: u32,
|
|
50
53
|
|
|
51
54
|
/// The current size of the data file.
|
|
@@ -93,9 +96,12 @@ pub const SuperBlockSector = extern struct {
|
|
|
93
96
|
/// The size of the client table entries stored in the superblock trailer.
|
|
94
97
|
client_table_size: u32,
|
|
95
98
|
|
|
96
|
-
reserved: [
|
|
99
|
+
reserved: [3148]u8 = [_]u8{0} ** 3148,
|
|
97
100
|
|
|
98
101
|
pub const VSRState = extern struct {
|
|
102
|
+
/// The vsr.Header.checksum of commit_min's message.
|
|
103
|
+
commit_min_checksum: u128,
|
|
104
|
+
|
|
99
105
|
/// The last operation committed to the state machine. At startup, replay the log hereafter.
|
|
100
106
|
commit_min: u64,
|
|
101
107
|
|
|
@@ -108,12 +114,24 @@ pub const SuperBlockSector = extern struct {
|
|
|
108
114
|
/// The view number of the replica.
|
|
109
115
|
view: u32,
|
|
110
116
|
|
|
117
|
+
reserved: [8]u8 = [_]u8{0} ** 8,
|
|
118
|
+
|
|
111
119
|
comptime {
|
|
112
|
-
assert(@sizeOf(VSRState) ==
|
|
120
|
+
assert(@sizeOf(VSRState) == 48);
|
|
113
121
|
// Assert that there is no implicit padding in the struct.
|
|
114
122
|
assert(@bitSizeOf(VSRState) == @sizeOf(VSRState) * 8);
|
|
115
123
|
}
|
|
116
124
|
|
|
125
|
+
pub fn root(cluster: u32) VSRState {
|
|
126
|
+
return .{
|
|
127
|
+
.commit_min_checksum = vsr.Header.root_prepare(cluster).checksum,
|
|
128
|
+
.commit_min = 0,
|
|
129
|
+
.commit_max = 0,
|
|
130
|
+
.view_normal = 0,
|
|
131
|
+
.view = 0,
|
|
132
|
+
};
|
|
133
|
+
}
|
|
134
|
+
|
|
117
135
|
pub fn internally_consistent(state: VSRState) bool {
|
|
118
136
|
return state.commit_max >= state.commit_min and state.view >= state.view_normal;
|
|
119
137
|
}
|
|
@@ -121,6 +139,10 @@ pub const SuperBlockSector = extern struct {
|
|
|
121
139
|
pub fn monotonic(old: VSRState, new: VSRState) bool {
|
|
122
140
|
assert(old.internally_consistent());
|
|
123
141
|
assert(new.internally_consistent());
|
|
142
|
+
// The last case is for when checking monotonic() from the sequence=0 sector.
|
|
143
|
+
assert(old.commit_min != new.commit_min or
|
|
144
|
+
old.commit_min_checksum == new.commit_min_checksum or
|
|
145
|
+
(old.commit_min_checksum == 0 and old.commit_min == 0));
|
|
124
146
|
|
|
125
147
|
if (old.view > new.view) return false;
|
|
126
148
|
if (old.view_normal > new.view_normal) return false;
|
|
@@ -147,6 +169,7 @@ pub const SuperBlockSector = extern struct {
|
|
|
147
169
|
/// But the corresponding `compact()` updates were preserved, and must not be repeated
|
|
148
170
|
/// to ensure determinstic storage.
|
|
149
171
|
pub fn op_compacted(state: VSRState, op: u64) bool {
|
|
172
|
+
// If commit_min is 0, we have never checkpointed, so no compactions are checkpointed.
|
|
150
173
|
return state.commit_min > 0 and op <= state.commit_min + config.lsm_batch_multiple;
|
|
151
174
|
}
|
|
152
175
|
};
|
|
@@ -202,13 +225,13 @@ pub const SuperBlockSector = extern struct {
|
|
|
202
225
|
}
|
|
203
226
|
|
|
204
227
|
pub fn set_checksum(superblock: *SuperBlockSector) void {
|
|
205
|
-
assert(superblock.copy <
|
|
206
|
-
assert(superblock.magic == .superblock);
|
|
228
|
+
assert(superblock.copy < config.superblock_copies);
|
|
207
229
|
assert(superblock.version == SuperBlockVersion);
|
|
208
230
|
assert(superblock.flags == 0);
|
|
209
231
|
|
|
210
232
|
assert(@bitCast(u32, superblock.reserved[0..4].*) == 0);
|
|
211
233
|
for (mem.bytesAsSlice(u64, superblock.reserved[4..])) |word| assert(word == 0);
|
|
234
|
+
for (mem.bytesAsSlice(u64, &superblock.vsr_state.reserved)) |word| assert(word == 0);
|
|
212
235
|
|
|
213
236
|
superblock.checksum = superblock.calculate_checksum();
|
|
214
237
|
}
|
|
@@ -219,9 +242,6 @@ pub const SuperBlockSector = extern struct {
|
|
|
219
242
|
|
|
220
243
|
/// Does not consider { checksum, copy } when comparing equality.
|
|
221
244
|
pub fn equal(a: *const SuperBlockSector, b: *const SuperBlockSector) bool {
|
|
222
|
-
assert(a.magic == .superblock);
|
|
223
|
-
assert(b.magic == .superblock);
|
|
224
|
-
|
|
225
245
|
if (a.version != b.version) return false;
|
|
226
246
|
if (a.replica != b.replica) return false;
|
|
227
247
|
if (a.cluster != b.cluster) return false;
|
|
@@ -243,6 +263,9 @@ pub const SuperBlockSector = extern struct {
|
|
|
243
263
|
for (mem.bytesAsSlice(u64, a.reserved[4..])) |word| assert(word == 0);
|
|
244
264
|
for (mem.bytesAsSlice(u64, b.reserved[4..])) |word| assert(word == 0);
|
|
245
265
|
|
|
266
|
+
for (mem.bytesAsSlice(u64, &a.vsr_state.reserved)) |word| assert(word == 0);
|
|
267
|
+
for (mem.bytesAsSlice(u64, &b.vsr_state.reserved)) |word| assert(word == 0);
|
|
268
|
+
|
|
246
269
|
return true;
|
|
247
270
|
}
|
|
248
271
|
};
|
|
@@ -255,20 +278,12 @@ comptime {
|
|
|
255
278
|
}
|
|
256
279
|
|
|
257
280
|
/// The size of the entire superblock storage zone.
|
|
258
|
-
pub const superblock_zone_size =
|
|
259
|
-
|
|
260
|
-
/// A single set of copies (a copy set) consists of config.superblock_copies of a superblock.
|
|
261
|
-
/// At least two copy sets are required for copy-on-write in order not to impair existing copies.
|
|
262
|
-
///
|
|
263
|
-
/// However, when writing only the superblock sector for a view change, we do update-in-place,
|
|
264
|
-
/// which is necessary as we need to continue to reference the existing superblock trailer to
|
|
265
|
-
/// decouple view changes from checkpoints, to not force an untimely checkpoint ahead of schedule.
|
|
266
|
-
pub const superblock_copies_max = config.superblock_copies * 2;
|
|
281
|
+
pub const superblock_zone_size = superblock_copy_size * config.superblock_copies;
|
|
267
282
|
|
|
268
283
|
/// The size of an individual superblock including trailer.
|
|
269
|
-
pub const
|
|
284
|
+
pub const superblock_copy_size = @sizeOf(SuperBlockSector) + superblock_trailer_size_max;
|
|
270
285
|
comptime {
|
|
271
|
-
assert(
|
|
286
|
+
assert(superblock_copy_size % config.sector_size == 0);
|
|
272
287
|
}
|
|
273
288
|
|
|
274
289
|
/// The maximum possible size of the superblock trailer, following the superblock sector.
|
|
@@ -289,7 +304,9 @@ pub const superblock_trailer_size_max = blk: {
|
|
|
289
304
|
|
|
290
305
|
// We order the smaller manifest section ahead of the block free set for better access locality.
|
|
291
306
|
// For example, it's cheaper to skip over 1 MiB when reading from disk than to skip over 32 MiB.
|
|
292
|
-
break :blk superblock_trailer_manifest_size_max +
|
|
307
|
+
break :blk superblock_trailer_manifest_size_max +
|
|
308
|
+
superblock_trailer_free_set_size_max +
|
|
309
|
+
superblock_trailer_client_table_size_max;
|
|
293
310
|
};
|
|
294
311
|
|
|
295
312
|
// A manifest block reference of 40 bytes contains a tree hash, checksum, and address.
|
|
@@ -299,6 +316,7 @@ pub const superblock_trailer_manifest_size_max = blk: {
|
|
|
299
316
|
|
|
300
317
|
// Use a multiple of sector * reference so that the size is exactly divisible without padding:
|
|
301
318
|
// For example, this 2.5 MiB manifest trailer == 65536 references == 65536 * 511 or 34m tables.
|
|
319
|
+
// TODO Size this relative to the expected number of tables & fragmentation.
|
|
302
320
|
break :blk 16 * config.sector_size * SuperBlockManifest.BlockReferenceSize;
|
|
303
321
|
};
|
|
304
322
|
|
|
@@ -322,6 +340,33 @@ pub const data_file_size_min = blk: {
|
|
|
322
340
|
break :blk superblock_zone_size + config.journal_size_max;
|
|
323
341
|
};
|
|
324
342
|
|
|
343
|
+
/// This table shows the sequence number progression of the SuperBlock's sectors.
|
|
344
|
+
///
|
|
345
|
+
/// action working staging disk
|
|
346
|
+
/// format seq seq seq
|
|
347
|
+
/// 0 - Initially the file has no sectors.
|
|
348
|
+
/// 0 1 -
|
|
349
|
+
/// 0 1 1 Write a copyset for the first sequence.
|
|
350
|
+
/// 1 1 1 Read quorum; verify 3/4 are valid.
|
|
351
|
+
///
|
|
352
|
+
/// open seq seq seq
|
|
353
|
+
/// a
|
|
354
|
+
/// a a Read quorum; verify 2/4 are valid.
|
|
355
|
+
/// a (a) a Repair any broken copies of `a`.
|
|
356
|
+
///
|
|
357
|
+
/// checkpoint seq seq seq
|
|
358
|
+
/// a a a
|
|
359
|
+
/// a a+1
|
|
360
|
+
/// a a+1 a+1
|
|
361
|
+
/// a+1 a+1 a+1 Read quorum; verify 3/4 are valid.
|
|
362
|
+
///
|
|
363
|
+
/// view_change seq seq seq
|
|
364
|
+
/// a a
|
|
365
|
+
/// a a+1 a The new sequence reuses the original parent.
|
|
366
|
+
/// a a+1 a+1
|
|
367
|
+
/// a+1 a+1 a+1 Read quorum; verify 3/4 are valid.
|
|
368
|
+
/// working staging disk
|
|
369
|
+
///
|
|
325
370
|
pub fn SuperBlockType(comptime Storage: type) type {
|
|
326
371
|
return struct {
|
|
327
372
|
const SuperBlock = @This();
|
|
@@ -344,8 +389,11 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
344
389
|
|
|
345
390
|
write: Storage.Write = undefined,
|
|
346
391
|
read: Storage.Read = undefined,
|
|
347
|
-
|
|
348
|
-
|
|
392
|
+
read_threshold: ?Quorums.Threshold = null,
|
|
393
|
+
copy: ?u8 = null,
|
|
394
|
+
/// Used by format(), checkpoint(), and view_change().
|
|
395
|
+
vsr_state: ?SuperBlockSector.VSRState = null,
|
|
396
|
+
repairs: ?Quorums.RepairIterator = null, // Used by open().
|
|
349
397
|
};
|
|
350
398
|
|
|
351
399
|
storage: *Storage,
|
|
@@ -360,14 +408,8 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
360
408
|
working: *align(config.sector_size) SuperBlockSector,
|
|
361
409
|
|
|
362
410
|
/// The superblock that will replace the current working superblock once written.
|
|
363
|
-
/// This is used when writing the staging superblock, or when changing views before then.
|
|
364
411
|
/// We cannot mutate any working state directly until it is safely on stable storage.
|
|
365
412
|
/// Otherwise, we may accidentally externalize guarantees that are not yet durable.
|
|
366
|
-
writing: *align(config.sector_size) SuperBlockSector,
|
|
367
|
-
|
|
368
|
-
/// The superblock that will be checkpointed next.
|
|
369
|
-
/// This may be updated incrementally several times before the next checkpoint.
|
|
370
|
-
/// For example, to track new snapshots as they are registered.
|
|
371
413
|
staging: *align(config.sector_size) SuperBlockSector,
|
|
372
414
|
|
|
373
415
|
/// The copies that we read into at startup or when verifying the written superblock.
|
|
@@ -421,11 +463,8 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
421
463
|
const b = try allocator.allocAdvanced(SuperBlockSector, config.sector_size, 1, .exact);
|
|
422
464
|
errdefer allocator.free(b);
|
|
423
465
|
|
|
424
|
-
const c = try allocator.allocAdvanced(SuperBlockSector, config.sector_size, 1, .exact);
|
|
425
|
-
errdefer allocator.free(c);
|
|
426
|
-
|
|
427
466
|
const reading = try allocator.allocAdvanced(
|
|
428
|
-
[config.superblock_copies
|
|
467
|
+
[config.superblock_copies]SuperBlockSector,
|
|
429
468
|
config.sector_size,
|
|
430
469
|
1,
|
|
431
470
|
.exact,
|
|
@@ -475,8 +514,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
475
514
|
return SuperBlock{
|
|
476
515
|
.storage = storage,
|
|
477
516
|
.working = &a[0],
|
|
478
|
-
.
|
|
479
|
-
.staging = &c[0],
|
|
517
|
+
.staging = &b[0],
|
|
480
518
|
.reading = &reading[0],
|
|
481
519
|
.manifest = manifest,
|
|
482
520
|
.free_set = free_set,
|
|
@@ -489,7 +527,6 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
489
527
|
|
|
490
528
|
pub fn deinit(superblock: *SuperBlock, allocator: mem.Allocator) void {
|
|
491
529
|
allocator.destroy(superblock.working);
|
|
492
|
-
allocator.destroy(superblock.writing);
|
|
493
530
|
allocator.destroy(superblock.staging);
|
|
494
531
|
allocator.free(superblock.reading);
|
|
495
532
|
|
|
@@ -526,7 +563,6 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
526
563
|
// We therefore use zero values to make this parent checksum as stable as possible.
|
|
527
564
|
superblock.working.* = .{
|
|
528
565
|
.copy = 0,
|
|
529
|
-
.magic = .superblock,
|
|
530
566
|
.version = SuperBlockVersion,
|
|
531
567
|
.sequence = 0,
|
|
532
568
|
.replica = options.replica,
|
|
@@ -538,6 +574,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
538
574
|
.free_set_checksum = 0,
|
|
539
575
|
.client_table_checksum = 0,
|
|
540
576
|
.vsr_state = .{
|
|
577
|
+
.commit_min_checksum = 0,
|
|
541
578
|
.commit_min = 0,
|
|
542
579
|
.commit_max = 0,
|
|
543
580
|
.view_normal = 0,
|
|
@@ -557,15 +594,11 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
557
594
|
|
|
558
595
|
superblock.working.set_checksum();
|
|
559
596
|
|
|
560
|
-
superblock.staging.* = superblock.working.*;
|
|
561
|
-
superblock.staging.sequence = superblock.working.sequence + 1;
|
|
562
|
-
superblock.staging.parent = superblock.working.checksum;
|
|
563
|
-
|
|
564
597
|
context.* = .{
|
|
565
598
|
.superblock = superblock,
|
|
566
599
|
.callback = callback,
|
|
567
600
|
.caller = .format,
|
|
568
|
-
.
|
|
601
|
+
.vsr_state = SuperBlockSector.VSRState.root(options.cluster),
|
|
569
602
|
};
|
|
570
603
|
|
|
571
604
|
// TODO At a higher layer, we must:
|
|
@@ -591,23 +624,33 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
591
624
|
superblock.acquire(context);
|
|
592
625
|
}
|
|
593
626
|
|
|
627
|
+
/// The vsr_state must update the commit_min and commit_min_checksum.
|
|
628
|
+
// TODO Will the replica ever update view/view_normal by calling checkpoint() during a view
|
|
629
|
+
// change? If not, forbid it.
|
|
594
630
|
pub fn checkpoint(
|
|
595
631
|
superblock: *SuperBlock,
|
|
596
632
|
callback: fn (context: *Context) void,
|
|
597
633
|
context: *Context,
|
|
634
|
+
vsr_state: SuperBlockSector.VSRState,
|
|
598
635
|
) void {
|
|
599
636
|
assert(superblock.opened);
|
|
637
|
+
// Checkpoint must advance commit_min, but never the view.
|
|
638
|
+
assert(superblock.staging.vsr_state.would_be_updated_by(vsr_state));
|
|
639
|
+
assert(superblock.staging.vsr_state.commit_min < vsr_state.commit_min);
|
|
640
|
+
assert(superblock.staging.vsr_state.commit_min_checksum !=
|
|
641
|
+
vsr_state.commit_min_checksum);
|
|
600
642
|
|
|
601
643
|
context.* = .{
|
|
602
644
|
.superblock = superblock,
|
|
603
645
|
.callback = callback,
|
|
604
646
|
.caller = .checkpoint,
|
|
605
|
-
.
|
|
647
|
+
.vsr_state = vsr_state,
|
|
606
648
|
};
|
|
607
649
|
|
|
608
650
|
superblock.acquire(context);
|
|
609
651
|
}
|
|
610
652
|
|
|
653
|
+
/// The vsr_state must not update the `commit_min` or `commit_min_checksum`.
|
|
611
654
|
pub fn view_change(
|
|
612
655
|
superblock: *SuperBlock,
|
|
613
656
|
callback: fn (context: *Context) void,
|
|
@@ -615,20 +658,28 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
615
658
|
vsr_state: SuperBlockSector.VSRState,
|
|
616
659
|
) void {
|
|
617
660
|
assert(superblock.opened);
|
|
661
|
+
assert(vsr_state.commit_min == superblock.staging.vsr_state.commit_min);
|
|
662
|
+
assert(vsr_state.commit_min_checksum ==
|
|
663
|
+
superblock.staging.vsr_state.commit_min_checksum);
|
|
664
|
+
assert(superblock.staging.vsr_state.monotonic(vsr_state));
|
|
618
665
|
|
|
619
666
|
log.debug(
|
|
620
|
-
"view_change:
|
|
667
|
+
"view_change: commit_min_checksum={}..{} commit_min={}..{} commit_max={}..{} " ++
|
|
668
|
+
"view_normal={}..{} view={}..{}",
|
|
621
669
|
.{
|
|
622
|
-
superblock.
|
|
670
|
+
superblock.staging.vsr_state.commit_min_checksum,
|
|
671
|
+
vsr_state.commit_min_checksum,
|
|
672
|
+
|
|
673
|
+
superblock.staging.vsr_state.commit_min,
|
|
623
674
|
vsr_state.commit_min,
|
|
624
675
|
|
|
625
|
-
superblock.
|
|
676
|
+
superblock.staging.vsr_state.commit_max,
|
|
626
677
|
vsr_state.commit_max,
|
|
627
678
|
|
|
628
|
-
superblock.
|
|
679
|
+
superblock.staging.vsr_state.view_normal,
|
|
629
680
|
vsr_state.view_normal,
|
|
630
681
|
|
|
631
|
-
superblock.
|
|
682
|
+
superblock.staging.vsr_state.view,
|
|
632
683
|
vsr_state.view,
|
|
633
684
|
},
|
|
634
685
|
);
|
|
@@ -639,14 +690,10 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
639
690
|
.superblock = superblock,
|
|
640
691
|
.callback = callback,
|
|
641
692
|
.caller = .view_change,
|
|
642
|
-
.copy = undefined,
|
|
643
693
|
.vsr_state = vsr_state,
|
|
644
694
|
};
|
|
645
695
|
|
|
646
|
-
|
|
647
|
-
assert(meta.eql(superblock.working.vsr_state, superblock.staging.vsr_state));
|
|
648
|
-
|
|
649
|
-
if (!superblock.working.vsr_state.would_be_updated_by(context.vsr_state)) {
|
|
696
|
+
if (!superblock.staging.vsr_state.would_be_updated_by(context.vsr_state.?)) {
|
|
650
697
|
log.debug("view_change: no change", .{});
|
|
651
698
|
callback(context);
|
|
652
699
|
return;
|
|
@@ -672,34 +719,32 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
672
719
|
}
|
|
673
720
|
|
|
674
721
|
fn write_staging(superblock: *SuperBlock, context: *Context) void {
|
|
675
|
-
assert(context.caller
|
|
722
|
+
assert(context.caller != .open);
|
|
676
723
|
assert(context.caller == .format or superblock.opened);
|
|
724
|
+
assert(context.copy == null);
|
|
725
|
+
assert(context.vsr_state.?.internally_consistent());
|
|
677
726
|
assert(superblock.queue_head == context);
|
|
678
727
|
assert(superblock.queue_tail == null);
|
|
728
|
+
assert(superblock.working.vsr_state.would_be_updated_by(context.vsr_state.?));
|
|
679
729
|
|
|
680
|
-
superblock.
|
|
681
|
-
superblock.
|
|
682
|
-
superblock.
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
superblock.staging.
|
|
691
|
-
superblock.staging.parent = superblock.writing.checksum;
|
|
692
|
-
|
|
693
|
-
assert(superblock.writing.manifest_checksum == superblock.staging.manifest_checksum);
|
|
694
|
-
assert(superblock.writing.free_set_checksum == superblock.staging.free_set_checksum);
|
|
695
|
-
assert(superblock.writing.client_table_checksum == superblock.staging.client_table_checksum);
|
|
696
|
-
|
|
697
|
-
assert(superblock.writing.manifest_size == superblock.staging.manifest_size);
|
|
698
|
-
assert(superblock.writing.free_set_size == superblock.staging.free_set_size);
|
|
699
|
-
assert(superblock.writing.client_table_size == superblock.staging.client_table_size);
|
|
730
|
+
superblock.staging.* = superblock.working.*;
|
|
731
|
+
superblock.staging.sequence = superblock.staging.sequence + 1;
|
|
732
|
+
superblock.staging.parent = superblock.staging.checksum;
|
|
733
|
+
superblock.staging.vsr_state.update(context.vsr_state.?);
|
|
734
|
+
|
|
735
|
+
if (context.caller != .view_change) {
|
|
736
|
+
superblock.write_staging_encode_manifest();
|
|
737
|
+
superblock.write_staging_encode_free_set();
|
|
738
|
+
superblock.write_staging_encode_client_table();
|
|
739
|
+
}
|
|
740
|
+
superblock.staging.set_checksum();
|
|
700
741
|
|
|
701
|
-
context.copy =
|
|
702
|
-
|
|
742
|
+
context.copy = 0;
|
|
743
|
+
if (context.caller == .view_change) {
|
|
744
|
+
superblock.write_sector(context);
|
|
745
|
+
} else {
|
|
746
|
+
superblock.write_manifest(context);
|
|
747
|
+
}
|
|
703
748
|
}
|
|
704
749
|
|
|
705
750
|
fn write_staging_encode_manifest(superblock: *SuperBlock) void {
|
|
@@ -740,54 +785,25 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
740
785
|
staging.client_table_checksum = vsr.checksum(target[0..staging.client_table_size]);
|
|
741
786
|
}
|
|
742
787
|
|
|
743
|
-
fn write_view_change(superblock: *SuperBlock, context: *Context) void {
|
|
744
|
-
assert(context.caller == .view_change);
|
|
745
|
-
assert(superblock.opened);
|
|
746
|
-
assert(superblock.queue_head == context);
|
|
747
|
-
assert(superblock.queue_tail == null);
|
|
748
|
-
assert(context.vsr_state.internally_consistent());
|
|
749
|
-
assert(meta.eql(superblock.working.vsr_state, superblock.staging.vsr_state));
|
|
750
|
-
assert(superblock.working.vsr_state.would_be_updated_by(context.vsr_state));
|
|
751
|
-
|
|
752
|
-
superblock.writing.* = superblock.working.*;
|
|
753
|
-
|
|
754
|
-
// We cannot increment the sequence number when writing only the superblock sector as
|
|
755
|
-
// this would write the sector to another copy set with different superblock trailers.
|
|
756
|
-
// Instead, we increment twice so that the sector remains in the same copy set.
|
|
757
|
-
superblock.writing.sequence += 2;
|
|
758
|
-
assert(superblock.writing.parent == superblock.working.parent);
|
|
759
|
-
|
|
760
|
-
superblock.writing.vsr_state.update(context.vsr_state);
|
|
761
|
-
superblock.staging.vsr_state.update(context.vsr_state);
|
|
762
|
-
|
|
763
|
-
superblock.writing.set_checksum();
|
|
764
|
-
|
|
765
|
-
superblock.staging.sequence = superblock.writing.sequence + 1;
|
|
766
|
-
superblock.staging.parent = superblock.writing.checksum;
|
|
767
|
-
|
|
768
|
-
context.copy = starting_copy_for_sequence(superblock.writing.sequence);
|
|
769
|
-
superblock.write_sector(context);
|
|
770
|
-
}
|
|
771
|
-
|
|
772
788
|
fn write_manifest(superblock: *SuperBlock, context: *Context) void {
|
|
773
789
|
assert(superblock.queue_head == context);
|
|
774
790
|
|
|
775
|
-
const size = vsr.sector_ceil(superblock.
|
|
791
|
+
const size = vsr.sector_ceil(superblock.staging.manifest_size);
|
|
776
792
|
assert(size <= superblock_trailer_manifest_size_max);
|
|
777
793
|
|
|
778
794
|
const buffer = superblock.manifest_buffer[0..size];
|
|
779
|
-
const offset = offset_manifest(context.copy
|
|
795
|
+
const offset = Layout.offset_manifest(context.copy.?);
|
|
780
796
|
|
|
781
|
-
mem.set(u8, buffer[superblock.
|
|
797
|
+
mem.set(u8, buffer[superblock.staging.manifest_size..], 0); // Zero sector padding.
|
|
782
798
|
|
|
783
|
-
assert(superblock.
|
|
784
|
-
superblock.manifest_buffer[0..superblock.
|
|
799
|
+
assert(superblock.staging.manifest_checksum == vsr.checksum(
|
|
800
|
+
superblock.manifest_buffer[0..superblock.staging.manifest_size],
|
|
785
801
|
));
|
|
786
802
|
|
|
787
803
|
log.debug("{s}: write_manifest: checksum={x} size={} offset={}", .{
|
|
788
804
|
@tagName(context.caller),
|
|
789
|
-
superblock.
|
|
790
|
-
superblock.
|
|
805
|
+
superblock.staging.manifest_checksum,
|
|
806
|
+
superblock.staging.manifest_size,
|
|
791
807
|
offset,
|
|
792
808
|
});
|
|
793
809
|
|
|
@@ -815,22 +831,22 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
815
831
|
fn write_free_set(superblock: *SuperBlock, context: *Context) void {
|
|
816
832
|
assert(superblock.queue_head == context);
|
|
817
833
|
|
|
818
|
-
const size = vsr.sector_ceil(superblock.
|
|
834
|
+
const size = vsr.sector_ceil(superblock.staging.free_set_size);
|
|
819
835
|
assert(size <= superblock_trailer_free_set_size_max);
|
|
820
836
|
|
|
821
837
|
const buffer = superblock.free_set_buffer[0..size];
|
|
822
|
-
const offset = offset_free_set(context.copy
|
|
838
|
+
const offset = Layout.offset_free_set(context.copy.?);
|
|
823
839
|
|
|
824
|
-
mem.set(u8, buffer[superblock.
|
|
840
|
+
mem.set(u8, buffer[superblock.staging.free_set_size..], 0); // Zero sector padding.
|
|
825
841
|
|
|
826
|
-
assert(superblock.
|
|
827
|
-
superblock.free_set_buffer[0..superblock.
|
|
842
|
+
assert(superblock.staging.free_set_checksum == vsr.checksum(
|
|
843
|
+
superblock.free_set_buffer[0..superblock.staging.free_set_size],
|
|
828
844
|
));
|
|
829
845
|
|
|
830
846
|
log.debug("{s}: write_free_set: checksum={x} size={} offset={}", .{
|
|
831
847
|
@tagName(context.caller),
|
|
832
|
-
superblock.
|
|
833
|
-
superblock.
|
|
848
|
+
superblock.staging.free_set_checksum,
|
|
849
|
+
superblock.staging.free_set_size,
|
|
834
850
|
offset,
|
|
835
851
|
});
|
|
836
852
|
|
|
@@ -858,22 +874,22 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
858
874
|
fn write_client_table(superblock: *SuperBlock, context: *Context) void {
|
|
859
875
|
assert(superblock.queue_head == context);
|
|
860
876
|
|
|
861
|
-
const size = vsr.sector_ceil(superblock.
|
|
877
|
+
const size = vsr.sector_ceil(superblock.staging.client_table_size);
|
|
862
878
|
assert(size <= superblock_trailer_client_table_size_max);
|
|
863
879
|
|
|
864
880
|
const buffer = superblock.client_table_buffer[0..size];
|
|
865
|
-
const offset = offset_client_table(context.copy
|
|
881
|
+
const offset = Layout.offset_client_table(context.copy.?);
|
|
866
882
|
|
|
867
|
-
mem.set(u8, buffer[superblock.
|
|
883
|
+
mem.set(u8, buffer[superblock.staging.client_table_size..], 0); // Zero sector padding.
|
|
868
884
|
|
|
869
|
-
assert(superblock.
|
|
870
|
-
superblock.client_table_buffer[0..superblock.
|
|
885
|
+
assert(superblock.staging.client_table_checksum == vsr.checksum(
|
|
886
|
+
superblock.client_table_buffer[0..superblock.staging.client_table_size],
|
|
871
887
|
));
|
|
872
888
|
|
|
873
889
|
log.debug("{s}: write_client_table: checksum={x} size={} offset={}", .{
|
|
874
890
|
@tagName(context.caller),
|
|
875
|
-
superblock.
|
|
876
|
-
superblock.
|
|
891
|
+
superblock.staging.client_table_checksum,
|
|
892
|
+
superblock.staging.client_table_size,
|
|
877
893
|
offset,
|
|
878
894
|
});
|
|
879
895
|
|
|
@@ -901,45 +917,42 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
901
917
|
fn write_sector(superblock: *SuperBlock, context: *Context) void {
|
|
902
918
|
assert(superblock.queue_head == context);
|
|
903
919
|
|
|
904
|
-
// We
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
920
|
+
// We update the working superblock for a checkpoint/format/view_change:
|
|
921
|
+
// open() does not update the working superblock, since it only writes to repair.
|
|
922
|
+
if (context.caller == .open) {
|
|
923
|
+
assert(superblock.staging.sequence == superblock.working.sequence);
|
|
924
|
+
} else {
|
|
925
|
+
assert(superblock.staging.sequence == superblock.working.sequence + 1);
|
|
926
|
+
assert(superblock.staging.parent == superblock.working.checksum);
|
|
927
|
+
}
|
|
912
928
|
|
|
913
929
|
// The superblock cluster and replica should never change once formatted:
|
|
914
|
-
assert(superblock.
|
|
915
|
-
assert(superblock.
|
|
916
|
-
assert(superblock.writing.replica == superblock.working.replica);
|
|
917
|
-
assert(superblock.writing.replica == superblock.staging.replica);
|
|
930
|
+
assert(superblock.staging.cluster == superblock.working.cluster);
|
|
931
|
+
assert(superblock.staging.replica == superblock.working.replica);
|
|
918
932
|
|
|
919
|
-
assert(superblock.
|
|
920
|
-
assert(superblock.
|
|
933
|
+
assert(superblock.staging.size >= data_file_size_min);
|
|
934
|
+
assert(superblock.staging.size <= superblock.staging.size_max);
|
|
921
935
|
|
|
922
|
-
assert(context.copy <
|
|
923
|
-
|
|
924
|
-
assert(context.copy <= stopping_copy_for_sequence(superblock.writing.sequence));
|
|
925
|
-
superblock.writing.copy = context.copy;
|
|
936
|
+
assert(context.copy.? < config.superblock_copies);
|
|
937
|
+
superblock.staging.copy = context.copy.?;
|
|
926
938
|
|
|
927
939
|
// Updating the copy number should not affect the checksum, which was previously set:
|
|
928
|
-
assert(superblock.
|
|
940
|
+
assert(superblock.staging.valid_checksum());
|
|
929
941
|
|
|
930
|
-
const buffer = mem.asBytes(superblock.
|
|
931
|
-
const offset =
|
|
942
|
+
const buffer = mem.asBytes(superblock.staging);
|
|
943
|
+
const offset = Layout.offset_sector(context.copy.?);
|
|
932
944
|
|
|
933
|
-
log.debug("{s}: write_sector: checksum={x} sequence={} copy={} size={} offset={}", .{
|
|
945
|
+
log.debug("{}: {s}: write_sector: checksum={x} sequence={} copy={} size={} offset={}", .{
|
|
946
|
+
superblock.staging.replica,
|
|
934
947
|
@tagName(context.caller),
|
|
935
|
-
superblock.
|
|
936
|
-
superblock.
|
|
937
|
-
context.copy
|
|
948
|
+
superblock.staging.checksum,
|
|
949
|
+
superblock.staging.sequence,
|
|
950
|
+
context.copy.?,
|
|
938
951
|
buffer.len,
|
|
939
952
|
offset,
|
|
940
953
|
});
|
|
941
954
|
|
|
942
|
-
superblock.assert_bounds(offset, buffer.len
|
|
955
|
+
superblock.assert_bounds(offset, buffer.len);
|
|
943
956
|
|
|
944
957
|
superblock.storage.write_sectors(
|
|
945
958
|
write_sector_callback,
|
|
@@ -953,25 +966,24 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
953
966
|
fn write_sector_callback(write: *Storage.Write) void {
|
|
954
967
|
const context = @fieldParentPtr(Context, "write", write);
|
|
955
968
|
const superblock = context.superblock;
|
|
969
|
+
const copy = context.copy.?;
|
|
956
970
|
|
|
957
971
|
assert(superblock.queue_head == context);
|
|
958
972
|
|
|
959
|
-
assert(
|
|
960
|
-
assert(
|
|
961
|
-
assert(context.copy <= stopping_copy_for_sequence(superblock.writing.sequence));
|
|
962
|
-
assert(context.copy == superblock.writing.copy);
|
|
973
|
+
assert(copy < config.superblock_copies);
|
|
974
|
+
assert(copy == superblock.staging.copy);
|
|
963
975
|
|
|
964
|
-
if (context.
|
|
965
|
-
|
|
966
|
-
|
|
976
|
+
if (context.caller == .open) {
|
|
977
|
+
context.copy = null;
|
|
978
|
+
superblock.repair(context);
|
|
979
|
+
return;
|
|
980
|
+
}
|
|
967
981
|
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
superblock.read_working(context);
|
|
972
|
-
}
|
|
982
|
+
if (copy + 1 == config.superblock_copies) {
|
|
983
|
+
context.copy = null;
|
|
984
|
+
superblock.read_working(context, .verify);
|
|
973
985
|
} else {
|
|
974
|
-
context.copy
|
|
986
|
+
context.copy = copy + 1;
|
|
975
987
|
|
|
976
988
|
switch (context.caller) {
|
|
977
989
|
.open => unreachable,
|
|
@@ -981,34 +993,42 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
981
993
|
}
|
|
982
994
|
}
|
|
983
995
|
|
|
984
|
-
fn read_working(
|
|
996
|
+
fn read_working(
|
|
997
|
+
superblock: *SuperBlock,
|
|
998
|
+
context: *Context,
|
|
999
|
+
threshold: Quorums.Threshold,
|
|
1000
|
+
) void {
|
|
985
1001
|
assert(superblock.queue_head == context);
|
|
1002
|
+
assert(context.copy == null);
|
|
1003
|
+
assert(context.read_threshold == null);
|
|
986
1004
|
|
|
987
1005
|
// We do not submit reads in parallel, as while this would shave off 1ms, it would also
|
|
988
1006
|
// increase the risk that a single fault applies to more reads due to temporal locality.
|
|
989
1007
|
// This would make verification reads more flaky when we do experience a read fault.
|
|
990
1008
|
// See "An Analysis of Data Corruption in the Storage Stack".
|
|
991
1009
|
|
|
992
|
-
context.copy = 0;
|
|
1010
|
+
context.copy = 0;
|
|
1011
|
+
context.read_threshold = threshold;
|
|
993
1012
|
for (superblock.reading) |*copy| copy.* = undefined;
|
|
994
1013
|
superblock.read_sector(context);
|
|
995
1014
|
}
|
|
996
1015
|
|
|
997
1016
|
fn read_sector(superblock: *SuperBlock, context: *Context) void {
|
|
998
1017
|
assert(superblock.queue_head == context);
|
|
999
|
-
assert(context.copy <
|
|
1018
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1019
|
+
assert(context.read_threshold != null);
|
|
1000
1020
|
|
|
1001
|
-
const buffer = mem.asBytes(&superblock.reading[context.copy]);
|
|
1002
|
-
const offset =
|
|
1021
|
+
const buffer = mem.asBytes(&superblock.reading[context.copy.?]);
|
|
1022
|
+
const offset = Layout.offset_sector(context.copy.?);
|
|
1003
1023
|
|
|
1004
1024
|
log.debug("{s}: read_sector: copy={} size={} offset={}", .{
|
|
1005
1025
|
@tagName(context.caller),
|
|
1006
|
-
context.copy
|
|
1026
|
+
context.copy.?,
|
|
1007
1027
|
buffer.len,
|
|
1008
1028
|
offset,
|
|
1009
1029
|
});
|
|
1010
1030
|
|
|
1011
|
-
superblock.assert_bounds(offset, buffer.len
|
|
1031
|
+
superblock.assert_bounds(offset, buffer.len);
|
|
1012
1032
|
|
|
1013
1033
|
superblock.storage.read_sectors(
|
|
1014
1034
|
read_sector_callback,
|
|
@@ -1022,96 +1042,109 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1022
1042
|
fn read_sector_callback(read: *Storage.Read) void {
|
|
1023
1043
|
const context = @fieldParentPtr(Context, "read", read);
|
|
1024
1044
|
const superblock = context.superblock;
|
|
1045
|
+
const threshold = context.read_threshold.?;
|
|
1025
1046
|
|
|
1026
1047
|
assert(superblock.queue_head == context);
|
|
1027
1048
|
|
|
1028
|
-
assert(context.copy <
|
|
1029
|
-
if (context.copy
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
},
|
|
1042
|
-
.open => {
|
|
1043
|
-
superblock.staging.* = working.*;
|
|
1044
|
-
superblock.staging.sequence = working.sequence + 1;
|
|
1045
|
-
superblock.staging.parent = working.checksum;
|
|
1046
|
-
},
|
|
1047
|
-
}
|
|
1049
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1050
|
+
if (context.copy.? + 1 != config.superblock_copies) {
|
|
1051
|
+
context.copy = context.copy.? + 1;
|
|
1052
|
+
superblock.read_sector(context);
|
|
1053
|
+
return;
|
|
1054
|
+
}
|
|
1055
|
+
|
|
1056
|
+
context.read_threshold = null;
|
|
1057
|
+
context.copy = null;
|
|
1058
|
+
|
|
1059
|
+
if (superblock.quorums.working(superblock.reading, threshold)) |quorum| {
|
|
1060
|
+
assert(quorum.valid);
|
|
1061
|
+
assert(quorum.copies.count() >= threshold.count());
|
|
1048
1062
|
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
assert(working.free_set_size == 8);
|
|
1054
|
-
assert(working.vsr_state.commit_min == 0);
|
|
1055
|
-
assert(working.vsr_state.commit_max == 0);
|
|
1056
|
-
assert(working.vsr_state.view_normal == 0);
|
|
1057
|
-
assert(working.vsr_state.view == 0);
|
|
1058
|
-
} else if (context.caller == .checkpoint) {
|
|
1059
|
-
superblock.free_set.checkpoint();
|
|
1063
|
+
const working = quorum.sector;
|
|
1064
|
+
if (threshold == .verify) {
|
|
1065
|
+
if (working.checksum != superblock.staging.checksum) {
|
|
1066
|
+
@panic("superblock failed verification after writing");
|
|
1060
1067
|
}
|
|
1068
|
+
assert(working.equal(superblock.staging));
|
|
1069
|
+
}
|
|
1061
1070
|
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1071
|
+
if (context.caller == .format) {
|
|
1072
|
+
assert(working.sequence == 1);
|
|
1073
|
+
assert(working.size == data_file_size_min);
|
|
1074
|
+
assert(working.manifest_size == 0);
|
|
1075
|
+
assert(working.free_set_size == 8);
|
|
1076
|
+
assert(working.client_table_size == 4);
|
|
1077
|
+
assert(working.vsr_state.commit_min_checksum ==
|
|
1078
|
+
vsr.Header.root_prepare(working.cluster).checksum);
|
|
1079
|
+
assert(working.vsr_state.commit_min == 0);
|
|
1080
|
+
assert(working.vsr_state.commit_max == 0);
|
|
1081
|
+
assert(working.vsr_state.view_normal == 0);
|
|
1082
|
+
assert(working.vsr_state.view == 0);
|
|
1083
|
+
} else if (context.caller == .checkpoint) {
|
|
1084
|
+
superblock.free_set.checkpoint();
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
superblock.working.* = working.*;
|
|
1088
|
+
superblock.staging.* = working.*;
|
|
1089
|
+
log.debug(
|
|
1090
|
+
"{s}: installed working superblock: checksum={x} sequence={} cluster={} " ++
|
|
1091
|
+
"replica={} size={} " ++
|
|
1092
|
+
"commit_min_checksum={} commit_min={} commit_max={} " ++
|
|
1093
|
+
"view_normal={} view={}",
|
|
1094
|
+
.{
|
|
1095
|
+
@tagName(context.caller),
|
|
1096
|
+
superblock.working.checksum,
|
|
1097
|
+
superblock.working.sequence,
|
|
1098
|
+
superblock.working.cluster,
|
|
1099
|
+
superblock.working.replica,
|
|
1100
|
+
superblock.working.size,
|
|
1101
|
+
superblock.working.vsr_state.commit_min_checksum,
|
|
1102
|
+
superblock.working.vsr_state.commit_min,
|
|
1103
|
+
superblock.working.vsr_state.commit_max,
|
|
1104
|
+
superblock.working.vsr_state.view_normal,
|
|
1105
|
+
superblock.working.vsr_state.view,
|
|
1106
|
+
},
|
|
1107
|
+
);
|
|
1108
|
+
|
|
1109
|
+
if (context.caller == .open) {
|
|
1110
|
+
if (context.repairs) |_| {
|
|
1111
|
+
// We just verified that the repair completed.
|
|
1112
|
+
assert(threshold == .verify);
|
|
1085
1113
|
superblock.release(context);
|
|
1114
|
+
} else {
|
|
1115
|
+
assert(threshold == .open);
|
|
1116
|
+
context.copy = 0;
|
|
1117
|
+
context.repairs = quorum.repairs();
|
|
1118
|
+
superblock.read_manifest(context);
|
|
1086
1119
|
}
|
|
1087
|
-
} else
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
error.ParentNotFound => @panic("superblock parent not found"),
|
|
1091
|
-
error.ParentQuorumLost => @panic("superblock parent quorum lost"),
|
|
1092
|
-
error.VSRStateNotMonotonic => @panic("superblock vsr state not monotonic"),
|
|
1093
|
-
error.SequenceNotMonotonic => @panic("superblock sequence not monotonic"),
|
|
1120
|
+
} else {
|
|
1121
|
+
// TODO Consider calling TRIM() on Grid's free suffix after checkpointing.
|
|
1122
|
+
superblock.release(context);
|
|
1094
1123
|
}
|
|
1095
|
-
} else {
|
|
1096
|
-
|
|
1097
|
-
|
|
1124
|
+
} else |err| switch (err) {
|
|
1125
|
+
error.Fork => @panic("superblock forked"),
|
|
1126
|
+
error.NotFound => @panic("superblock not found"),
|
|
1127
|
+
error.QuorumLost => @panic("superblock quorum lost"),
|
|
1128
|
+
error.ParentNotConnected => @panic("superblock parent not connected"),
|
|
1129
|
+
error.ParentSkipped => @panic("superblock parent superseded"),
|
|
1130
|
+
error.VSRStateNotMonotonic => @panic("superblock vsr state not monotonic"),
|
|
1098
1131
|
}
|
|
1099
1132
|
}
|
|
1100
1133
|
|
|
1101
1134
|
fn read_manifest(superblock: *SuperBlock, context: *Context) void {
|
|
1102
1135
|
assert(context.caller == .open);
|
|
1103
1136
|
assert(superblock.queue_head == context);
|
|
1104
|
-
assert(context.copy <
|
|
1137
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1105
1138
|
|
|
1106
1139
|
const size = vsr.sector_ceil(superblock.working.manifest_size);
|
|
1107
1140
|
assert(size <= superblock_trailer_manifest_size_max);
|
|
1108
1141
|
|
|
1109
1142
|
const buffer = superblock.manifest_buffer[0..size];
|
|
1110
|
-
const offset = offset_manifest(context.copy
|
|
1143
|
+
const offset = Layout.offset_manifest(context.copy.?);
|
|
1111
1144
|
|
|
1112
1145
|
log.debug("{s}: read_manifest: copy={} size={} offset={}", .{
|
|
1113
1146
|
@tagName(context.caller),
|
|
1114
|
-
context.copy
|
|
1147
|
+
context.copy.?,
|
|
1115
1148
|
buffer.len,
|
|
1116
1149
|
offset,
|
|
1117
1150
|
});
|
|
@@ -1135,6 +1168,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1135
1168
|
fn read_manifest_callback(read: *Storage.Read) void {
|
|
1136
1169
|
const context = @fieldParentPtr(Context, "read", read);
|
|
1137
1170
|
const superblock = context.superblock;
|
|
1171
|
+
const copy = context.copy.?;
|
|
1138
1172
|
|
|
1139
1173
|
assert(context.caller == .open);
|
|
1140
1174
|
assert(superblock.queue_head == context);
|
|
@@ -1153,13 +1187,13 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1153
1187
|
// TODO Repair any impaired copies before we continue.
|
|
1154
1188
|
// At present, we repair at the next checkpoint.
|
|
1155
1189
|
// We do not repair padding.
|
|
1156
|
-
context.copy =
|
|
1190
|
+
context.copy = 0;
|
|
1157
1191
|
superblock.read_free_set(context);
|
|
1158
|
-
} else if (
|
|
1192
|
+
} else if (copy + 1 == config.superblock_copies) {
|
|
1159
1193
|
@panic("superblock manifest lost");
|
|
1160
1194
|
} else {
|
|
1161
|
-
log.debug("open: read_manifest: corrupt copy={}", .{
|
|
1162
|
-
context.copy
|
|
1195
|
+
log.debug("open: read_manifest: corrupt copy={}", .{copy});
|
|
1196
|
+
context.copy = copy + 1;
|
|
1163
1197
|
superblock.read_manifest(context);
|
|
1164
1198
|
}
|
|
1165
1199
|
}
|
|
@@ -1167,17 +1201,17 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1167
1201
|
fn read_free_set(superblock: *SuperBlock, context: *Context) void {
|
|
1168
1202
|
assert(context.caller == .open);
|
|
1169
1203
|
assert(superblock.queue_head == context);
|
|
1170
|
-
assert(context.copy <
|
|
1204
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1171
1205
|
|
|
1172
1206
|
const size = vsr.sector_ceil(superblock.working.free_set_size);
|
|
1173
1207
|
assert(size <= superblock_trailer_free_set_size_max);
|
|
1174
1208
|
|
|
1175
1209
|
const buffer = superblock.free_set_buffer[0..size];
|
|
1176
|
-
const offset = offset_free_set(context.copy
|
|
1210
|
+
const offset = Layout.offset_free_set(context.copy.?);
|
|
1177
1211
|
|
|
1178
1212
|
log.debug("{s}: read_free_set: copy={} size={} offset={}", .{
|
|
1179
1213
|
@tagName(context.caller),
|
|
1180
|
-
context.copy
|
|
1214
|
+
context.copy.?,
|
|
1181
1215
|
buffer.len,
|
|
1182
1216
|
offset,
|
|
1183
1217
|
});
|
|
@@ -1201,6 +1235,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1201
1235
|
fn read_free_set_callback(read: *Storage.Read) void {
|
|
1202
1236
|
const context = @fieldParentPtr(Context, "read", read);
|
|
1203
1237
|
const superblock = context.superblock;
|
|
1238
|
+
const copy = context.copy.?;
|
|
1204
1239
|
|
|
1205
1240
|
assert(context.caller == .open);
|
|
1206
1241
|
assert(superblock.queue_head == context);
|
|
@@ -1220,11 +1255,11 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1220
1255
|
|
|
1221
1256
|
// TODO Repair any impaired copies before we continue.
|
|
1222
1257
|
superblock.read_client_table(context);
|
|
1223
|
-
} else if (
|
|
1258
|
+
} else if (copy + 1 == config.superblock_copies) {
|
|
1224
1259
|
@panic("superblock free set lost");
|
|
1225
1260
|
} else {
|
|
1226
|
-
log.debug("open: read_free_set: corrupt copy={}", .{
|
|
1227
|
-
context.copy
|
|
1261
|
+
log.debug("open: read_free_set: corrupt copy={}", .{copy});
|
|
1262
|
+
context.copy = copy + 1;
|
|
1228
1263
|
superblock.read_free_set(context);
|
|
1229
1264
|
}
|
|
1230
1265
|
}
|
|
@@ -1239,17 +1274,17 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1239
1274
|
fn read_client_table(superblock: *SuperBlock, context: *Context) void {
|
|
1240
1275
|
assert(context.caller == .open);
|
|
1241
1276
|
assert(superblock.queue_head == context);
|
|
1242
|
-
assert(context.copy <
|
|
1277
|
+
assert(context.copy.? < config.superblock_copies);
|
|
1243
1278
|
|
|
1244
1279
|
const size = vsr.sector_ceil(superblock.working.client_table_size);
|
|
1245
1280
|
assert(size <= superblock_trailer_client_table_size_max);
|
|
1246
1281
|
|
|
1247
1282
|
const buffer = superblock.client_table_buffer[0..size];
|
|
1248
|
-
const offset = offset_client_table(context.copy
|
|
1283
|
+
const offset = Layout.offset_client_table(context.copy.?);
|
|
1249
1284
|
|
|
1250
1285
|
log.debug("{s}: read_client_table: copy={} size={} offset={}", .{
|
|
1251
1286
|
@tagName(context.caller),
|
|
1252
|
-
context.copy
|
|
1287
|
+
context.copy.?,
|
|
1253
1288
|
buffer.len,
|
|
1254
1289
|
offset,
|
|
1255
1290
|
});
|
|
@@ -1273,6 +1308,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1273
1308
|
fn read_client_table_callback(read: *Storage.Read) void {
|
|
1274
1309
|
const context = @fieldParentPtr(Context, "read", read);
|
|
1275
1310
|
const superblock = context.superblock;
|
|
1311
|
+
const copy = context.copy.?;
|
|
1276
1312
|
|
|
1277
1313
|
assert(context.caller == .open);
|
|
1278
1314
|
assert(superblock.queue_head == context);
|
|
@@ -1288,17 +1324,33 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1288
1324
|
config.clients_max,
|
|
1289
1325
|
});
|
|
1290
1326
|
|
|
1291
|
-
|
|
1292
|
-
superblock.
|
|
1293
|
-
} else if (
|
|
1327
|
+
context.copy = null;
|
|
1328
|
+
superblock.repair(context);
|
|
1329
|
+
} else if (copy + 1 == config.superblock_copies) {
|
|
1294
1330
|
@panic("superblock client table lost");
|
|
1295
1331
|
} else {
|
|
1296
|
-
log.debug("open: read_client_table: corrupt copy={}", .{
|
|
1297
|
-
context.copy
|
|
1332
|
+
log.debug("open: read_client_table: corrupt copy={}", .{copy});
|
|
1333
|
+
context.copy = copy + 1;
|
|
1298
1334
|
superblock.read_client_table(context);
|
|
1299
1335
|
}
|
|
1300
1336
|
}
|
|
1301
1337
|
|
|
1338
|
+
fn repair(superblock: *SuperBlock, context: *Context) void {
|
|
1339
|
+
assert(context.caller == .open);
|
|
1340
|
+
assert(context.copy == null);
|
|
1341
|
+
assert(superblock.queue_head == context);
|
|
1342
|
+
|
|
1343
|
+
if (context.repairs.?.next()) |repair_copy| {
|
|
1344
|
+
context.copy = repair_copy;
|
|
1345
|
+
log.warn("repair: copy={}", .{repair_copy});
|
|
1346
|
+
|
|
1347
|
+
superblock.staging.* = superblock.working.*;
|
|
1348
|
+
superblock.write_manifest(context);
|
|
1349
|
+
} else {
|
|
1350
|
+
superblock.release(context);
|
|
1351
|
+
}
|
|
1352
|
+
}
|
|
1353
|
+
|
|
1302
1354
|
fn acquire(superblock: *SuperBlock, context: *Context) void {
|
|
1303
1355
|
if (superblock.queue_head) |head| {
|
|
1304
1356
|
// There should be nothing else happening when we format() or open():
|
|
@@ -1321,11 +1373,10 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1321
1373
|
superblock.queue_head = context;
|
|
1322
1374
|
log.debug("{s}: started", .{@tagName(context.caller)});
|
|
1323
1375
|
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
.view_change => superblock.write_view_change(context),
|
|
1376
|
+
if (context.caller == .open) {
|
|
1377
|
+
superblock.read_working(context, .open);
|
|
1378
|
+
} else {
|
|
1379
|
+
superblock.write_staging(context);
|
|
1329
1380
|
}
|
|
1330
1381
|
}
|
|
1331
1382
|
}
|
|
@@ -1335,20 +1386,24 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1335
1386
|
|
|
1336
1387
|
log.debug("{s}: complete", .{@tagName(context.caller)});
|
|
1337
1388
|
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1389
|
+
switch (context.caller) {
|
|
1390
|
+
.format => {},
|
|
1391
|
+
.open => {
|
|
1392
|
+
assert(!superblock.opened);
|
|
1393
|
+
superblock.opened = true;
|
|
1341
1394
|
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1395
|
+
if (superblock.working.manifest_size > 0) {
|
|
1396
|
+
assert(superblock.manifest.count > 0);
|
|
1397
|
+
}
|
|
1398
|
+
// TODO Make the FreeSet encoding format not dependant on the word size.
|
|
1399
|
+
if (superblock.working.free_set_size > @sizeOf(usize)) {
|
|
1400
|
+
assert(superblock.free_set.count_acquired() > 0);
|
|
1401
|
+
}
|
|
1402
|
+
},
|
|
1403
|
+
.checkpoint, .view_change => {
|
|
1404
|
+
assert(meta.eql(superblock.staging.vsr_state, context.vsr_state.?));
|
|
1405
|
+
assert(meta.eql(superblock.working.vsr_state, context.vsr_state.?));
|
|
1406
|
+
},
|
|
1352
1407
|
}
|
|
1353
1408
|
|
|
1354
1409
|
const queue_tail = superblock.queue_tail;
|
|
@@ -1364,40 +1419,6 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1364
1419
|
assert(offset + size <= superblock.storage_offset + superblock.storage_size);
|
|
1365
1420
|
}
|
|
1366
1421
|
|
|
1367
|
-
fn offset_manifest(copy: u8, sequence: u64) u64 {
|
|
1368
|
-
assert(copy >= starting_copy_for_sequence(sequence));
|
|
1369
|
-
assert(copy <= stopping_copy_for_sequence(sequence));
|
|
1370
|
-
|
|
1371
|
-
return superblock_size * copy + @sizeOf(SuperBlockSector);
|
|
1372
|
-
}
|
|
1373
|
-
|
|
1374
|
-
fn offset_free_set(copy: u8, sequence: u64) u64 {
|
|
1375
|
-
assert(copy >= starting_copy_for_sequence(sequence));
|
|
1376
|
-
assert(copy <= stopping_copy_for_sequence(sequence));
|
|
1377
|
-
|
|
1378
|
-
return superblock_size * copy + @sizeOf(SuperBlockSector) +
|
|
1379
|
-
superblock_trailer_manifest_size_max;
|
|
1380
|
-
}
|
|
1381
|
-
|
|
1382
|
-
fn offset_client_table(copy: u8, sequence: u64) u64 {
|
|
1383
|
-
assert(copy >= starting_copy_for_sequence(sequence));
|
|
1384
|
-
assert(copy <= stopping_copy_for_sequence(sequence));
|
|
1385
|
-
|
|
1386
|
-
return superblock_size * copy + @sizeOf(SuperBlockSector) +
|
|
1387
|
-
superblock_trailer_manifest_size_max +
|
|
1388
|
-
superblock_trailer_free_set_size_max;
|
|
1389
|
-
}
|
|
1390
|
-
|
|
1391
|
-
/// Returns the first copy index (inclusive) to be written for a sequence number.
|
|
1392
|
-
fn starting_copy_for_sequence(sequence: u64) u8 {
|
|
1393
|
-
return config.superblock_copies * @intCast(u8, sequence % 2);
|
|
1394
|
-
}
|
|
1395
|
-
|
|
1396
|
-
/// Returns the last copy index (inclusive) to be written for a sequence number.
|
|
1397
|
-
fn stopping_copy_for_sequence(sequence: u64) u8 {
|
|
1398
|
-
return starting_copy_for_sequence(sequence) + config.superblock_copies - 1;
|
|
1399
|
-
}
|
|
1400
|
-
|
|
1401
1422
|
/// We use flexible quorums for even quorums with write quorum > read quorum, for example:
|
|
1402
1423
|
/// * When writing, we must verify that at least 3/4 copies were written.
|
|
1403
1424
|
/// * At startup, we must verify that at least 2/4 copies were read.
|
|
@@ -1428,217 +1449,25 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1428
1449
|
};
|
|
1429
1450
|
}
|
|
1430
1451
|
|
|
1431
|
-
const
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
valid: bool = false,
|
|
1436
|
-
};
|
|
1437
|
-
|
|
1438
|
-
const QuorumCount = std.StaticBitSet(superblock_copies_max);
|
|
1439
|
-
|
|
1440
|
-
array: [superblock_copies_max]Quorum = undefined,
|
|
1441
|
-
count: u8 = 0,
|
|
1442
|
-
|
|
1443
|
-
pub const Error = error{
|
|
1444
|
-
NotFound,
|
|
1445
|
-
QuorumLost,
|
|
1446
|
-
ParentNotFound,
|
|
1447
|
-
ParentQuorumLost,
|
|
1448
|
-
SequenceNotMonotonic,
|
|
1449
|
-
VSRStateNotMonotonic,
|
|
1450
|
-
};
|
|
1451
|
-
|
|
1452
|
-
/// Returns the working superblock according to the quorum with the highest sequence number.
|
|
1453
|
-
/// Verifies that the highest quorum is connected, that the previous quorum was not lost.
|
|
1454
|
-
/// i.e. Both the working and previous quorum must be valid and intact and connected.
|
|
1455
|
-
/// Otherwise, we might regress to a previous working superblock.
|
|
1456
|
-
pub fn working(
|
|
1457
|
-
quorums: *Quorums,
|
|
1458
|
-
copies: []SuperBlockSector,
|
|
1459
|
-
threshold: u8,
|
|
1460
|
-
) Error!*const SuperBlockSector {
|
|
1461
|
-
assert(copies.len == superblock_copies_max);
|
|
1462
|
-
assert(threshold >= 2 and threshold <= 5);
|
|
1463
|
-
|
|
1464
|
-
quorums.array = undefined;
|
|
1465
|
-
quorums.count = 0;
|
|
1466
|
-
|
|
1467
|
-
for (copies) |*copy, index| quorums.count_copy(copy, index, threshold);
|
|
1468
|
-
|
|
1469
|
-
std.sort.sort(Quorum, quorums.slice(), {}, sort_priority_descending);
|
|
1470
|
-
|
|
1471
|
-
for (quorums.slice()) |quorum| {
|
|
1472
|
-
if (quorum.count.count() == config.superblock_copies) {
|
|
1473
|
-
log.debug("quorum: checksum={x} parent={x} sequence={} count={} valid={}", .{
|
|
1474
|
-
quorum.sector.checksum,
|
|
1475
|
-
quorum.sector.parent,
|
|
1476
|
-
quorum.sector.sequence,
|
|
1477
|
-
quorum.count.count(),
|
|
1478
|
-
quorum.valid,
|
|
1479
|
-
});
|
|
1480
|
-
} else {
|
|
1481
|
-
log.err("quorum: checksum={x} parent={x} sequence={} count={} valid={}", .{
|
|
1482
|
-
quorum.sector.checksum,
|
|
1483
|
-
quorum.sector.parent,
|
|
1484
|
-
quorum.sector.sequence,
|
|
1485
|
-
quorum.count.count(),
|
|
1486
|
-
quorum.valid,
|
|
1487
|
-
});
|
|
1488
|
-
}
|
|
1489
|
-
}
|
|
1490
|
-
|
|
1491
|
-
// No working copies of any sequence number exist in the superblock storage zone at all.
|
|
1492
|
-
if (quorums.slice().len == 0) return error.NotFound;
|
|
1493
|
-
|
|
1494
|
-
// At least one copy or quorum exists.
|
|
1495
|
-
const b = quorums.slice()[0];
|
|
1496
|
-
|
|
1497
|
-
// Verify that the remaining quorums are correctly sorted:
|
|
1498
|
-
for (quorums.slice()[1..]) |a| {
|
|
1499
|
-
assert(sort_priority_descending({}, b, a));
|
|
1500
|
-
assert(a.sector.magic == .superblock);
|
|
1501
|
-
assert(a.sector.valid_checksum());
|
|
1502
|
-
}
|
|
1503
|
-
|
|
1504
|
-
// Even the best copy with the most quorum still has inadequate quorum.
|
|
1505
|
-
if (!b.valid) return error.QuorumLost;
|
|
1506
|
-
|
|
1507
|
-
// The superblock is only partially formatted, not all copies were written.
|
|
1508
|
-
if (b.sector.sequence < 2) return error.NotFound;
|
|
1509
|
-
|
|
1510
|
-
// Verify that the parent copy exists:
|
|
1511
|
-
for (quorums.slice()[1..]) |a| {
|
|
1512
|
-
if (a.sector.cluster != b.sector.cluster) {
|
|
1513
|
-
log.err("superblock copy={} has cluster={} instead of {}", .{
|
|
1514
|
-
a.sector.copy,
|
|
1515
|
-
a.sector.cluster,
|
|
1516
|
-
b.sector.cluster,
|
|
1517
|
-
});
|
|
1518
|
-
} else if (a.sector.replica != b.sector.replica) {
|
|
1519
|
-
log.err("superblock copy={} has replica={} instead of {}", .{
|
|
1520
|
-
a.sector.copy,
|
|
1521
|
-
a.sector.replica,
|
|
1522
|
-
b.sector.replica,
|
|
1523
|
-
});
|
|
1524
|
-
} else if (a.sector.checksum == b.sector.parent) {
|
|
1525
|
-
assert(a.sector.checksum != b.sector.checksum);
|
|
1526
|
-
assert(a.sector.cluster == b.sector.cluster);
|
|
1527
|
-
assert(a.sector.replica == b.sector.replica);
|
|
1528
|
-
|
|
1529
|
-
if (!a.valid) {
|
|
1530
|
-
return error.ParentQuorumLost;
|
|
1531
|
-
} else if (a.sector.sequence >= b.sector.sequence) {
|
|
1532
|
-
return error.SequenceNotMonotonic;
|
|
1533
|
-
} else if (a.sector.sequence % 2 == b.sector.sequence % 2) {
|
|
1534
|
-
// The parent must reside in the alternate copy to guarantee that we are able to
|
|
1535
|
-
// detect when the working quorum is lost.
|
|
1536
|
-
return error.SequenceNotMonotonic;
|
|
1537
|
-
} else if (!a.sector.vsr_state.monotonic(b.sector.vsr_state)) {
|
|
1538
|
-
return error.VSRStateNotMonotonic;
|
|
1539
|
-
} else {
|
|
1540
|
-
assert(b.sector.magic == .superblock);
|
|
1541
|
-
assert(b.sector.valid_checksum());
|
|
1542
|
-
|
|
1543
|
-
return b.sector;
|
|
1544
|
-
}
|
|
1545
|
-
}
|
|
1546
|
-
}
|
|
1547
|
-
|
|
1548
|
-
return error.ParentNotFound;
|
|
1549
|
-
}
|
|
1550
|
-
|
|
1551
|
-
fn count_copy(
|
|
1552
|
-
quorums: *Quorums,
|
|
1553
|
-
copy: *const SuperBlockSector,
|
|
1554
|
-
index: usize,
|
|
1555
|
-
threshold: u8,
|
|
1556
|
-
) void {
|
|
1557
|
-
assert(index < superblock_copies_max);
|
|
1558
|
-
assert(threshold >= 2 and threshold <= 5);
|
|
1559
|
-
|
|
1560
|
-
if (!copy.valid_checksum()) {
|
|
1561
|
-
log.debug("copy: {}/{}: invalid checksum", .{ index, superblock_copies_max });
|
|
1562
|
-
return;
|
|
1563
|
-
}
|
|
1564
|
-
|
|
1565
|
-
if (copy.magic != .superblock) {
|
|
1566
|
-
log.debug("copy: {}/{}: not a superblock", .{ index, superblock_copies_max });
|
|
1567
|
-
return;
|
|
1568
|
-
}
|
|
1569
|
-
|
|
1570
|
-
if (copy.copy == index) {
|
|
1571
|
-
log.debug("copy: {}/{}: checksum={x} parent={x} sequence={}", .{
|
|
1572
|
-
index,
|
|
1573
|
-
superblock_copies_max,
|
|
1574
|
-
copy.checksum,
|
|
1575
|
-
copy.parent,
|
|
1576
|
-
copy.sequence,
|
|
1577
|
-
});
|
|
1578
|
-
} else {
|
|
1579
|
-
// If our read was misdirected, we definitely still want to count the copy.
|
|
1580
|
-
// We must just be careful to count it idempotently.
|
|
1581
|
-
log.err(
|
|
1582
|
-
"copy: {}/{}: checksum={x} parent={x} sequence={} misdirected from copy={}",
|
|
1583
|
-
.{
|
|
1584
|
-
index,
|
|
1585
|
-
superblock_copies_max,
|
|
1586
|
-
copy.checksum,
|
|
1587
|
-
copy.parent,
|
|
1588
|
-
copy.sequence,
|
|
1589
|
-
copy.copy,
|
|
1590
|
-
},
|
|
1591
|
-
);
|
|
1592
|
-
}
|
|
1593
|
-
|
|
1594
|
-
var quorum = quorums.find_or_insert_quorum_for_copy(copy);
|
|
1595
|
-
assert(quorum.sector.checksum == copy.checksum);
|
|
1596
|
-
assert(quorum.sector.equal(copy));
|
|
1597
|
-
|
|
1598
|
-
quorum.count.set(copy.copy);
|
|
1599
|
-
assert(quorum.count.isSet(copy.copy));
|
|
1600
|
-
|
|
1601
|
-
// In the worst case, all copies may contain divergent forks of the same sequence.
|
|
1602
|
-
// However, this should not happen for the same checksum.
|
|
1603
|
-
assert(quorum.count.count() <= config.superblock_copies);
|
|
1604
|
-
|
|
1605
|
-
quorum.valid = quorum.count.count() >= threshold;
|
|
1452
|
+
pub const Layout = struct {
|
|
1453
|
+
pub fn offset_sector(copy: u8) u64 {
|
|
1454
|
+
assert(copy < config.superblock_copies);
|
|
1455
|
+
return superblock_copy_size * @as(u64, copy);
|
|
1606
1456
|
}
|
|
1607
1457
|
|
|
1608
|
-
fn
|
|
1609
|
-
assert(copy
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
for (quorums.array[0..quorums.count]) |*quorum| {
|
|
1613
|
-
if (copy.checksum == quorum.sector.checksum) return quorum;
|
|
1614
|
-
} else {
|
|
1615
|
-
quorums.array[quorums.count] = Quorum{ .sector = copy };
|
|
1616
|
-
quorums.count += 1;
|
|
1617
|
-
|
|
1618
|
-
return &quorums.array[quorums.count - 1];
|
|
1619
|
-
}
|
|
1458
|
+
pub fn offset_manifest(copy: u8) u64 {
|
|
1459
|
+
assert(copy < config.superblock_copies);
|
|
1460
|
+
return offset_sector(copy) + @sizeOf(SuperBlockSector);
|
|
1620
1461
|
}
|
|
1621
1462
|
|
|
1622
|
-
fn
|
|
1623
|
-
|
|
1463
|
+
pub fn offset_free_set(copy: u8) u64 {
|
|
1464
|
+
assert(copy < config.superblock_copies);
|
|
1465
|
+
return offset_manifest(copy) + superblock_trailer_manifest_size_max;
|
|
1624
1466
|
}
|
|
1625
1467
|
|
|
1626
|
-
fn
|
|
1627
|
-
assert(
|
|
1628
|
-
|
|
1629
|
-
assert(b.sector.magic == .superblock);
|
|
1630
|
-
|
|
1631
|
-
if (a.valid and !b.valid) return true;
|
|
1632
|
-
if (b.valid and !a.valid) return false;
|
|
1633
|
-
|
|
1634
|
-
if (a.sector.sequence > b.sector.sequence) return true;
|
|
1635
|
-
if (b.sector.sequence > a.sector.sequence) return false;
|
|
1636
|
-
|
|
1637
|
-
if (a.count.count() > b.count.count()) return true;
|
|
1638
|
-
if (b.count.count() > a.count.count()) return false;
|
|
1639
|
-
|
|
1640
|
-
// The sort order must be stable and deterministic:
|
|
1641
|
-
return a.sector.checksum > b.sector.checksum;
|
|
1468
|
+
pub fn offset_client_table(copy: u8) u64 {
|
|
1469
|
+
assert(copy < config.superblock_copies);
|
|
1470
|
+
return offset_free_set(copy) + superblock_trailer_free_set_size_max;
|
|
1642
1471
|
}
|
|
1643
1472
|
};
|
|
1644
1473
|
|
|
@@ -1657,105 +1486,3 @@ test "SuperBlockSector" {
|
|
|
1657
1486
|
a.replica += 1;
|
|
1658
1487
|
try expect(!a.valid_checksum());
|
|
1659
1488
|
}
|
|
1660
|
-
|
|
1661
|
-
// TODO Add unit tests for Quorums.
|
|
1662
|
-
// TODO Test invariants and transitions across TestRunner functions.
|
|
1663
|
-
const TestStorage = @import("../test/storage.zig").Storage;
|
|
1664
|
-
const TestSuperBlock = SuperBlockType(TestStorage);
|
|
1665
|
-
|
|
1666
|
-
const TestRunner = struct {
|
|
1667
|
-
superblock: *TestSuperBlock,
|
|
1668
|
-
context_format: TestSuperBlock.Context = undefined,
|
|
1669
|
-
context_open: TestSuperBlock.Context = undefined,
|
|
1670
|
-
context_checkpoint: TestSuperBlock.Context = undefined,
|
|
1671
|
-
context_view_change: TestSuperBlock.Context = undefined,
|
|
1672
|
-
pending: usize = 0,
|
|
1673
|
-
|
|
1674
|
-
fn format(runner: *TestRunner, options: TestSuperBlock.FormatOptions) void {
|
|
1675
|
-
runner.pending += 1;
|
|
1676
|
-
runner.superblock.format(format_callback, &runner.context_format, options);
|
|
1677
|
-
}
|
|
1678
|
-
|
|
1679
|
-
fn format_callback(context: *TestSuperBlock.Context) void {
|
|
1680
|
-
const runner = @fieldParentPtr(TestRunner, "context_format", context);
|
|
1681
|
-
runner.pending -= 1;
|
|
1682
|
-
runner.open();
|
|
1683
|
-
}
|
|
1684
|
-
|
|
1685
|
-
fn open(runner: *TestRunner) void {
|
|
1686
|
-
runner.pending += 1;
|
|
1687
|
-
runner.superblock.open(open_callback, &runner.context_open);
|
|
1688
|
-
}
|
|
1689
|
-
|
|
1690
|
-
fn open_callback(context: *TestSuperBlock.Context) void {
|
|
1691
|
-
const runner = @fieldParentPtr(TestRunner, "context_open", context);
|
|
1692
|
-
runner.pending -= 1;
|
|
1693
|
-
runner.checkpoint();
|
|
1694
|
-
runner.view_change();
|
|
1695
|
-
}
|
|
1696
|
-
|
|
1697
|
-
fn view_change(runner: *TestRunner) void {
|
|
1698
|
-
runner.pending += 1;
|
|
1699
|
-
runner.superblock.view_change(
|
|
1700
|
-
view_change_callback,
|
|
1701
|
-
&runner.context_view_change,
|
|
1702
|
-
.{
|
|
1703
|
-
.commit_min = runner.superblock.working.vsr_state.commit_min + 1,
|
|
1704
|
-
.commit_max = runner.superblock.working.vsr_state.commit_max + 2,
|
|
1705
|
-
.view_normal = runner.superblock.working.vsr_state.view_normal + 3,
|
|
1706
|
-
.view = runner.superblock.working.vsr_state.view + 4,
|
|
1707
|
-
},
|
|
1708
|
-
);
|
|
1709
|
-
}
|
|
1710
|
-
|
|
1711
|
-
fn view_change_callback(context: *TestSuperBlock.Context) void {
|
|
1712
|
-
const runner = @fieldParentPtr(TestRunner, "context_view_change", context);
|
|
1713
|
-
runner.pending -= 1;
|
|
1714
|
-
runner.checkpoint();
|
|
1715
|
-
}
|
|
1716
|
-
|
|
1717
|
-
fn checkpoint(runner: *TestRunner) void {
|
|
1718
|
-
runner.pending += 1;
|
|
1719
|
-
runner.superblock.checkpoint(checkpoint_callback, &runner.context_checkpoint);
|
|
1720
|
-
}
|
|
1721
|
-
|
|
1722
|
-
fn checkpoint_callback(context: *TestSuperBlock.Context) void {
|
|
1723
|
-
const runner = @fieldParentPtr(TestRunner, "context_checkpoint", context);
|
|
1724
|
-
runner.pending -= 1;
|
|
1725
|
-
}
|
|
1726
|
-
};
|
|
1727
|
-
|
|
1728
|
-
test "SuperBlock" {
|
|
1729
|
-
const cluster = 32;
|
|
1730
|
-
const replica = 4;
|
|
1731
|
-
const size_max = data_file_size_min;
|
|
1732
|
-
|
|
1733
|
-
var storage = try TestStorage.init(std.testing.allocator, superblock_zone_size, .{
|
|
1734
|
-
.seed = 0,
|
|
1735
|
-
.read_latency_min = 1,
|
|
1736
|
-
.read_latency_mean = 1,
|
|
1737
|
-
.write_latency_min = 1,
|
|
1738
|
-
.write_latency_mean = 1,
|
|
1739
|
-
.read_fault_probability = 0,
|
|
1740
|
-
.write_fault_probability = 0,
|
|
1741
|
-
}, replica, .{
|
|
1742
|
-
.first_offset = superblock_zone_size,
|
|
1743
|
-
.period = 1,
|
|
1744
|
-
});
|
|
1745
|
-
defer storage.deinit(std.testing.allocator);
|
|
1746
|
-
|
|
1747
|
-
var message_pool = try MessagePool.init(std.testing.allocator, .replica);
|
|
1748
|
-
defer message_pool.deinit(std.testing.allocator);
|
|
1749
|
-
|
|
1750
|
-
var superblock = try TestSuperBlock.init(std.testing.allocator, &storage, &message_pool);
|
|
1751
|
-
defer superblock.deinit(std.testing.allocator);
|
|
1752
|
-
|
|
1753
|
-
var runner = TestRunner{ .superblock = &superblock };
|
|
1754
|
-
runner.format(.{
|
|
1755
|
-
.cluster = cluster,
|
|
1756
|
-
.replica = replica,
|
|
1757
|
-
.size_max = size_max,
|
|
1758
|
-
});
|
|
1759
|
-
|
|
1760
|
-
while (runner.pending > 0) storage.tick();
|
|
1761
|
-
}
|