tigerbeetle-node 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/README.md +3 -2
  2. package/dist/index.d.ts +66 -61
  3. package/dist/index.js +66 -61
  4. package/dist/index.js.map +1 -1
  5. package/package.json +1 -1
  6. package/src/index.ts +5 -0
  7. package/src/node.zig +17 -18
  8. package/src/tigerbeetle/scripts/benchmark.bat +4 -3
  9. package/src/tigerbeetle/scripts/benchmark.sh +25 -10
  10. package/src/tigerbeetle/scripts/install.sh +2 -1
  11. package/src/tigerbeetle/scripts/install_zig.sh +14 -18
  12. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
  13. package/src/tigerbeetle/scripts/vopr.sh +5 -5
  14. package/src/tigerbeetle/src/benchmark.zig +17 -9
  15. package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
  16. package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
  17. package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
  18. package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
  19. package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
  20. package/src/tigerbeetle/src/c/tb_client/thread.zig +329 -0
  21. package/src/tigerbeetle/src/c/tb_client.h +201 -0
  22. package/src/tigerbeetle/src/c/tb_client.zig +101 -0
  23. package/src/tigerbeetle/src/c/test.zig +1 -0
  24. package/src/tigerbeetle/src/cli.zig +142 -83
  25. package/src/tigerbeetle/src/config.zig +119 -10
  26. package/src/tigerbeetle/src/demo.zig +12 -8
  27. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
  28. package/src/tigerbeetle/src/ewah.zig +318 -0
  29. package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
  30. package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
  31. package/src/tigerbeetle/src/fifo.zig +17 -1
  32. package/src/tigerbeetle/src/io/darwin.zig +12 -10
  33. package/src/tigerbeetle/src/io/linux.zig +25 -9
  34. package/src/tigerbeetle/src/io/windows.zig +13 -9
  35. package/src/tigerbeetle/src/iops.zig +101 -0
  36. package/src/tigerbeetle/src/lsm/binary_search.zig +214 -0
  37. package/src/tigerbeetle/src/lsm/bloom_filter.zig +82 -0
  38. package/src/tigerbeetle/src/lsm/compaction.zig +603 -0
  39. package/src/tigerbeetle/src/lsm/composite_key.zig +75 -0
  40. package/src/tigerbeetle/src/lsm/direction.zig +11 -0
  41. package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
  42. package/src/tigerbeetle/src/lsm/forest.zig +630 -0
  43. package/src/tigerbeetle/src/lsm/grid.zig +473 -0
  44. package/src/tigerbeetle/src/lsm/groove.zig +939 -0
  45. package/src/tigerbeetle/src/lsm/k_way_merge.zig +452 -0
  46. package/src/tigerbeetle/src/lsm/level_iterator.zig +296 -0
  47. package/src/tigerbeetle/src/lsm/manifest.zig +680 -0
  48. package/src/tigerbeetle/src/lsm/manifest_level.zig +1169 -0
  49. package/src/tigerbeetle/src/lsm/manifest_log.zig +904 -0
  50. package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
  51. package/src/tigerbeetle/src/lsm/posted_groove.zig +399 -0
  52. package/src/tigerbeetle/src/lsm/segmented_array.zig +998 -0
  53. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +844 -0
  54. package/src/tigerbeetle/src/lsm/table.zig +932 -0
  55. package/src/tigerbeetle/src/lsm/table_immutable.zig +196 -0
  56. package/src/tigerbeetle/src/lsm/table_iterator.zig +295 -0
  57. package/src/tigerbeetle/src/lsm/table_mutable.zig +123 -0
  58. package/src/tigerbeetle/src/lsm/test.zig +429 -0
  59. package/src/tigerbeetle/src/lsm/tree.zig +1085 -0
  60. package/src/tigerbeetle/src/main.zig +119 -109
  61. package/src/tigerbeetle/src/message_bus.zig +49 -48
  62. package/src/tigerbeetle/src/message_pool.zig +15 -2
  63. package/src/tigerbeetle/src/ring_buffer.zig +126 -30
  64. package/src/tigerbeetle/src/simulator.zig +76 -44
  65. package/src/tigerbeetle/src/state_machine.zig +1022 -585
  66. package/src/tigerbeetle/src/storage.zig +46 -16
  67. package/src/tigerbeetle/src/test/cluster.zig +109 -63
  68. package/src/tigerbeetle/src/test/message_bus.zig +15 -24
  69. package/src/tigerbeetle/src/test/network.zig +26 -17
  70. package/src/tigerbeetle/src/test/state_checker.zig +7 -5
  71. package/src/tigerbeetle/src/test/state_machine.zig +159 -69
  72. package/src/tigerbeetle/src/test/storage.zig +57 -28
  73. package/src/tigerbeetle/src/tigerbeetle.zig +5 -0
  74. package/src/tigerbeetle/src/unit_tests.zig +8 -0
  75. package/src/tigerbeetle/src/util.zig +51 -0
  76. package/src/tigerbeetle/src/vsr/client.zig +21 -7
  77. package/src/tigerbeetle/src/vsr/journal.zig +154 -167
  78. package/src/tigerbeetle/src/vsr/replica.zig +744 -226
  79. package/src/tigerbeetle/src/vsr/superblock.zig +1743 -0
  80. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +258 -0
  81. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
  82. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +546 -0
  83. package/src/tigerbeetle/src/vsr.zig +43 -115
@@ -0,0 +1,1743 @@
1
+ const std = @import("std");
2
+ const assert = std.debug.assert;
3
+ const crypto = std.crypto;
4
+ const mem = std.mem;
5
+ const meta = std.meta;
6
+ const os = std.os;
7
+
8
+ const config = @import("../config.zig");
9
+ const div_ceil = @import("../util.zig").div_ceil;
10
+ const vsr = @import("../vsr.zig");
11
+ const log = std.log.scoped(.superblock);
12
+
13
+ const MessagePool = @import("../message_pool.zig").MessagePool;
14
+
15
+ pub const SuperBlockManifest = @import("superblock_manifest.zig").Manifest;
16
+ pub const SuperBlockFreeSet = @import("superblock_free_set.zig").FreeSet;
17
+ pub const SuperBlockClientTable = @import("superblock_client_table.zig").ClientTable;
18
+
19
/// Identifies the type of a sector or block. Protects against misdirected I/O across valid types.
/// Backed by u8 so the tag occupies a single byte.
pub const Magic = enum(u8) {
    superblock,
    manifest,
    prepare,
    index,
    filter,
    data,
};
28
+
29
/// The version of the superblock format, reserved for major breaking changes
/// (mirrors the `version` field of SuperBlockSector).
pub const SuperBlockVersion: u8 = 0;
30
+
31
// Fields are aligned to work as an extern or packed struct.
pub const SuperBlockSector = extern struct {
    /// Covers every field except `checksum` itself and `copy` (see calculate_checksum()).
    checksum: u128 = undefined,

    /// Protects against misdirected reads at startup.
    /// For example, if multiple reads are all misdirected to a single copy of the superblock.
    /// Excluded from the checksum calculation to ensure that all copies have the same checksum.
    /// This simplifies writing and comparing multiple copies.
    copy: u8 = 0,

    /// Protects against misdirected I/O for non-superblock sectors that have a valid checksum.
    magic: Magic,

    /// The version of the superblock format in use, reserved for major breaking changes.
    version: u8,

    /// Protects against writing to or reading from the wrong data file.
    replica: u8,
    cluster: u32,

    /// The current size of the data file.
    size: u64,

    /// The maximum size of the data file.
    size_max: u64,

    /// A monotonically increasing counter to locate the latest superblock at startup.
    sequence: u64,

    /// The checksum of the previous superblock to hash chain across sequence numbers.
    parent: u128,

    /// The checksum over the manifest block references in the superblock trailer.
    manifest_checksum: u128,

    /// The checksum over the actual encoded block free set in the superblock trailer.
    free_set_checksum: u128,

    /// The checksum over the client table entries in the superblock trailer.
    client_table_checksum: u128,

    /// State stored on stable storage for the Viewstamped Replication consensus protocol.
    vsr_state: VSRState,

    /// Reserved for future minor features (e.g. changing the compression algorithm of the trailer).
    flags: u64 = 0,

    /// A listing of persistent read snapshots that have been issued to clients.
    /// A snapshot.created timestamp of 0 indicates that the snapshot is null.
    snapshots: [config.lsm_snapshots_max]Snapshot,

    /// The size of the manifest block references stored in the superblock trailer.
    /// The block addresses and checksums in this section of the trailer are laid out as follows:
    /// [manifest_size / (16 + 8 + 1)]u128 checksum
    /// [manifest_size / (16 + 8 + 1)]u64 address
    /// [manifest_size / (16 + 8 + 1)]u8 tree
    manifest_size: u32,

    /// The size of the block free set stored in the superblock trailer.
    free_set_size: u32,

    /// The size of the client table entries stored in the superblock trailer.
    client_table_size: u32,

    /// Pads the sector out to config.sector_size; must remain all zeroes (asserted below).
    reserved: [3160]u8 = [_]u8{0} ** 3160,

    pub const VSRState = extern struct {
        /// The last operation committed to the state machine. At startup, replay the log hereafter.
        commit_min: u64,

        /// The highest operation up to which we may commit.
        commit_max: u64,

        /// The last view in which the replica's status was normal.
        view_normal: u32,

        /// The view number of the replica.
        view: u32,

        comptime {
            assert(@sizeOf(VSRState) == 24);
        }

        /// Returns whether the fields satisfy their mutual invariants:
        /// commit_max never lags commit_min, and view never lags view_normal.
        pub fn internally_consistent(state: VSRState) bool {
            return state.commit_max >= state.commit_min and state.view >= state.view_normal;
        }

        /// Returns whether `new` advances (or equals) `old` in every field.
        pub fn monotonic(old: VSRState, new: VSRState) bool {
            assert(old.internally_consistent());
            assert(new.internally_consistent());

            if (old.view > new.view) return false;
            if (old.view_normal > new.view_normal) return false;
            if (old.commit_min > new.commit_min) return false;
            if (old.commit_max > new.commit_max) return false;

            return true;
        }

        /// Returns whether `new` actually differs from `old`.
        /// Asserts that the transition would be monotonic.
        pub fn would_be_updated_by(old: VSRState, new: VSRState) bool {
            assert(monotonic(old, new));

            return !meta.eql(old, new);
        }

        /// Overwrites `state` with `new`; asserts that this is a genuine, monotonic change.
        pub fn update(state: *VSRState, new: VSRState) void {
            assert(state.would_be_updated_by(new));
            state.* = new;
        }
    };

    pub const Snapshot = extern struct {
        /// A creation timestamp of 0 indicates that the snapshot is null.
        created: u64,

        /// When a read query last used the snapshot.
        queried: u64,

        /// Snapshots may auto-expire after a timeout of inactivity.
        /// A timeout of 0 indicates that the snapshot must be explicitly released by the user.
        timeout: u64,

        /// Returns whether this snapshot slot is occupied (created != 0).
        /// A null slot must be entirely zeroed (asserted).
        pub fn exists(snapshot: Snapshot) bool {
            if (snapshot.created == 0) {
                assert(snapshot.queried == 0);
                assert(snapshot.timeout == 0);

                return false;
            } else {
                return true;
            }
        }

        comptime {
            assert(@sizeOf(Snapshot) == 24);
        }
    };

    comptime {
        assert(@sizeOf(SuperBlockSector) == config.sector_size);
    }

    /// Computes the checksum over all bytes of the sector except { checksum, copy },
    /// which therefore must be the first two fields (asserted at compile time).
    pub fn calculate_checksum(superblock: *const SuperBlockSector) u128 {
        comptime assert(meta.fieldIndex(SuperBlockSector, "checksum") == 0);
        comptime assert(meta.fieldIndex(SuperBlockSector, "copy") == 1);

        const checksum_size = @sizeOf(@TypeOf(superblock.checksum));
        comptime assert(checksum_size == @sizeOf(u128));

        const copy_size = @sizeOf(@TypeOf(superblock.copy));
        comptime assert(copy_size == 1);

        const ignore_size = checksum_size + copy_size;

        return vsr.checksum(std.mem.asBytes(superblock)[ignore_size..]);
    }

    /// Stamps the sector's checksum. Asserts basic well-formedness first
    /// (valid copy index, magic, version, zero flags and reserved padding).
    pub fn set_checksum(superblock: *SuperBlockSector) void {
        assert(superblock.copy < superblock_copies_max);
        assert(superblock.magic == .superblock);
        assert(superblock.version == SuperBlockVersion);
        assert(superblock.flags == 0);

        for (mem.bytesAsSlice(u64, &superblock.reserved)) |word| assert(word == 0);

        superblock.checksum = superblock.calculate_checksum();
    }

    /// Returns whether the stored checksum matches a fresh recomputation.
    pub fn valid_checksum(superblock: *const SuperBlockSector) bool {
        return superblock.checksum == superblock.calculate_checksum();
    }

    /// Does not consider { checksum, copy } when comparing equality.
    pub fn equal(a: *const SuperBlockSector, b: *const SuperBlockSector) bool {
        assert(a.magic == .superblock);
        assert(b.magic == .superblock);

        if (a.version != b.version) return false;
        if (a.replica != b.replica) return false;
        if (a.cluster != b.cluster) return false;
        if (a.size != b.size) return false;
        if (a.size_max != b.size_max) return false;
        if (a.sequence != b.sequence) return false;
        if (a.parent != b.parent) return false;
        if (a.manifest_checksum != b.manifest_checksum) return false;
        if (a.free_set_checksum != b.free_set_checksum) return false;
        if (a.client_table_checksum != b.client_table_checksum) return false;
        if (!meta.eql(a.vsr_state, b.vsr_state)) return false;
        if (a.flags != b.flags) return false;
        if (!meta.eql(a.snapshots, b.snapshots)) return false;
        if (a.manifest_size != b.manifest_size) return false;
        if (a.free_set_size != b.free_set_size) return false;
        // Fix: this comparison was previously omitted, so two sectors could compare
        // equal while disagreeing on the length of the client table trailer.
        if (a.client_table_size != b.client_table_size) return false;

        for (mem.bytesAsSlice(u64, &a.reserved)) |word| assert(word == 0);
        for (mem.bytesAsSlice(u64, &b.reserved)) |word| assert(word == 0);

        return true;
    }
};
230
+
231
// Restrict the number of copies per copy set to values that work with flexible quorums.
comptime {
    switch (config.superblock_copies) {
        4, 6, 8 => {},
        else => @compileError("superblock_copies must be either { 4, 6, 8 } for flexible quorums."),
    }
}
237
+
238
/// The size of the entire superblock storage zone.
pub const superblock_zone_size = superblock_size * superblock_copies_max;

/// A single set of copies (a copy set) consists of config.superblock_copies of a superblock.
/// At least two copy sets are required for copy-on-write in order not to impair existing copies.
///
/// However, when writing only the superblock sector for a view change, we do update-in-place,
/// which is necessary as we need to continue to reference the existing superblock trailer to
/// decouple view changes from checkpoints, to not force an untimely checkpoint ahead of schedule.
pub const superblock_copies_max = config.superblock_copies * 2;
248
+
249
/// The size of an individual superblock including trailer.
pub const superblock_size = @sizeOf(SuperBlockSector) + superblock_trailer_size_max;
comptime {
    // Each superblock copy must be a whole number of sectors for direct I/O.
    assert(superblock_size % config.sector_size == 0);
}
254
+
255
/// The maximum possible size of the superblock trailer, following the superblock sector.
/// The trailer holds three sections, in order: manifest, free set, client table.
pub const superblock_trailer_size_max = blk: {
    // To calculate the size of the superblock trailer we need to know:
    // 1. the maximum number of manifest blocks that should be able to be referenced, and
    // 2. the maximum possible size of the EWAH-compressed bit set addressable by the free set.

    assert(superblock_trailer_manifest_size_max > 0);
    assert(superblock_trailer_manifest_size_max % config.sector_size == 0);
    assert(superblock_trailer_manifest_size_max % SuperBlockManifest.BlockReferenceSize == 0);

    assert(superblock_trailer_free_set_size_max > 0);
    assert(superblock_trailer_free_set_size_max % config.sector_size == 0);

    assert(superblock_trailer_client_table_size_max > 0);
    assert(superblock_trailer_client_table_size_max % config.sector_size == 0);

    // We order the smaller manifest section ahead of the block free set for better access locality.
    // For example, it's cheaper to skip over 1 MiB when reading from disk than to skip over 32 MiB.
    break :blk superblock_trailer_manifest_size_max + superblock_trailer_free_set_size_max + superblock_trailer_client_table_size_max;
};
275
+
276
// A manifest block reference of 40 bytes contains a tree hash, checksum, and address.
// These references are stored in struct-of-arrays layout in the trailer for the sake of alignment.
pub const superblock_trailer_manifest_size_max = blk: {
    assert(SuperBlockManifest.BlockReferenceSize == 16 + 16 + 8);

    // Use a multiple of sector * reference so that the size is exactly divisible without padding:
    // For example, this 2.5 MiB manifest trailer == 65536 references == 65536 * 511 or 34m tables.
    break :blk 16 * config.sector_size * SuperBlockManifest.BlockReferenceSize;
};
285
+
286
/// The trailer space reserved for the encoded block free set, rounded up to whole sectors.
pub const superblock_trailer_free_set_size_max = blk: {
    const encode_size_max = SuperBlockFreeSet.encode_size_max(config.block_count_max);
    assert(encode_size_max > 0);

    // Round up to the nearest sector:
    break :blk div_ceil(encode_size_max, config.sector_size) * config.sector_size;
};
293
+
294
/// The trailer space reserved for the encoded client table, rounded up to whole sectors.
pub const superblock_trailer_client_table_size_max = blk: {
    const encode_size_max = SuperBlockClientTable.encode_size_max;
    assert(encode_size_max > 0);

    // Round up to the nearest sector:
    break :blk div_ceil(encode_size_max, config.sector_size) * config.sector_size;
};
301
+
302
/// The minimum size of the data file: the superblock storage zone plus the journal.
/// (The labeled block wrapper was unnecessary for a single expression and has been removed.)
pub const data_file_size_min = superblock_zone_size + config.journal_size_max;
305
+
306
+ pub fn SuperBlockType(comptime Storage: type) type {
307
+ return struct {
308
+ const SuperBlock = @This();
309
+
310
+ pub const Manifest = SuperBlockManifest;
311
+ pub const FreeSet = SuperBlockFreeSet;
312
+ pub const ClientTable = SuperBlockClientTable;
313
+
314
        /// Per-operation state, provided by the caller and owned by the superblock for the
        /// duration of one format/open/checkpoint/view_change call.
        pub const Context = struct {
            pub const Caller = enum {
                format,
                open,
                checkpoint,
                view_change,
            };

            superblock: *SuperBlock,
            callback: fn (context: *Context) void,
            caller: Caller,

            // Intrusive storage I/O state, reused across the writes/reads of one operation.
            write: Storage.Write = undefined,
            read: Storage.Read = undefined,
            // The superblock copy index currently being written or read.
            copy: u8 = undefined,
            // Only meaningful for .view_change callers (set by view_change()).
            vsr_state: SuperBlockSector.VSRState = undefined,
        };
331
+
332
        storage: *Storage,

        /// The first logical offset that may be written to the superblock storage zone.
        storage_offset: u64 = 0,

        /// The total size of the superblock storage zone after this physical offset.
        storage_size: u64 = superblock_zone_size,

        /// The superblock that was recovered at startup after a crash or that was last written.
        working: *align(config.sector_size) SuperBlockSector,

        /// The superblock that will replace the current working superblock once written.
        /// This is used when writing the staging superblock, or when changing views before then.
        /// We cannot mutate any working state directly until it is safely on stable storage.
        /// Otherwise, we may accidentally externalize guarantees that are not yet durable.
        writing: *align(config.sector_size) SuperBlockSector,

        /// The superblock that will be checkpointed next.
        /// This may be updated incrementally several times before the next checkpoint.
        /// For example, to track new snapshots as they are registered.
        staging: *align(config.sector_size) SuperBlockSector,

        /// The copies that we read into at startup or when verifying the written superblock.
        reading: []align(config.sector_size) SuperBlockSector,

        /// It might seem that, at startup, we simply install the copy with the highest sequence.
        ///
        /// However, there's a scenario where:
        /// 1. We are able to write sequence 7 to 3/4 copies, with the last write being lost.
        /// 2. We startup and read all copies, with reads misdirected to the copy with sequence 6.
        ///
        /// Another scenario:
        /// 1. We begin to write sequence 7 to 1 copy and then crash.
        /// 2. At startup, the read to this copy fails, and we recover at sequence=6.
        /// 3. We then checkpoint another sequence 7 to 3/4 copies and crash.
        /// 4. At startup, we then see 4 copies with the same sequence with 1 checksum different.
        ///
        /// To mitigate these scenarios, we ensure that we are able to read a quorum of copies.
        /// This also gives us confidence that our working superblock has sufficient redundancy.
        quorums: Quorums = Quorums{},

        // In-memory representations of the three trailer sections.
        manifest: Manifest,
        free_set: FreeSet,
        client_table: ClientTable,

        // Sector-aligned scratch buffers for encoding/decoding the trailer sections.
        manifest_buffer: []align(config.sector_size) u8,
        free_set_buffer: []align(config.sector_size) u8,
        client_table_buffer: []align(config.sector_size) u8,

        /// Whether the superblock has been opened. An open superblock may not be formatted.
        opened: bool = false,

        /// Beyond formatting and opening of the superblock, which are mutually exclusive of all
        /// other operations, only the following queue combinations are allowed:
        /// 1. A view change may queue on a checkpoint.
        /// 2. A checkpoint may queue on a view change.
        ///
        /// There may only be a single caller queued at a time, to ensure that the VSR protocol is
        /// careful to submit at most one view change at a time.
        queue_head: ?*Context = null,
        queue_tail: ?*Context = null,
393
+
394
        /// Allocates all superblock state: three sector-aligned sector buffers
        /// (working/writing/staging), a read buffer spanning every copy of both copy sets,
        /// the trailer codecs (manifest, free set, client table), and their encode buffers.
        /// Caller owns the result and must call deinit() exactly once.
        pub fn init(
            allocator: mem.Allocator,
            storage: *Storage,
            message_pool: *MessagePool,
        ) !SuperBlock {
            // Sector alignment is required for direct I/O; each errdefer below keeps the
            // error path leak-free as subsequent allocations may fail.
            const a = try allocator.allocAdvanced(SuperBlockSector, config.sector_size, 1, .exact);
            errdefer allocator.free(a);

            const b = try allocator.allocAdvanced(SuperBlockSector, config.sector_size, 1, .exact);
            errdefer allocator.free(b);

            const c = try allocator.allocAdvanced(SuperBlockSector, config.sector_size, 1, .exact);
            errdefer allocator.free(c);

            // One sector per copy across both copy sets (config.superblock_copies * 2).
            const reading = try allocator.allocAdvanced(
                [config.superblock_copies * 2]SuperBlockSector,
                config.sector_size,
                1,
                .exact,
            );
            errdefer allocator.free(reading);

            // Capacity: exactly as many block references as fit in the manifest trailer.
            var manifest = try Manifest.init(
                allocator,
                @divExact(
                    superblock_trailer_manifest_size_max,
                    Manifest.BlockReferenceSize,
                ),
                @import("../lsm/tree.zig").table_count_max,
            );
            errdefer manifest.deinit(allocator);

            var free_set = try FreeSet.init(allocator, config.block_count_max);
            errdefer free_set.deinit(allocator);

            var client_table = try ClientTable.init(allocator, message_pool);
            errdefer client_table.deinit(allocator);

            const manifest_buffer = try allocator.allocAdvanced(
                u8,
                config.sector_size,
                superblock_trailer_manifest_size_max,
                .exact,
            );
            errdefer allocator.free(manifest_buffer);

            const free_set_buffer = try allocator.allocAdvanced(
                u8,
                config.sector_size,
                superblock_trailer_free_set_size_max,
                .exact,
            );
            errdefer allocator.free(free_set_buffer);

            const client_table_buffer = try allocator.allocAdvanced(
                u8,
                config.sector_size,
                superblock_trailer_client_table_size_max,
                .exact,
            );
            errdefer allocator.free(client_table_buffer);

            return SuperBlock{
                .storage = storage,
                .working = &a[0],
                .writing = &b[0],
                .staging = &c[0],
                .reading = &reading[0],
                .manifest = manifest,
                .free_set = free_set,
                .client_table = client_table,
                .manifest_buffer = manifest_buffer,
                .free_set_buffer = free_set_buffer,
                .client_table_buffer = client_table_buffer,
            };
        }
470
+
471
        /// Releases everything allocated by init().
        /// Asserts that no operation is queued (deallocation must not race an in-flight I/O).
        pub fn deinit(superblock: *SuperBlock, allocator: mem.Allocator) void {
            assert(superblock.queue_head == null);
            assert(superblock.queue_tail == null);

            allocator.destroy(superblock.working);
            allocator.destroy(superblock.writing);
            allocator.destroy(superblock.staging);
            allocator.free(superblock.reading);

            superblock.manifest.deinit(allocator);
            superblock.free_set.deinit(allocator);
            superblock.client_table.deinit(allocator);

            allocator.free(superblock.manifest_buffer);
            allocator.free(superblock.free_set_buffer);
            allocator.free(superblock.client_table_buffer);
        }
488
+
489
        /// Parameters for format(): identity of the data file plus its size ceiling.
        pub const FormatOptions = struct {
            cluster: u32,
            replica: u8,

            /// The maximum size of the entire data file.
            size_max: u64,
        };
496
+
497
        /// Formats a new data file: constructs a zeroed sequence=0 working sector (never
        /// written to disk) purely to seed the parent-checksum hash chain, stages sequence=1
        /// on top of it, and submits the write through acquire(). `callback` is invoked on
        /// completion. The superblock must not already be open.
        pub fn format(
            superblock: *SuperBlock,
            callback: fn (context: *Context) void,
            context: *Context,
            options: FormatOptions,
        ) void {
            assert(!superblock.opened);

            assert(options.replica < config.replicas_max);
            // TODO Assert that size_max exceeds the minimum comptime size of storage zones.
            assert(options.size_max > superblock_zone_size);
            assert(options.size_max % config.sector_size == 0);

            // This working copy provides the parent checksum, and will not be written to disk.
            // We therefore use zero values to make this parent checksum as stable as possible.
            superblock.working.* = .{
                .copy = 0,
                .magic = .superblock,
                .version = SuperBlockVersion,
                .sequence = 0,
                .replica = options.replica,
                .cluster = options.cluster,
                .size = 0,
                .size_max = options.size_max,
                .parent = 0,
                .manifest_checksum = 0,
                .free_set_checksum = 0,
                .client_table_checksum = 0,
                .vsr_state = .{
                    .commit_min = 0,
                    .commit_max = 0,
                    .view_normal = 0,
                    .view = 0,
                },
                .snapshots = undefined,
                .manifest_size = 0,
                .free_set_size = 0,
                .client_table_size = 0,
            };

            // Null out every snapshot slot (created == 0 means the slot is unused).
            mem.set(SuperBlockSector.Snapshot, &superblock.working.snapshots, .{
                .created = 0,
                .queried = 0,
                .timeout = 0,
            });

            superblock.working.set_checksum();

            // Stage sequence 1, hash-chained onto the zeroed sequence 0 sector above.
            superblock.staging.* = superblock.working.*;
            superblock.staging.sequence = superblock.working.sequence + 1;
            superblock.staging.parent = superblock.working.checksum;

            context.* = .{
                .superblock = superblock,
                .callback = callback,
                .caller = .format,
                .copy = undefined,
            };

            // TODO At a higher layer, we must:
            // 1. verify that there is no valid superblock, and
            // 2. zero the superblock, WAL and client table to ensure storage determinism.

            superblock.acquire(context);
        }
562
+
563
+ pub fn open(
564
+ superblock: *SuperBlock,
565
+ callback: fn (context: *Context) void,
566
+ context: *Context,
567
+ ) void {
568
+ assert(!superblock.opened);
569
+
570
+ context.* = .{
571
+ .superblock = superblock,
572
+ .callback = callback,
573
+ .caller = .open,
574
+ };
575
+
576
+ superblock.acquire(context);
577
+ }
578
+
579
        /// Queues a checkpoint: encodes the trailers and writes a full staging superblock.
        /// `callback` fires when the write (and its verification) completes.
        /// The superblock must already be open.
        pub fn checkpoint(
            superblock: *SuperBlock,
            callback: fn (context: *Context) void,
            context: *Context,
        ) void {
            assert(superblock.opened);

            context.* = .{
                .superblock = superblock,
                .callback = callback,
                .caller = .checkpoint,
                .copy = undefined,
            };

            superblock.acquire(context);
        }
595
+
596
        /// Durably records a VSR state change by rewriting only the superblock sector
        /// (not the trailers — see write_view_change()). If the new state is identical to
        /// the current working state, invokes `callback` immediately without any I/O.
        pub fn view_change(
            superblock: *SuperBlock,
            callback: fn (context: *Context) void,
            context: *Context,
            vsr_state: SuperBlockSector.VSRState,
        ) void {
            assert(superblock.opened);

            log.debug(
                "view_change: commit_min={}..{} commit_max={}..{} view_normal={}..{} view={}..{}",
                .{
                    superblock.working.vsr_state.commit_min,
                    vsr_state.commit_min,

                    superblock.working.vsr_state.commit_max,
                    vsr_state.commit_max,

                    superblock.working.vsr_state.view_normal,
                    vsr_state.view_normal,

                    superblock.working.vsr_state.view,
                    vsr_state.view,
                },
            );

            assert(vsr_state.internally_consistent());

            context.* = .{
                .superblock = superblock,
                .callback = callback,
                .caller = .view_change,
                .copy = undefined,
                .vsr_state = vsr_state,
            };

            // Only this view_change() function may change the VSR state.
            assert(meta.eql(superblock.working.vsr_state, superblock.staging.vsr_state));

            // No-op fast path: nothing to persist, so complete synchronously.
            if (!superblock.working.vsr_state.would_be_updated_by(context.vsr_state)) {
                log.debug("view_change: no change", .{});
                callback(context);
                return;
            }

            superblock.acquire(context);
        }
642
+
643
+ pub fn view_change_in_progress(superblock: *SuperBlock) bool {
644
+ assert(superblock.opened);
645
+
646
+ if (superblock.queue_head) |head| {
647
+ if (head.caller == .view_change) return true;
648
+ assert(head.caller == .checkpoint);
649
+ }
650
+
651
+ if (superblock.queue_tail) |tail| {
652
+ assert(tail.caller == .view_change);
653
+ return true;
654
+ }
655
+
656
+ return false;
657
+ }
658
+
659
        /// Writes the staging superblock (sector + all three trailers) for a format or
        /// checkpoint: encodes the trailers into staging, snapshots staging into `writing`,
        /// stamps its checksum, then advances staging's sequence/parent past `writing`
        /// before kicking off the trailer writes starting with the manifest.
        fn write_staging(superblock: *SuperBlock, context: *Context) void {
            assert(context.caller == .format or context.caller == .checkpoint);
            assert(context.caller == .format or superblock.opened);
            assert(superblock.queue_head == context);
            assert(superblock.queue_tail == null);

            // Encode all three trailer sections into staging before freezing it.
            superblock.write_staging_encode_manifest();
            superblock.write_staging_encode_free_set();
            superblock.write_staging_encode_client_table();

            superblock.writing.* = superblock.staging.*;
            superblock.writing.set_checksum();

            // `writing` must chain directly onto the current working superblock.
            assert(superblock.writing.sequence == superblock.working.sequence + 1);
            assert(superblock.writing.parent == superblock.working.checksum);

            // Re-chain staging onto the superblock now being written.
            superblock.staging.sequence = superblock.writing.sequence + 1;
            superblock.staging.parent = superblock.writing.checksum;

            // The copy above must not have diverged from what was just encoded.
            assert(superblock.writing.manifest_checksum == superblock.staging.manifest_checksum);
            assert(superblock.writing.free_set_checksum == superblock.staging.free_set_checksum);
            assert(superblock.writing.client_table_checksum == superblock.staging.client_table_checksum);

            assert(superblock.writing.manifest_size == superblock.staging.manifest_size);
            assert(superblock.writing.free_set_size == superblock.staging.free_set_size);
            assert(superblock.writing.client_table_size == superblock.staging.client_table_size);

            // The sequence selects which copy set receives this write.
            context.copy = starting_copy_for_sequence(superblock.writing.sequence);
            superblock.write_manifest(context);
        }
689
+
690
        /// Encodes the manifest block references into the manifest buffer and records
        /// the encoded size and checksum in the staging sector.
        fn write_staging_encode_manifest(superblock: *SuperBlock) void {
            const staging: *SuperBlockSector = superblock.staging;
            const target = superblock.manifest_buffer;

            staging.manifest_size = @intCast(u32, superblock.manifest.encode(target));
            staging.manifest_checksum = vsr.checksum(target[0..staging.manifest_size]);
        }
697
+
698
        /// Encodes the block free set (including staged reservations) into the free set
        /// buffer, records its encoded size and checksum in the staging sector, and derives
        /// the data file's current size from the highest acquired block address.
        fn write_staging_encode_free_set(superblock: *SuperBlock) void {
            const staging: *SuperBlockSector = superblock.staging;
            const encode_size_max = FreeSet.encode_size_max(config.block_count_max);
            const target = superblock.free_set_buffer[0..encode_size_max];

            // Temporarily fold staged state into the free set for the duration of encoding.
            superblock.free_set.include_staging();
            defer superblock.free_set.exclude_staging();

            superblock.verify_manifest_blocks_are_acquired_in_free_set();

            // The file is at least the superblock zone plus journal; grow it to cover the
            // highest acquired block, if any.
            staging.size = data_file_size_min;

            if (superblock.free_set.highest_address_acquired()) |address| {
                staging.size += address * config.block_size;
            }

            staging.free_set_size = @intCast(u32, superblock.free_set.encode(target));
            staging.free_set_checksum = vsr.checksum(target[0..staging.free_set_size]);
        }
717
+
718
        /// Encodes the client table into its buffer and records the encoded size and
        /// checksum in the staging sector.
        fn write_staging_encode_client_table(superblock: *SuperBlock) void {
            const staging: *SuperBlockSector = superblock.staging;
            const target = superblock.client_table_buffer;

            staging.client_table_size = @intCast(u32, superblock.client_table.encode(target));
            staging.client_table_checksum = vsr.checksum(target[0..staging.client_table_size]);
        }
725
+
726
+ fn write_view_change(superblock: *SuperBlock, context: *Context) void {
727
+ assert(context.caller == .view_change);
728
+ assert(superblock.opened);
729
+ assert(superblock.queue_head == context);
730
+ assert(superblock.queue_tail == null);
731
+ assert(context.vsr_state.internally_consistent());
732
+ assert(meta.eql(superblock.working.vsr_state, superblock.staging.vsr_state));
733
+ assert(superblock.working.vsr_state.would_be_updated_by(context.vsr_state));
734
+
735
+ superblock.writing.* = superblock.working.*;
736
+
737
+ // We cannot increment the sequence number when writing only the superblock sector as
738
+ // this would write the sector to another copy set with different superblock trailers.
739
+ // Instead, we increment twice so that the sector remains in the same copy set.
740
+ superblock.writing.sequence += 2;
741
+ assert(superblock.writing.parent == superblock.working.parent);
742
+
743
+ superblock.writing.vsr_state.update(context.vsr_state);
744
+ superblock.staging.vsr_state.update(context.vsr_state);
745
+
746
+ superblock.writing.set_checksum();
747
+
748
+ superblock.staging.sequence = superblock.writing.sequence + 1;
749
+ superblock.staging.parent = superblock.writing.checksum;
750
+
751
+ context.copy = starting_copy_for_sequence(superblock.writing.sequence);
752
+ superblock.write_sector(context);
753
+ }
754
+
755
+ fn write_manifest(superblock: *SuperBlock, context: *Context) void {
756
+ assert(superblock.queue_head == context);
757
+
758
+ const size = vsr.sector_ceil(superblock.writing.manifest_size);
759
+ assert(size <= superblock_trailer_manifest_size_max);
760
+
761
+ const buffer = superblock.manifest_buffer[0..size];
762
+ const offset = offset_manifest(context.copy, superblock.writing.sequence);
763
+
764
+ mem.set(u8, buffer[superblock.writing.manifest_size..], 0); // Zero sector padding.
765
+
766
+ assert(superblock.writing.manifest_checksum == vsr.checksum(
767
+ superblock.manifest_buffer[0..superblock.writing.manifest_size],
768
+ ));
769
+
770
+ log.debug("{s}: write_manifest: checksum={x} size={} offset={}", .{
771
+ @tagName(context.caller),
772
+ superblock.writing.manifest_checksum,
773
+ superblock.writing.manifest_size,
774
+ offset,
775
+ });
776
+
777
+ superblock.assert_bounds(offset, buffer.len);
778
+
779
+ if (buffer.len == 0) {
780
+ write_manifest_callback(&context.write);
781
+ return;
782
+ }
783
+
784
+ superblock.storage.write_sectors(
785
+ write_manifest_callback,
786
+ &context.write,
787
+ buffer,
788
+ .superblock,
789
+ offset,
790
+ );
791
+ }
792
+
793
+ fn write_manifest_callback(write: *Storage.Write) void {
794
+ const context = @fieldParentPtr(Context, "write", write);
795
+ context.superblock.write_free_set(context);
796
+ }
797
+
798
+ fn write_free_set(superblock: *SuperBlock, context: *Context) void {
799
+ assert(superblock.queue_head == context);
800
+
801
+ const size = vsr.sector_ceil(superblock.writing.free_set_size);
802
+ assert(size <= superblock_trailer_free_set_size_max);
803
+
804
+ const buffer = superblock.free_set_buffer[0..size];
805
+ const offset = offset_free_set(context.copy, superblock.writing.sequence);
806
+
807
+ mem.set(u8, buffer[superblock.writing.free_set_size..], 0); // Zero sector padding.
808
+
809
+ assert(superblock.writing.free_set_checksum == vsr.checksum(
810
+ superblock.free_set_buffer[0..superblock.writing.free_set_size],
811
+ ));
812
+
813
+ log.debug("{s}: write_free_set: checksum={x} size={} offset={}", .{
814
+ @tagName(context.caller),
815
+ superblock.writing.free_set_checksum,
816
+ superblock.writing.free_set_size,
817
+ offset,
818
+ });
819
+
820
+ superblock.assert_bounds(offset, buffer.len);
821
+
822
+ if (buffer.len == 0) {
823
+ write_free_set_callback(&context.write);
824
+ return;
825
+ }
826
+
827
+ superblock.storage.write_sectors(
828
+ write_free_set_callback,
829
+ &context.write,
830
+ buffer,
831
+ .superblock,
832
+ offset,
833
+ );
834
+ }
835
+
836
+ fn write_free_set_callback(write: *Storage.Write) void {
837
+ const context = @fieldParentPtr(Context, "write", write);
838
+ context.superblock.write_client_table(context);
839
+ }
840
+
841
+ fn write_client_table(superblock: *SuperBlock, context: *Context) void {
842
+ assert(superblock.queue_head == context);
843
+
844
+ const size = vsr.sector_ceil(superblock.writing.client_table_size);
845
+ assert(size <= superblock_trailer_client_table_size_max);
846
+
847
+ const buffer = superblock.client_table_buffer[0..size];
848
+ const offset = offset_client_table(context.copy, superblock.writing.sequence);
849
+
850
+ mem.set(u8, buffer[superblock.writing.client_table_size..], 0); // Zero sector padding.
851
+
852
+ assert(superblock.writing.client_table_checksum == vsr.checksum(
853
+ superblock.client_table_buffer[0..superblock.writing.client_table_size],
854
+ ));
855
+
856
+ log.debug("{s}: write_client_table: checksum={x} size={} offset={}", .{
857
+ @tagName(context.caller),
858
+ superblock.writing.client_table_checksum,
859
+ superblock.writing.client_table_size,
860
+ offset,
861
+ });
862
+
863
+ superblock.assert_bounds(offset, buffer.len);
864
+
865
+ if (buffer.len == 0) {
866
+ write_client_table_callback(&context.write);
867
+ return;
868
+ }
869
+
870
+ superblock.storage.write_sectors(
871
+ write_client_table_callback,
872
+ &context.write,
873
+ buffer,
874
+ .superblock,
875
+ offset,
876
+ );
877
+ }
878
+
879
+ fn write_client_table_callback(write: *Storage.Write) void {
880
+ const context = @fieldParentPtr(Context, "write", write);
881
+ context.superblock.write_sector(context);
882
+ }
883
+
884
+ fn write_sector(superblock: *SuperBlock, context: *Context) void {
885
+ assert(superblock.queue_head == context);
886
+
887
+ // We either update the working superblock for a checkpoint (+1) or a view change (+2):
888
+ assert(superblock.writing.sequence == superblock.working.sequence + 1 or
889
+ superblock.writing.sequence == superblock.working.sequence + 2);
890
+
891
+ // The staging superblock should always be one ahead, with VSR state in sync:
892
+ assert(superblock.staging.sequence == superblock.writing.sequence + 1);
893
+ assert(superblock.staging.parent == superblock.writing.checksum);
894
+ assert(meta.eql(superblock.staging.vsr_state, superblock.writing.vsr_state));
895
+
896
+ // The superblock cluster and replica should never change once formatted:
897
+ assert(superblock.writing.cluster == superblock.working.cluster);
898
+ assert(superblock.writing.cluster == superblock.staging.cluster);
899
+ assert(superblock.writing.replica == superblock.working.replica);
900
+ assert(superblock.writing.replica == superblock.staging.replica);
901
+
902
+ assert(context.copy < superblock_copies_max);
903
+ assert(context.copy >= starting_copy_for_sequence(superblock.writing.sequence));
904
+ assert(context.copy <= stopping_copy_for_sequence(superblock.writing.sequence));
905
+ superblock.writing.copy = context.copy;
906
+
907
+ // Updating the copy number should not affect the checksum, which was previously set:
908
+ assert(superblock.writing.valid_checksum());
909
+
910
+ const buffer = mem.asBytes(superblock.writing);
911
+ const offset = superblock_size * context.copy;
912
+
913
+ log.debug("{s}: write_sector: checksum={x} sequence={} copy={} size={} offset={}", .{
914
+ @tagName(context.caller),
915
+ superblock.writing.checksum,
916
+ superblock.writing.sequence,
917
+ context.copy,
918
+ buffer.len,
919
+ offset,
920
+ });
921
+
922
+ superblock.assert_bounds(offset, buffer.len + superblock_trailer_size_max);
923
+
924
+ superblock.storage.write_sectors(
925
+ write_sector_callback,
926
+ &context.write,
927
+ buffer,
928
+ .superblock,
929
+ offset,
930
+ );
931
+ }
932
+
933
+ fn write_sector_callback(write: *Storage.Write) void {
934
+ const context = @fieldParentPtr(Context, "write", write);
935
+ const superblock = context.superblock;
936
+
937
+ assert(superblock.queue_head == context);
938
+
939
+ assert(context.copy < superblock_copies_max);
940
+ assert(context.copy >= starting_copy_for_sequence(superblock.writing.sequence));
941
+ assert(context.copy <= stopping_copy_for_sequence(superblock.writing.sequence));
942
+ assert(context.copy == superblock.writing.copy);
943
+
944
+ if (context.copy == stopping_copy_for_sequence(superblock.writing.sequence)) {
945
+ if (context.caller == .format and superblock.writing.sequence < 2) {
946
+ assert(superblock.writing.sequence != 0);
947
+
948
+ superblock.working.* = superblock.writing.*;
949
+ superblock.write_staging(context);
950
+ } else {
951
+ superblock.read_working(context);
952
+ }
953
+ } else {
954
+ context.copy += 1;
955
+
956
+ switch (context.caller) {
957
+ .open => unreachable,
958
+ .format, .checkpoint => superblock.write_manifest(context),
959
+ .view_change => superblock.write_sector(context),
960
+ }
961
+ }
962
+ }
963
+
964
+ fn read_working(superblock: *SuperBlock, context: *Context) void {
965
+ assert(superblock.queue_head == context);
966
+
967
+ // We do not submit reads in parallel, as while this would shave off 1ms, it would also
968
+ // increase the risk that a single fault applies to more reads due to temporal locality.
969
+ // This would make verification reads more flaky when we do experience a read fault.
970
+ // See "An Analysis of Data Corruption in the Storage Stack".
971
+
972
+ context.copy = 0; // Read all copies across all copy sets.
973
+ for (superblock.reading) |*copy| copy.* = undefined;
974
+ superblock.read_sector(context);
975
+ }
976
+
977
+ fn read_sector(superblock: *SuperBlock, context: *Context) void {
978
+ assert(superblock.queue_head == context);
979
+ assert(context.copy < superblock_copies_max);
980
+
981
+ const buffer = mem.asBytes(&superblock.reading[context.copy]);
982
+ const offset = superblock_size * context.copy;
983
+
984
+ log.debug("{s}: read_sector: copy={} size={} offset={}", .{
985
+ @tagName(context.caller),
986
+ context.copy,
987
+ buffer.len,
988
+ offset,
989
+ });
990
+
991
+ superblock.assert_bounds(offset, buffer.len + superblock_trailer_size_max);
992
+
993
+ superblock.storage.read_sectors(
994
+ read_sector_callback,
995
+ &context.read,
996
+ buffer,
997
+ .superblock,
998
+ offset,
999
+ );
1000
+ }
1001
+
1002
+ fn read_sector_callback(read: *Storage.Read) void {
1003
+ const context = @fieldParentPtr(Context, "read", read);
1004
+ const superblock = context.superblock;
1005
+
1006
+ assert(superblock.queue_head == context);
1007
+
1008
+ assert(context.copy < superblock_copies_max);
1009
+ if (context.copy == superblock_copies_max - 1) {
1010
+ const threshold = threshold_for_caller(context.caller);
1011
+
1012
+ if (superblock.quorums.working(superblock.reading, threshold)) |working| {
1013
+ switch (context.caller) {
1014
+ .format, .checkpoint, .view_change => {
1015
+ if (working.checksum != superblock.writing.checksum) {
1016
+ @panic("superblock failed verification after writing");
1017
+ }
1018
+ assert(working.equal(superblock.writing));
1019
+ assert(superblock.staging.sequence == working.sequence + 1);
1020
+ assert(superblock.staging.parent == working.checksum);
1021
+ },
1022
+ .open => {
1023
+ superblock.staging.* = working.*;
1024
+ superblock.staging.sequence = working.sequence + 1;
1025
+ superblock.staging.parent = working.checksum;
1026
+ },
1027
+ }
1028
+
1029
+ if (context.caller == .format) {
1030
+ assert(working.sequence == 2);
1031
+ // TODO Assert working.size.
1032
+ assert(working.manifest_size == 0);
1033
+ assert(working.free_set_size == 8);
1034
+ assert(working.vsr_state.commit_min == 0);
1035
+ assert(working.vsr_state.commit_max == 0);
1036
+ assert(working.vsr_state.view_normal == 0);
1037
+ assert(working.vsr_state.view == 0);
1038
+ } else if (context.caller == .checkpoint) {
1039
+ superblock.free_set.checkpoint();
1040
+ }
1041
+
1042
+ superblock.working.* = working.*;
1043
+ log.debug(
1044
+ "{s}: installed working superblock: checksum={x} sequence={} cluster={} " ++
1045
+ "replica={} size={} commit_min={} commit_max={} view_normal={} view={}",
1046
+ .{
1047
+ @tagName(context.caller),
1048
+ superblock.working.checksum,
1049
+ superblock.working.sequence,
1050
+ superblock.working.cluster,
1051
+ superblock.working.replica,
1052
+ superblock.working.size,
1053
+ superblock.working.vsr_state.commit_min,
1054
+ superblock.working.vsr_state.commit_max,
1055
+ superblock.working.vsr_state.view_normal,
1056
+ superblock.working.vsr_state.view,
1057
+ },
1058
+ );
1059
+
1060
+ if (context.caller == .open) {
1061
+ context.copy = starting_copy_for_sequence(superblock.working.sequence);
1062
+ superblock.read_manifest(context);
1063
+ } else {
1064
+ // TODO Consider calling TRIM() on Grid's free suffix after checkpointing.
1065
+ superblock.release(context);
1066
+ }
1067
+ } else |err| switch (err) {
1068
+ error.NotFound => @panic("superblock not found"),
1069
+ error.QuorumLost => @panic("superblock quorum lost"),
1070
+ error.ParentNotFound => @panic("superblock parent not found"),
1071
+ error.ParentQuorumLost => @panic("superblock parent quorum lost"),
1072
+ error.VSRStateNotMonotonic => @panic("superblock vsr state not monotonic"),
1073
+ error.SequenceNotMonotonic => @panic("superblock sequence not monotonic"),
1074
+ }
1075
+ } else {
1076
+ context.copy += 1;
1077
+ superblock.read_sector(context);
1078
+ }
1079
+ }
1080
+
1081
+ fn read_manifest(superblock: *SuperBlock, context: *Context) void {
1082
+ assert(context.caller == .open);
1083
+ assert(superblock.queue_head == context);
1084
+ assert(context.copy < superblock_copies_max);
1085
+
1086
+ const size = vsr.sector_ceil(superblock.working.manifest_size);
1087
+ assert(size <= superblock_trailer_manifest_size_max);
1088
+
1089
+ const buffer = superblock.manifest_buffer[0..size];
1090
+ const offset = offset_manifest(context.copy, superblock.working.sequence);
1091
+
1092
+ log.debug("{s}: read_manifest: copy={} size={} offset={}", .{
1093
+ @tagName(context.caller),
1094
+ context.copy,
1095
+ buffer.len,
1096
+ offset,
1097
+ });
1098
+
1099
+ superblock.assert_bounds(offset, buffer.len);
1100
+
1101
+ if (buffer.len == 0) {
1102
+ read_manifest_callback(&context.read);
1103
+ return;
1104
+ }
1105
+
1106
+ superblock.storage.read_sectors(
1107
+ read_manifest_callback,
1108
+ &context.read,
1109
+ buffer,
1110
+ .superblock,
1111
+ offset,
1112
+ );
1113
+ }
1114
+
1115
+ fn read_manifest_callback(read: *Storage.Read) void {
1116
+ const context = @fieldParentPtr(Context, "read", read);
1117
+ const superblock = context.superblock;
1118
+
1119
+ assert(context.caller == .open);
1120
+ assert(superblock.queue_head == context);
1121
+ assert(!superblock.opened);
1122
+ assert(superblock.manifest.count == 0);
1123
+
1124
+ const slice = superblock.manifest_buffer[0..superblock.working.manifest_size];
1125
+ if (vsr.checksum(slice) == superblock.working.manifest_checksum) {
1126
+ superblock.manifest.decode(slice);
1127
+
1128
+ log.debug("open: read_manifest: manifest blocks: {}/{}", .{
1129
+ superblock.manifest.count,
1130
+ superblock.manifest.count_max,
1131
+ });
1132
+
1133
+ // TODO Repair any impaired copies before we continue.
1134
+ // At present, we repair at the next checkpoint.
1135
+ // We do not repair padding.
1136
+ context.copy = starting_copy_for_sequence(superblock.working.sequence);
1137
+ superblock.read_free_set(context);
1138
+ } else if (context.copy == stopping_copy_for_sequence(superblock.working.sequence)) {
1139
+ @panic("superblock manifest lost");
1140
+ } else {
1141
+ context.copy += 1;
1142
+ superblock.read_manifest(context);
1143
+ }
1144
+ }
1145
+
1146
+ fn read_free_set(superblock: *SuperBlock, context: *Context) void {
1147
+ assert(context.caller == .open);
1148
+ assert(superblock.queue_head == context);
1149
+ assert(context.copy < superblock_copies_max);
1150
+
1151
+ const size = vsr.sector_ceil(superblock.working.free_set_size);
1152
+ assert(size <= superblock_trailer_free_set_size_max);
1153
+
1154
+ const buffer = superblock.free_set_buffer[0..size];
1155
+ const offset = offset_free_set(context.copy, superblock.working.sequence);
1156
+
1157
+ log.debug("{s}: read_free_set: copy={} size={} offset={}", .{
1158
+ @tagName(context.caller),
1159
+ context.copy,
1160
+ buffer.len,
1161
+ offset,
1162
+ });
1163
+
1164
+ superblock.assert_bounds(offset, buffer.len);
1165
+
1166
+ if (buffer.len == 0) {
1167
+ read_free_set_callback(&context.read);
1168
+ return;
1169
+ }
1170
+
1171
+ superblock.storage.read_sectors(
1172
+ read_free_set_callback,
1173
+ &context.read,
1174
+ buffer,
1175
+ .superblock,
1176
+ offset,
1177
+ );
1178
+ }
1179
+
1180
+ fn read_free_set_callback(read: *Storage.Read) void {
1181
+ const context = @fieldParentPtr(Context, "read", read);
1182
+ const superblock = context.superblock;
1183
+
1184
+ assert(context.caller == .open);
1185
+ assert(superblock.queue_head == context);
1186
+ assert(!superblock.opened);
1187
+ assert(superblock.free_set.count_acquired() == 0);
1188
+
1189
+ const slice = superblock.free_set_buffer[0..superblock.working.free_set_size];
1190
+ if (vsr.checksum(slice) == superblock.working.free_set_checksum) {
1191
+ superblock.free_set.decode(slice);
1192
+
1193
+ log.debug("open: read_free_set: acquired blocks: {}/{}", .{
1194
+ superblock.free_set.count_acquired(),
1195
+ config.block_count_max,
1196
+ });
1197
+
1198
+ superblock.verify_manifest_blocks_are_acquired_in_free_set();
1199
+
1200
+ // TODO Repair any impaired copies before we continue.
1201
+ superblock.read_client_table(context);
1202
+ } else if (context.copy == stopping_copy_for_sequence(superblock.working.sequence)) {
1203
+ @panic("superblock free set lost");
1204
+ } else {
1205
+ context.copy += 1;
1206
+ superblock.read_free_set(context);
1207
+ }
1208
+ }
1209
+
1210
+ fn verify_manifest_blocks_are_acquired_in_free_set(superblock: *SuperBlock) void {
1211
+ assert(superblock.manifest.count <= superblock.free_set.count_acquired());
1212
+ for (superblock.manifest.addresses[0..superblock.manifest.count]) |address| {
1213
+ assert(!superblock.free_set.is_free(address));
1214
+ }
1215
+ }
1216
+
1217
+ fn read_client_table(superblock: *SuperBlock, context: *Context) void {
1218
+ assert(context.caller == .open);
1219
+ assert(superblock.queue_head == context);
1220
+ assert(context.copy < superblock_copies_max);
1221
+
1222
+ const size = vsr.sector_ceil(superblock.working.client_table_size);
1223
+ assert(size <= superblock_trailer_client_table_size_max);
1224
+
1225
+ const buffer = superblock.client_table_buffer[0..size];
1226
+ const offset = offset_client_table(context.copy, superblock.working.sequence);
1227
+
1228
+ log.debug("{s}: read_client_table: copy={} size={} offset={}", .{
1229
+ @tagName(context.caller),
1230
+ context.copy,
1231
+ buffer.len,
1232
+ offset,
1233
+ });
1234
+
1235
+ superblock.assert_bounds(offset, buffer.len);
1236
+
1237
+ if (buffer.len == 0) {
1238
+ read_client_table_callback(&context.read);
1239
+ return;
1240
+ }
1241
+
1242
+ superblock.storage.read_sectors(
1243
+ read_client_table_callback,
1244
+ &context.read,
1245
+ buffer,
1246
+ .superblock,
1247
+ offset,
1248
+ );
1249
+ }
1250
+
1251
+ fn read_client_table_callback(read: *Storage.Read) void {
1252
+ const context = @fieldParentPtr(Context, "read", read);
1253
+ const superblock = context.superblock;
1254
+
1255
+ assert(context.caller == .open);
1256
+ assert(superblock.queue_head == context);
1257
+ assert(!superblock.opened);
1258
+ assert(superblock.client_table.count() == 0);
1259
+
1260
+ const slice = superblock.client_table_buffer[0..superblock.working.client_table_size];
1261
+ if (vsr.checksum(slice) == superblock.working.client_table_checksum) {
1262
+ superblock.client_table.decode(slice);
1263
+
1264
+ log.debug("open: read_client_table: client requests: {}/{}", .{
1265
+ superblock.client_table.count(),
1266
+ config.clients_max,
1267
+ });
1268
+
1269
+ // TODO Repair any impaired copies before we continue.
1270
+ superblock.release(context);
1271
+ } else if (context.copy == stopping_copy_for_sequence(superblock.working.sequence)) {
1272
+ @panic("superblock client table lost");
1273
+ } else {
1274
+ context.copy += 1;
1275
+ superblock.read_client_table(context);
1276
+ }
1277
+ }
1278
+
1279
+ fn acquire(superblock: *SuperBlock, context: *Context) void {
1280
+ if (superblock.queue_head) |head| {
1281
+ // There should be nothing else happening when we format() or open():
1282
+ assert(context.caller != .format and context.caller != .open);
1283
+ assert(head.caller != .format and head.caller != .open);
1284
+
1285
+ // There may only be one checkpoint() and one view_change() submitted at a time:
1286
+ assert(head.caller != context.caller);
1287
+ assert(superblock.queue_tail == null);
1288
+
1289
+ log.debug("{s}: enqueued after {s}", .{
1290
+ @tagName(context.caller),
1291
+ @tagName(head.caller),
1292
+ });
1293
+
1294
+ superblock.queue_tail = context;
1295
+ } else {
1296
+ assert(superblock.queue_tail == null);
1297
+
1298
+ superblock.queue_head = context;
1299
+ log.debug("{s}: started", .{@tagName(context.caller)});
1300
+
1301
+ switch (context.caller) {
1302
+ .format => superblock.write_staging(context),
1303
+ .open => superblock.read_working(context),
1304
+ .checkpoint => superblock.write_staging(context),
1305
+ .view_change => superblock.write_view_change(context),
1306
+ }
1307
+ }
1308
+ }
1309
+
1310
+ fn release(superblock: *SuperBlock, context: *Context) void {
1311
+ assert(superblock.queue_head == context);
1312
+
1313
+ log.debug("{s}: complete", .{@tagName(context.caller)});
1314
+
1315
+ if (context.caller == .open) {
1316
+ assert(!superblock.opened);
1317
+ superblock.opened = true;
1318
+
1319
+ if (superblock.working.manifest_size > 0) {
1320
+ assert(superblock.manifest.count > 0);
1321
+ }
1322
+ if (superblock.working.free_set_size > @sizeOf(usize)) {
1323
+ assert(superblock.free_set.count_acquired() > 0);
1324
+ }
1325
+ } else if (context.caller == .view_change) {
1326
+ assert(meta.eql(superblock.working.vsr_state, context.vsr_state));
1327
+ assert(meta.eql(superblock.staging.vsr_state, context.vsr_state));
1328
+ }
1329
+
1330
+ const queue_tail = superblock.queue_tail;
1331
+ superblock.queue_head = null;
1332
+ superblock.queue_tail = null;
1333
+ if (queue_tail) |tail| superblock.acquire(tail);
1334
+
1335
+ context.callback(context);
1336
+ }
1337
+
1338
+ fn assert_bounds(superblock: *SuperBlock, offset: u64, size: u64) void {
1339
+ assert(offset >= superblock.storage_offset);
1340
+ assert(offset + size <= superblock.storage_offset + superblock.storage_size);
1341
+ }
1342
+
1343
+ fn offset_manifest(copy: u8, sequence: u64) u64 {
1344
+ assert(copy >= starting_copy_for_sequence(sequence));
1345
+ assert(copy <= stopping_copy_for_sequence(sequence));
1346
+
1347
+ return superblock_size * copy + @sizeOf(SuperBlockSector);
1348
+ }
1349
+
1350
+ fn offset_free_set(copy: u8, sequence: u64) u64 {
1351
+ assert(copy >= starting_copy_for_sequence(sequence));
1352
+ assert(copy <= stopping_copy_for_sequence(sequence));
1353
+
1354
+ return superblock_size * copy + @sizeOf(SuperBlockSector) +
1355
+ superblock_trailer_manifest_size_max;
1356
+ }
1357
+
1358
+ fn offset_client_table(copy: u8, sequence: u64) u64 {
1359
+ assert(copy >= starting_copy_for_sequence(sequence));
1360
+ assert(copy <= stopping_copy_for_sequence(sequence));
1361
+
1362
+ return superblock_size * copy + @sizeOf(SuperBlockSector) +
1363
+ superblock_trailer_manifest_size_max +
1364
+ superblock_trailer_free_set_size_max;
1365
+ }
1366
+
1367
+ /// Returns the first copy index (inclusive) to be written for a sequence number.
1368
+ fn starting_copy_for_sequence(sequence: u64) u8 {
1369
+ return config.superblock_copies * @intCast(u8, sequence % 2);
1370
+ }
1371
+
1372
+ /// Returns the last copy index (inclusive) to be written for a sequence number.
1373
+ fn stopping_copy_for_sequence(sequence: u64) u8 {
1374
+ return starting_copy_for_sequence(sequence) + config.superblock_copies - 1;
1375
+ }
1376
+
1377
+ /// We use flexible quorums for even quorums with write quorum > read quorum, for example:
1378
+ /// * When writing, we must verify that at least 3/4 copies were written.
1379
+ /// * At startup, we must verify that at least 2/4 copies were read.
1380
+ ///
1381
+ /// This ensures that our read and write quorums will intersect.
1382
+ /// Using flexible quorums in this way increases resiliency of the superblock.
1383
+ fn threshold_for_caller(caller: Context.Caller) u8 {
1384
+ // Working these threshold out by formula is easy to get wrong, so enumerate them:
1385
+ // The rule is that the write quorum plus the read quorum must be exactly copies + 1.
1386
+
1387
+ return switch (caller) {
1388
+ .format, .checkpoint, .view_change => switch (config.superblock_copies) {
1389
+ 4 => 3,
1390
+ 6 => 4,
1391
+ 8 => 5,
1392
+ else => unreachable,
1393
+ },
1394
+ // The open quorum must allow for at least two copy faults, because our view change
1395
+ // updates an existing set of copies in place, temporarily impairing one copy.
1396
+ .open => switch (config.superblock_copies) {
1397
+ 4 => 2,
1398
+ 6 => 3,
1399
+ 8 => 4,
1400
+ else => unreachable,
1401
+ },
1402
+ };
1403
+ }
1404
+ };
1405
+ }
1406
+
1407
+ const Quorums = struct {
1408
+ const Quorum = struct {
1409
+ sector: *const SuperBlockSector,
1410
+ count: QuorumCount = QuorumCount.initEmpty(),
1411
+ valid: bool = false,
1412
+ };
1413
+
1414
+ const QuorumCount = std.StaticBitSet(superblock_copies_max);
1415
+
1416
+ array: [superblock_copies_max]Quorum = undefined,
1417
+ count: u8 = 0,
1418
+
1419
+ pub const Error = error{
1420
+ NotFound,
1421
+ QuorumLost,
1422
+ ParentNotFound,
1423
+ ParentQuorumLost,
1424
+ SequenceNotMonotonic,
1425
+ VSRStateNotMonotonic,
1426
+ };
1427
+
1428
+ /// Returns the working superblock according to the quorum with the highest sequence number.
1429
+ /// Verifies that the highest quorum is connected, that the previous quorum was not lost.
1430
+ /// i.e. Both the working and previous quorum must be valid and intact and connected.
1431
+ /// Otherwise, we might regress to a previous working superblock.
1432
+ pub fn working(
1433
+ quorums: *Quorums,
1434
+ copies: []SuperBlockSector,
1435
+ threshold: u8,
1436
+ ) Error!*const SuperBlockSector {
1437
+ assert(copies.len == superblock_copies_max);
1438
+ assert(threshold >= 2 and threshold <= 5);
1439
+
1440
+ quorums.array = undefined;
1441
+ quorums.count = 0;
1442
+
1443
+ for (copies) |*copy, index| quorums.count_copy(copy, index, threshold);
1444
+
1445
+ std.sort.sort(Quorum, quorums.slice(), {}, sort_priority_descending);
1446
+
1447
+ for (quorums.slice()) |quorum| {
1448
+ if (quorum.count.count() == config.superblock_copies) {
1449
+ log.debug("quorum: checksum={x} parent={x} sequence={} count={} valid={}", .{
1450
+ quorum.sector.checksum,
1451
+ quorum.sector.parent,
1452
+ quorum.sector.sequence,
1453
+ quorum.count.count(),
1454
+ quorum.valid,
1455
+ });
1456
+ } else {
1457
+ log.err("quorum: checksum={x} parent={x} sequence={} count={} valid={}", .{
1458
+ quorum.sector.checksum,
1459
+ quorum.sector.parent,
1460
+ quorum.sector.sequence,
1461
+ quorum.count.count(),
1462
+ quorum.valid,
1463
+ });
1464
+ }
1465
+ }
1466
+
1467
+ // No working copies of any sequence number exist in the superblock storage zone at all.
1468
+ if (quorums.slice().len == 0) return error.NotFound;
1469
+
1470
+ // At least one copy or quorum exists.
1471
+ const b = quorums.slice()[0];
1472
+
1473
+ // Verify that the remaining quorums are correctly sorted:
1474
+ for (quorums.slice()[1..]) |a| {
1475
+ assert(sort_priority_descending({}, b, a));
1476
+ assert(a.sector.magic == .superblock);
1477
+ assert(a.sector.valid_checksum());
1478
+ }
1479
+
1480
+ // Even the best copy with the most quorum still has inadequate quorum.
1481
+ if (!b.valid) return error.QuorumLost;
1482
+
1483
+ // The superblock is only partially formatted, not all copies were written.
1484
+ if (b.sector.sequence < 2) return error.NotFound;
1485
+
1486
+ // Verify that the parent copy exists:
1487
+ for (quorums.slice()[1..]) |a| {
1488
+ if (a.sector.cluster != b.sector.cluster) {
1489
+ log.err("superblock copy={} has cluster={} instead of {}", .{
1490
+ a.sector.copy,
1491
+ a.sector.cluster,
1492
+ b.sector.cluster,
1493
+ });
1494
+ } else if (a.sector.replica != b.sector.replica) {
1495
+ log.err("superblock copy={} has replica={} instead of {}", .{
1496
+ a.sector.copy,
1497
+ a.sector.replica,
1498
+ b.sector.replica,
1499
+ });
1500
+ } else if (a.sector.checksum == b.sector.parent) {
1501
+ assert(a.sector.checksum != b.sector.checksum);
1502
+ assert(a.sector.cluster == b.sector.cluster);
1503
+ assert(a.sector.replica == b.sector.replica);
1504
+
1505
+ if (!a.valid) {
1506
+ return error.ParentQuorumLost;
1507
+ } else if (a.sector.sequence >= b.sector.sequence) {
1508
+ return error.SequenceNotMonotonic;
1509
+ } else if (a.sector.sequence % 2 == b.sector.sequence % 2) {
1510
+ // The parent must reside in the alternate copy to guarantee that we are able to
1511
+ // detect when the working quorum is lost.
1512
+ return error.SequenceNotMonotonic;
1513
+ } else if (!a.sector.vsr_state.monotonic(b.sector.vsr_state)) {
1514
+ return error.VSRStateNotMonotonic;
1515
+ } else {
1516
+ assert(b.sector.magic == .superblock);
1517
+ assert(b.sector.valid_checksum());
1518
+
1519
+ return b.sector;
1520
+ }
1521
+ }
1522
+ }
1523
+
1524
+ return error.ParentNotFound;
1525
+ }
1526
+
1527
+ fn count_copy(
1528
+ quorums: *Quorums,
1529
+ copy: *const SuperBlockSector,
1530
+ index: usize,
1531
+ threshold: u8,
1532
+ ) void {
1533
+ assert(index < superblock_copies_max);
1534
+ assert(threshold >= 2 and threshold <= 5);
1535
+
1536
+ if (!copy.valid_checksum()) {
1537
+ log.debug("copy: {}/{}: invalid checksum", .{ index, superblock_copies_max });
1538
+ return;
1539
+ }
1540
+
1541
+ if (copy.magic != .superblock) {
1542
+ log.debug("copy: {}/{}: not a superblock", .{ index, superblock_copies_max });
1543
+ return;
1544
+ }
1545
+
1546
+ if (copy.copy == index) {
1547
+ log.debug("copy: {}/{}: checksum={x} parent={x} sequence={}", .{
1548
+ index,
1549
+ superblock_copies_max,
1550
+ copy.checksum,
1551
+ copy.parent,
1552
+ copy.sequence,
1553
+ });
1554
+ } else {
1555
+ // If our read was misdirected, we definitely still want to count the copy.
1556
+ // We must just be careful to count it idempotently.
1557
+ log.err(
1558
+ "copy: {}/{}: checksum={x} parent={x} sequence={} misdirected from copy={}",
1559
+ .{
1560
+ index,
1561
+ superblock_copies_max,
1562
+ copy.checksum,
1563
+ copy.parent,
1564
+ copy.sequence,
1565
+ copy.copy,
1566
+ },
1567
+ );
1568
+ }
1569
+
1570
+ var quorum = quorums.find_or_insert_quorum_for_copy(copy);
1571
+ assert(quorum.sector.checksum == copy.checksum);
1572
+ assert(quorum.sector.equal(copy));
1573
+
1574
+ quorum.count.set(copy.copy);
1575
+ assert(quorum.count.isSet(copy.copy));
1576
+
1577
+ // In the worst case, all copies may contain divergent forks of the same sequence.
1578
+ // However, this should not happen for the same checksum.
1579
+ assert(quorum.count.count() <= config.superblock_copies);
1580
+
1581
+ quorum.valid = quorum.count.count() >= threshold;
1582
+ }
1583
+
1584
+ fn find_or_insert_quorum_for_copy(quorums: *Quorums, copy: *const SuperBlockSector) *Quorum {
1585
+ assert(copy.magic == .superblock);
1586
+ assert(copy.valid_checksum());
1587
+
1588
+ for (quorums.array[0..quorums.count]) |*quorum| {
1589
+ if (copy.checksum == quorum.sector.checksum) return quorum;
1590
+ } else {
1591
+ quorums.array[quorums.count] = Quorum{ .sector = copy };
1592
+ quorums.count += 1;
1593
+
1594
+ return &quorums.array[quorums.count - 1];
1595
+ }
1596
+ }
1597
+
1598
/// Returns the populated prefix of the quorums array.
fn slice(quorums: *Quorums) []Quorum {
    const found = quorums.count;
    return quorums.array[0..found];
}
1601
+
1602
/// Comparator for sorting quorums by descending priority:
/// valid before invalid, then higher sequence, then higher copy count,
/// with the checksum as a final tie-break so the order is deterministic.
fn sort_priority_descending(_: void, a: Quorum, b: Quorum) bool {
    assert(a.sector.checksum != b.sector.checksum);
    assert(a.sector.magic == .superblock);
    assert(b.sector.magic == .superblock);

    // Each criterion decides only when the two quorums differ on it;
    // otherwise fall through to the next, weaker criterion.
    if (a.valid != b.valid) return a.valid;

    if (a.sector.sequence != b.sector.sequence) {
        return a.sector.sequence > b.sector.sequence;
    }

    const a_copies = a.count.count();
    const b_copies = b.count.count();
    if (a_copies != b_copies) return a_copies > b_copies;

    // Checksums are distinct (asserted above), so this total order is stable
    // and deterministic.
    return a.sector.checksum > b.sector.checksum;
}
1619
+ };
1620
+
1621
// Verifies checksum coverage of SuperBlockSector fields:
// `copy` is excluded from the checksum, while `replica` is covered.
test "SuperBlockSector" {
    const expect = std.testing.expect;

    var sector = std.mem.zeroInit(SuperBlockSector, .{});
    sector.set_checksum();

    assert(sector.copy == 0);
    try expect(sector.valid_checksum());

    // Mutating `copy` must not invalidate the checksum.
    sector.copy += 1;
    try expect(sector.valid_checksum());

    // Mutating a checksummed field must invalidate it.
    sector.replica += 1;
    try expect(!sector.valid_checksum());
}
1636
+
1637
+ // TODO Add unit tests for Quorums.
1638
+ // TODO Test invariants and transitions across TestRunner functions.
1639
+ // TODO Add a pristine in-memory test storage shim (we currently use real disk).
1640
+ const TestStorage = @import("../storage.zig").Storage;
1641
+ const TestSuperBlock = SuperBlockType(TestStorage);
1642
+
1643
/// Drives a SuperBlock through a full lifecycle (format → open → view_change /
/// checkpoint) via its asynchronous callback API, counting in-flight
/// operations so the caller can poll for completion.
const TestRunner = struct {
    superblock: *TestSuperBlock,
    // One Context per operation kind so each callback can recover the runner
    // with @fieldParentPtr on the specific context field it was handed.
    context_format: TestSuperBlock.Context = undefined,
    context_open: TestSuperBlock.Context = undefined,
    context_checkpoint: TestSuperBlock.Context = undefined,
    context_view_change: TestSuperBlock.Context = undefined,
    // Number of superblock operations currently in flight; main() spins the
    // event loop until this drops to zero.
    pending: usize = 0,

    // Kicks off the lifecycle: format the superblock, then (via callback) open it.
    fn format(runner: *TestRunner, options: TestSuperBlock.FormatOptions) void {
        runner.pending += 1;
        runner.superblock.format(format_callback, &runner.context_format, options);
    }

    fn format_callback(context: *TestSuperBlock.Context) void {
        const runner = @fieldParentPtr(TestRunner, "context_format", context);
        runner.pending -= 1;
        runner.open();
    }

    fn open(runner: *TestRunner) void {
        runner.pending += 1;
        runner.superblock.open(open_callback, &runner.context_open);
    }

    fn open_callback(context: *TestSuperBlock.Context) void {
        const runner = @fieldParentPtr(TestRunner, "context_open", context);
        runner.pending -= 1;
        // NOTE(review): this issues a checkpoint AND a view_change with
        // separate contexts, and view_change_callback issues a second
        // checkpoint — presumably intentional to exercise concurrent and
        // repeated operations; confirm against SuperBlock's queueing rules.
        runner.checkpoint();
        runner.view_change();
    }

    // Issues a view_change with VSR state advanced by distinct offsets
    // (+1/+2/+3/+4) so each field's propagation can be distinguished.
    fn view_change(runner: *TestRunner) void {
        runner.pending += 1;
        runner.superblock.view_change(
            view_change_callback,
            &runner.context_view_change,
            .{
                .commit_min = runner.superblock.working.vsr_state.commit_min + 1,
                .commit_max = runner.superblock.working.vsr_state.commit_max + 2,
                .view_normal = runner.superblock.working.vsr_state.view_normal + 3,
                .view = runner.superblock.working.vsr_state.view + 4,
            },
        );
    }

    fn view_change_callback(context: *TestSuperBlock.Context) void {
        const runner = @fieldParentPtr(TestRunner, "context_view_change", context);
        runner.pending -= 1;
        // Second checkpoint of the run (the first was started in open_callback).
        runner.checkpoint();
    }

    fn checkpoint(runner: *TestRunner) void {
        runner.pending += 1;
        runner.superblock.checkpoint(checkpoint_callback, &runner.context_checkpoint);
    }

    // Terminal callback: no follow-up operation, just retire the pending count.
    fn checkpoint_callback(context: *TestSuperBlock.Context) void {
        const runner = @fieldParentPtr(TestRunner, "context_checkpoint", context);
        runner.pending -= 1;
    }
};
1704
+
1705
/// Standalone harness: formats and exercises a SuperBlock against a real
/// on-disk file ("test_superblock" in the current directory), then cleans up.
pub fn main() !void {
    const testing = std.testing;
    const allocator = testing.allocator;

    const IO = @import("../io.zig").IO;
    const Storage = @import("../storage.zig").Storage;

    // Open the working directory so Storage can create the test file in it.
    const dir_path = ".";
    const dir_fd = os.openZ(dir_path, os.O.CLOEXEC | os.O.RDONLY, 0) catch |err| {
        std.debug.print("failed to open directory '{s}': {}", .{ dir_path, err });
        return;
    };

    const test_cluster = 32;
    const test_replica = 4;
    const test_size_max = 512 * 1024 * 1024;

    const storage_fd = try Storage.open(dir_fd, "test_superblock", test_size_max, true);
    // Best-effort cleanup of the scratch file once the run finishes.
    defer std.fs.cwd().deleteFile("test_superblock") catch {};

    var io = try IO.init(128, 0);
    defer io.deinit();

    var storage = try Storage.init(&io, test_size_max, storage_fd);
    defer storage.deinit();

    var superblock = try TestSuperBlock.init(allocator, &storage);
    defer superblock.deinit(allocator);

    var runner = TestRunner{ .superblock = &superblock };
    runner.format(.{
        .cluster = test_cluster,
        .replica = test_replica,
        .size_max = test_size_max,
    });

    // Pump the event loop until every queued superblock operation completes.
    while (runner.pending > 0) try io.run_for_ns(100);
}