tigerbeetle-node 0.10.0 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/README.md +302 -101
  2. package/dist/index.d.ts +70 -72
  3. package/dist/index.js +70 -72
  4. package/dist/index.js.map +1 -1
  5. package/package.json +9 -8
  6. package/scripts/download_node_headers.sh +14 -7
  7. package/src/index.ts +6 -10
  8. package/src/node.zig +6 -3
  9. package/src/tigerbeetle/scripts/benchmark.sh +4 -4
  10. package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
  11. package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
  12. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
  13. package/src/tigerbeetle/scripts/install.sh +19 -4
  14. package/src/tigerbeetle/scripts/install_zig.bat +5 -1
  15. package/src/tigerbeetle/scripts/install_zig.sh +24 -14
  16. package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
  17. package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
  18. package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
  19. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
  20. package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
  21. package/src/tigerbeetle/src/benchmark.zig +29 -13
  22. package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
  23. package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
  24. package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
  25. package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
  26. package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -257
  27. package/src/tigerbeetle/src/c/tb_client.h +118 -84
  28. package/src/tigerbeetle/src/c/tb_client.zig +88 -23
  29. package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
  30. package/src/tigerbeetle/src/c/test.zig +371 -1
  31. package/src/tigerbeetle/src/cli.zig +37 -7
  32. package/src/tigerbeetle/src/config.zig +58 -17
  33. package/src/tigerbeetle/src/demo.zig +5 -2
  34. package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
  35. package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
  36. package/src/tigerbeetle/src/ewah.zig +11 -33
  37. package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
  38. package/src/tigerbeetle/src/io/linux.zig +1 -1
  39. package/src/tigerbeetle/src/lsm/README.md +308 -0
  40. package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
  41. package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
  42. package/src/tigerbeetle/src/lsm/compaction.zig +376 -397
  43. package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
  44. package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
  45. package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
  46. package/src/tigerbeetle/src/lsm/forest.zig +21 -447
  47. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +414 -0
  48. package/src/tigerbeetle/src/lsm/grid.zig +170 -76
  49. package/src/tigerbeetle/src/lsm/groove.zig +197 -133
  50. package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
  51. package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
  52. package/src/tigerbeetle/src/lsm/manifest.zig +93 -180
  53. package/src/tigerbeetle/src/lsm/manifest_level.zig +161 -454
  54. package/src/tigerbeetle/src/lsm/manifest_log.zig +243 -356
  55. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
  56. package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
  57. package/src/tigerbeetle/src/lsm/posted_groove.zig +65 -76
  58. package/src/tigerbeetle/src/lsm/segmented_array.zig +580 -251
  59. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
  60. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
  61. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
  62. package/src/tigerbeetle/src/lsm/table.zig +115 -68
  63. package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
  64. package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -17
  65. package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
  66. package/src/tigerbeetle/src/lsm/test.zig +61 -56
  67. package/src/tigerbeetle/src/lsm/tree.zig +450 -407
  68. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +461 -0
  69. package/src/tigerbeetle/src/main.zig +83 -8
  70. package/src/tigerbeetle/src/message_bus.zig +20 -9
  71. package/src/tigerbeetle/src/message_pool.zig +22 -19
  72. package/src/tigerbeetle/src/ring_buffer.zig +7 -3
  73. package/src/tigerbeetle/src/simulator.zig +179 -119
  74. package/src/tigerbeetle/src/state_machine.zig +381 -246
  75. package/src/tigerbeetle/src/static_allocator.zig +65 -0
  76. package/src/tigerbeetle/src/storage.zig +3 -7
  77. package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
  78. package/src/tigerbeetle/src/test/accounting/workload.zig +823 -0
  79. package/src/tigerbeetle/src/test/cluster.zig +33 -81
  80. package/src/tigerbeetle/src/test/conductor.zig +366 -0
  81. package/src/tigerbeetle/src/test/fuzz.zig +121 -0
  82. package/src/tigerbeetle/src/test/id.zig +89 -0
  83. package/src/tigerbeetle/src/test/network.zig +45 -19
  84. package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
  85. package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
  86. package/src/tigerbeetle/src/test/state_checker.zig +91 -69
  87. package/src/tigerbeetle/src/test/state_machine.zig +11 -35
  88. package/src/tigerbeetle/src/test/storage.zig +470 -106
  89. package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
  90. package/src/tigerbeetle/src/tigerbeetle.zig +15 -16
  91. package/src/tigerbeetle/src/unit_tests.zig +13 -1
  92. package/src/tigerbeetle/src/util.zig +97 -11
  93. package/src/tigerbeetle/src/vopr.zig +495 -0
  94. package/src/tigerbeetle/src/vsr/client.zig +21 -3
  95. package/src/tigerbeetle/src/vsr/journal.zig +293 -212
  96. package/src/tigerbeetle/src/vsr/replica.zig +1086 -515
  97. package/src/tigerbeetle/src/vsr/superblock.zig +382 -637
  98. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +14 -16
  99. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +416 -153
  100. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
  101. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
  102. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +62 -12
  103. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
  104. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
  105. package/src/tigerbeetle/src/vsr.zig +94 -60
  106. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  107. package/src/tigerbeetle/scripts/vopr.sh +0 -33
  108. package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
  109. package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
@@ -1,3 +1,14 @@
1
+ //! SuperBlock invariants:
2
+ //!
3
+ //! * vsr_state
4
+ //! - vsr_state.commit_min is initially 0 (for a newly-formatted replica).
5
+ //! - vsr_state.commit_min ≤ vsr_state.commit_max
6
+ //! - vsr_state.view_normal ≤ vsr_state.view
7
+ //! - checkpoint() must advance the superblock's vsr_state.commit_min.
8
+ //! - view_change() must not advance the superblock's vsr_state.commit_min.
9
+ //! - All fields of vsr_state except commit_min_checksum are monotonically increasing over
10
+ //! view_change()/checkpoint().
11
+ //!
1
12
  const std = @import("std");
2
13
  const assert = std.debug.assert;
3
14
  const crypto = std.crypto;
@@ -15,18 +26,11 @@ const MessagePool = @import("../message_pool.zig").MessagePool;
15
26
  pub const SuperBlockManifest = @import("superblock_manifest.zig").Manifest;
16
27
  pub const SuperBlockFreeSet = @import("superblock_free_set.zig").FreeSet;
17
28
  pub const SuperBlockClientTable = @import("superblock_client_table.zig").ClientTable;
29
+ pub const Quorums = @import("superblock_quorums.zig").QuorumsType(.{
30
+ .superblock_copies = config.superblock_copies,
31
+ });
18
32
 
19
- /// Identifies the type of a sector or block. Protects against misdirected I/O across valid types.
20
- pub const Magic = enum(u8) {
21
- superblock,
22
- manifest,
23
- prepare,
24
- index,
25
- filter,
26
- data,
27
- };
28
-
29
- pub const SuperBlockVersion: u8 = 0;
33
+ pub const SuperBlockVersion: u16 = 0;
30
34
 
31
35
  // Fields are aligned to work as an extern or packed struct.
32
36
  pub const SuperBlockSector = extern struct {
@@ -38,20 +42,20 @@ pub const SuperBlockSector = extern struct {
38
42
  /// This simplifies writing and comparing multiple copies.
39
43
  copy: u8 = 0,
40
44
 
41
- /// Protects against misdirected I/O for non-superblock sectors that have a valid checksum.
42
- magic: Magic,
45
+ /// Protects against writing to or reading from the wrong data file.
46
+ replica: u8,
43
47
 
44
48
  /// The version of the superblock format in use, reserved for major breaking changes.
45
- version: u8,
49
+ version: u16,
46
50
 
47
51
  /// Protects against writing to or reading from the wrong data file.
48
- replica: u8,
49
52
  cluster: u32,
50
53
 
51
54
  /// The current size of the data file.
52
55
  size: u64,
53
56
 
54
57
  /// The maximum size of the data file.
58
+ // TODO Actually limit the file to this size.
55
59
  size_max: u64,
56
60
 
57
61
  /// A monotonically increasing counter to locate the latest superblock at startup.
@@ -92,9 +96,12 @@ pub const SuperBlockSector = extern struct {
92
96
  /// The size of the client table entries stored in the superblock trailer.
93
97
  client_table_size: u32,
94
98
 
95
- reserved: [3160]u8 = [_]u8{0} ** 3160,
99
+ reserved: [3148]u8 = [_]u8{0} ** 3148,
96
100
 
97
101
  pub const VSRState = extern struct {
102
+ /// The vsr.Header.checksum of commit_min's message.
103
+ commit_min_checksum: u128,
104
+
98
105
  /// The last operation committed to the state machine. At startup, replay the log hereafter.
99
106
  commit_min: u64,
100
107
 
@@ -107,8 +114,22 @@ pub const SuperBlockSector = extern struct {
107
114
  /// The view number of the replica.
108
115
  view: u32,
109
116
 
117
+ reserved: [8]u8 = [_]u8{0} ** 8,
118
+
110
119
  comptime {
111
- assert(@sizeOf(VSRState) == 24);
120
+ assert(@sizeOf(VSRState) == 48);
121
+ // Assert that there is no implicit padding in the struct.
122
+ assert(@bitSizeOf(VSRState) == @sizeOf(VSRState) * 8);
123
+ }
124
+
125
+ pub fn root(cluster: u32) VSRState {
126
+ return .{
127
+ .commit_min_checksum = vsr.Header.root_prepare(cluster).checksum,
128
+ .commit_min = 0,
129
+ .commit_max = 0,
130
+ .view_normal = 0,
131
+ .view = 0,
132
+ };
112
133
  }
113
134
 
114
135
  pub fn internally_consistent(state: VSRState) bool {
@@ -118,6 +139,10 @@ pub const SuperBlockSector = extern struct {
118
139
  pub fn monotonic(old: VSRState, new: VSRState) bool {
119
140
  assert(old.internally_consistent());
120
141
  assert(new.internally_consistent());
142
+ // The last case is for when checking monotonic() from the sequence=0 sector.
143
+ assert(old.commit_min != new.commit_min or
144
+ old.commit_min_checksum == new.commit_min_checksum or
145
+ (old.commit_min_checksum == 0 and old.commit_min == 0));
121
146
 
122
147
  if (old.view > new.view) return false;
123
148
  if (old.view_normal > new.view_normal) return false;
@@ -137,6 +162,16 @@ pub const SuperBlockSector = extern struct {
137
162
  assert(state.would_be_updated_by(new));
138
163
  state.* = new;
139
164
  }
165
+
166
+ /// Compaction is one bar ahead of superblock's commit_min.
167
+ /// The commits from the bar following commit_min were in the mutable table, and
168
+ /// thus not preserved in the checkpoint.
169
+ /// But the corresponding `compact()` updates were preserved, and must not be repeated
170
+ /// to ensure deterministic storage.
171
+ pub fn op_compacted(state: VSRState, op: u64) bool {
172
+ // If commit_min is 0, we have never checkpointed, so no compactions are checkpointed.
173
+ return state.commit_min > 0 and op <= state.commit_min + config.lsm_batch_multiple;
174
+ }
140
175
  };
141
176
 
142
177
  pub const Snapshot = extern struct {
@@ -163,11 +198,15 @@ pub const SuperBlockSector = extern struct {
163
198
 
164
199
  comptime {
165
200
  assert(@sizeOf(Snapshot) == 24);
201
+ // Assert that there is no implicit padding in the struct.
202
+ assert(@bitSizeOf(Snapshot) == @sizeOf(Snapshot) * 8);
166
203
  }
167
204
  };
168
205
 
169
206
  comptime {
170
207
  assert(@sizeOf(SuperBlockSector) == config.sector_size);
208
+ // Assert that there is no implicit padding in the struct.
209
+ assert(@bitSizeOf(SuperBlockSector) == @sizeOf(SuperBlockSector) * 8);
171
210
  }
172
211
 
173
212
  pub fn calculate_checksum(superblock: *const SuperBlockSector) u128 {
@@ -186,12 +225,13 @@ pub const SuperBlockSector = extern struct {
186
225
  }
187
226
 
188
227
  pub fn set_checksum(superblock: *SuperBlockSector) void {
189
- assert(superblock.copy < superblock_copies_max);
190
- assert(superblock.magic == .superblock);
228
+ assert(superblock.copy < config.superblock_copies);
191
229
  assert(superblock.version == SuperBlockVersion);
192
230
  assert(superblock.flags == 0);
193
231
 
194
- for (mem.bytesAsSlice(u64, &superblock.reserved)) |word| assert(word == 0);
232
+ assert(@bitCast(u32, superblock.reserved[0..4].*) == 0);
233
+ for (mem.bytesAsSlice(u64, superblock.reserved[4..])) |word| assert(word == 0);
234
+ for (mem.bytesAsSlice(u64, &superblock.vsr_state.reserved)) |word| assert(word == 0);
195
235
 
196
236
  superblock.checksum = superblock.calculate_checksum();
197
237
  }
@@ -202,9 +242,6 @@ pub const SuperBlockSector = extern struct {
202
242
 
203
243
  /// Does not consider { checksum, copy } when comparing equality.
204
244
  pub fn equal(a: *const SuperBlockSector, b: *const SuperBlockSector) bool {
205
- assert(a.magic == .superblock);
206
- assert(b.magic == .superblock);
207
-
208
245
  if (a.version != b.version) return false;
209
246
  if (a.replica != b.replica) return false;
210
247
  if (a.cluster != b.cluster) return false;
@@ -221,8 +258,13 @@ pub const SuperBlockSector = extern struct {
221
258
  if (a.manifest_size != b.manifest_size) return false;
222
259
  if (a.free_set_size != b.free_set_size) return false;
223
260
 
224
- for (mem.bytesAsSlice(u64, &a.reserved)) |word| assert(word == 0);
225
- for (mem.bytesAsSlice(u64, &b.reserved)) |word| assert(word == 0);
261
+ assert(@bitCast(u32, a.reserved[0..4].*) == 0);
262
+ assert(@bitCast(u32, b.reserved[0..4].*) == 0);
263
+ for (mem.bytesAsSlice(u64, a.reserved[4..])) |word| assert(word == 0);
264
+ for (mem.bytesAsSlice(u64, b.reserved[4..])) |word| assert(word == 0);
265
+
266
+ for (mem.bytesAsSlice(u64, &a.vsr_state.reserved)) |word| assert(word == 0);
267
+ for (mem.bytesAsSlice(u64, &b.vsr_state.reserved)) |word| assert(word == 0);
226
268
 
227
269
  return true;
228
270
  }
@@ -236,20 +278,12 @@ comptime {
236
278
  }
237
279
 
238
280
  /// The size of the entire superblock storage zone.
239
- pub const superblock_zone_size = superblock_size * superblock_copies_max;
240
-
241
- /// A single set of copies (a copy set) consists of config.superblock_copies of a superblock.
242
- /// At least two copy sets are required for copy-on-write in order not to impair existing copies.
243
- ///
244
- /// However, when writing only the superblock sector for a view change, we do update-in-place,
245
- /// which is necessary as we need to continue to reference the existing superblock trailer to
246
- /// decouple view changes from checkpoints, to not force an untimely checkpoint ahead of schedule.
247
- pub const superblock_copies_max = config.superblock_copies * 2;
281
+ pub const superblock_zone_size = superblock_copy_size * config.superblock_copies;
248
282
 
249
283
  /// The size of an individual superblock including trailer.
250
- pub const superblock_size = @sizeOf(SuperBlockSector) + superblock_trailer_size_max;
284
+ pub const superblock_copy_size = @sizeOf(SuperBlockSector) + superblock_trailer_size_max;
251
285
  comptime {
252
- assert(superblock_size % config.sector_size == 0);
286
+ assert(superblock_copy_size % config.sector_size == 0);
253
287
  }
254
288
 
255
289
  /// The maximum possible size of the superblock trailer, following the superblock sector.
@@ -270,7 +304,9 @@ pub const superblock_trailer_size_max = blk: {
270
304
 
271
305
  // We order the smaller manifest section ahead of the block free set for better access locality.
272
306
  // For example, it's cheaper to skip over 1 MiB when reading from disk than to skip over 32 MiB.
273
- break :blk superblock_trailer_manifest_size_max + superblock_trailer_free_set_size_max + superblock_trailer_client_table_size_max;
307
+ break :blk superblock_trailer_manifest_size_max +
308
+ superblock_trailer_free_set_size_max +
309
+ superblock_trailer_client_table_size_max;
274
310
  };
275
311
 
276
312
  // A manifest block reference of 40 bytes contains a tree hash, checksum, and address.
@@ -280,6 +316,7 @@ pub const superblock_trailer_manifest_size_max = blk: {
280
316
 
281
317
  // Use a multiple of sector * reference so that the size is exactly divisible without padding:
282
318
  // For example, this 2.5 MiB manifest trailer == 65536 references == 65536 * 511 or 34m tables.
319
+ // TODO Size this relative to the expected number of tables & fragmentation.
283
320
  break :blk 16 * config.sector_size * SuperBlockManifest.BlockReferenceSize;
284
321
  };
285
322
 
@@ -303,6 +340,33 @@ pub const data_file_size_min = blk: {
303
340
  break :blk superblock_zone_size + config.journal_size_max;
304
341
  };
305
342
 
343
+ /// This table shows the sequence number progression of the SuperBlock's sectors.
344
+ ///
345
+ /// action working staging disk
346
+ /// format seq seq seq
347
+ /// 0 - Initially the file has no sectors.
348
+ /// 0 1 -
349
+ /// 0 1 1 Write a copyset for the first sequence.
350
+ /// 1 1 1 Read quorum; verify 3/4 are valid.
351
+ ///
352
+ /// open seq seq seq
353
+ /// a
354
+ /// a a Read quorum; verify 2/4 are valid.
355
+ /// a (a) a Repair any broken copies of `a`.
356
+ ///
357
+ /// checkpoint seq seq seq
358
+ /// a a a
359
+ /// a a+1
360
+ /// a a+1 a+1
361
+ /// a+1 a+1 a+1 Read quorum; verify 3/4 are valid.
362
+ ///
363
+ /// view_change seq seq seq
364
+ /// a a
365
+ /// a a+1 a The new sequence reuses the original parent.
366
+ /// a a+1 a+1
367
+ /// a+1 a+1 a+1 Read quorum; verify 3/4 are valid.
368
+ /// working staging disk
369
+ ///
306
370
  pub fn SuperBlockType(comptime Storage: type) type {
307
371
  return struct {
308
372
  const SuperBlock = @This();
@@ -325,8 +389,11 @@ pub fn SuperBlockType(comptime Storage: type) type {
325
389
 
326
390
  write: Storage.Write = undefined,
327
391
  read: Storage.Read = undefined,
328
- copy: u8 = undefined,
329
- vsr_state: SuperBlockSector.VSRState = undefined,
392
+ read_threshold: ?Quorums.Threshold = null,
393
+ copy: ?u8 = null,
394
+ /// Used by format(), checkpoint(), and view_change().
395
+ vsr_state: ?SuperBlockSector.VSRState = null,
396
+ repairs: ?Quorums.RepairIterator = null, // Used by open().
330
397
  };
331
398
 
332
399
  storage: *Storage,
@@ -341,14 +408,8 @@ pub fn SuperBlockType(comptime Storage: type) type {
341
408
  working: *align(config.sector_size) SuperBlockSector,
342
409
 
343
410
  /// The superblock that will replace the current working superblock once written.
344
- /// This is used when writing the staging superblock, or when changing views before then.
345
411
  /// We cannot mutate any working state directly until it is safely on stable storage.
346
412
  /// Otherwise, we may accidentally externalize guarantees that are not yet durable.
347
- writing: *align(config.sector_size) SuperBlockSector,
348
-
349
- /// The superblock that will be checkpointed next.
350
- /// This may be updated incrementally several times before the next checkpoint.
351
- /// For example, to track new snapshots as they are registered.
352
413
  staging: *align(config.sector_size) SuperBlockSector,
353
414
 
354
415
  /// The copies that we read into at startup or when verifying the written superblock.
@@ -402,11 +463,8 @@ pub fn SuperBlockType(comptime Storage: type) type {
402
463
  const b = try allocator.allocAdvanced(SuperBlockSector, config.sector_size, 1, .exact);
403
464
  errdefer allocator.free(b);
404
465
 
405
- const c = try allocator.allocAdvanced(SuperBlockSector, config.sector_size, 1, .exact);
406
- errdefer allocator.free(c);
407
-
408
466
  const reading = try allocator.allocAdvanced(
409
- [config.superblock_copies * 2]SuperBlockSector,
467
+ [config.superblock_copies]SuperBlockSector,
410
468
  config.sector_size,
411
469
  1,
412
470
  .exact,
@@ -456,8 +514,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
456
514
  return SuperBlock{
457
515
  .storage = storage,
458
516
  .working = &a[0],
459
- .writing = &b[0],
460
- .staging = &c[0],
517
+ .staging = &b[0],
461
518
  .reading = &reading[0],
462
519
  .manifest = manifest,
463
520
  .free_set = free_set,
@@ -469,11 +526,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
469
526
  }
470
527
 
471
528
  pub fn deinit(superblock: *SuperBlock, allocator: mem.Allocator) void {
472
- assert(superblock.queue_head == null);
473
- assert(superblock.queue_tail == null);
474
-
475
529
  allocator.destroy(superblock.working);
476
- allocator.destroy(superblock.writing);
477
530
  allocator.destroy(superblock.staging);
478
531
  allocator.free(superblock.reading);
479
532
 
@@ -503,15 +556,13 @@ pub fn SuperBlockType(comptime Storage: type) type {
503
556
  assert(!superblock.opened);
504
557
 
505
558
  assert(options.replica < config.replicas_max);
506
- // TODO Assert that size_max exceeds the minimum comptime size of storage zones.
507
- assert(options.size_max > superblock_zone_size);
559
+ assert(options.size_max >= data_file_size_min);
508
560
  assert(options.size_max % config.sector_size == 0);
509
561
 
510
562
  // This working copy provides the parent checksum, and will not be written to disk.
511
563
  // We therefore use zero values to make this parent checksum as stable as possible.
512
564
  superblock.working.* = .{
513
565
  .copy = 0,
514
- .magic = .superblock,
515
566
  .version = SuperBlockVersion,
516
567
  .sequence = 0,
517
568
  .replica = options.replica,
@@ -523,6 +574,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
523
574
  .free_set_checksum = 0,
524
575
  .client_table_checksum = 0,
525
576
  .vsr_state = .{
577
+ .commit_min_checksum = 0,
526
578
  .commit_min = 0,
527
579
  .commit_max = 0,
528
580
  .view_normal = 0,
@@ -542,15 +594,11 @@ pub fn SuperBlockType(comptime Storage: type) type {
542
594
 
543
595
  superblock.working.set_checksum();
544
596
 
545
- superblock.staging.* = superblock.working.*;
546
- superblock.staging.sequence = superblock.working.sequence + 1;
547
- superblock.staging.parent = superblock.working.checksum;
548
-
549
597
  context.* = .{
550
598
  .superblock = superblock,
551
599
  .callback = callback,
552
600
  .caller = .format,
553
- .copy = undefined,
601
+ .vsr_state = SuperBlockSector.VSRState.root(options.cluster),
554
602
  };
555
603
 
556
604
  // TODO At a higher layer, we must:
@@ -576,23 +624,33 @@ pub fn SuperBlockType(comptime Storage: type) type {
576
624
  superblock.acquire(context);
577
625
  }
578
626
 
627
+ /// The vsr_state must update the commit_min and commit_min_checksum.
628
+ // TODO Will the replica ever update view/view_normal by calling checkpoint() during a view
629
+ // change? If not, forbid it.
579
630
  pub fn checkpoint(
580
631
  superblock: *SuperBlock,
581
632
  callback: fn (context: *Context) void,
582
633
  context: *Context,
634
+ vsr_state: SuperBlockSector.VSRState,
583
635
  ) void {
584
636
  assert(superblock.opened);
637
+ // Checkpoint must advance commit_min, but never the view.
638
+ assert(superblock.staging.vsr_state.would_be_updated_by(vsr_state));
639
+ assert(superblock.staging.vsr_state.commit_min < vsr_state.commit_min);
640
+ assert(superblock.staging.vsr_state.commit_min_checksum !=
641
+ vsr_state.commit_min_checksum);
585
642
 
586
643
  context.* = .{
587
644
  .superblock = superblock,
588
645
  .callback = callback,
589
646
  .caller = .checkpoint,
590
- .copy = undefined,
647
+ .vsr_state = vsr_state,
591
648
  };
592
649
 
593
650
  superblock.acquire(context);
594
651
  }
595
652
 
653
+ /// The vsr_state must not update the `commit_min` or `commit_min_checksum`.
596
654
  pub fn view_change(
597
655
  superblock: *SuperBlock,
598
656
  callback: fn (context: *Context) void,
@@ -600,20 +658,28 @@ pub fn SuperBlockType(comptime Storage: type) type {
600
658
  vsr_state: SuperBlockSector.VSRState,
601
659
  ) void {
602
660
  assert(superblock.opened);
661
+ assert(vsr_state.commit_min == superblock.staging.vsr_state.commit_min);
662
+ assert(vsr_state.commit_min_checksum ==
663
+ superblock.staging.vsr_state.commit_min_checksum);
664
+ assert(superblock.staging.vsr_state.monotonic(vsr_state));
603
665
 
604
666
  log.debug(
605
- "view_change: commit_min={}..{} commit_max={}..{} view_normal={}..{} view={}..{}",
667
+ "view_change: commit_min_checksum={}..{} commit_min={}..{} commit_max={}..{} " ++
668
+ "view_normal={}..{} view={}..{}",
606
669
  .{
607
- superblock.working.vsr_state.commit_min,
670
+ superblock.staging.vsr_state.commit_min_checksum,
671
+ vsr_state.commit_min_checksum,
672
+
673
+ superblock.staging.vsr_state.commit_min,
608
674
  vsr_state.commit_min,
609
675
 
610
- superblock.working.vsr_state.commit_max,
676
+ superblock.staging.vsr_state.commit_max,
611
677
  vsr_state.commit_max,
612
678
 
613
- superblock.working.vsr_state.view_normal,
679
+ superblock.staging.vsr_state.view_normal,
614
680
  vsr_state.view_normal,
615
681
 
616
- superblock.working.vsr_state.view,
682
+ superblock.staging.vsr_state.view,
617
683
  vsr_state.view,
618
684
  },
619
685
  );
@@ -624,14 +690,10 @@ pub fn SuperBlockType(comptime Storage: type) type {
624
690
  .superblock = superblock,
625
691
  .callback = callback,
626
692
  .caller = .view_change,
627
- .copy = undefined,
628
693
  .vsr_state = vsr_state,
629
694
  };
630
695
 
631
- // Only this view_change() function may change the VSR state.
632
- assert(meta.eql(superblock.working.vsr_state, superblock.staging.vsr_state));
633
-
634
- if (!superblock.working.vsr_state.would_be_updated_by(context.vsr_state)) {
696
+ if (!superblock.staging.vsr_state.would_be_updated_by(context.vsr_state.?)) {
635
697
  log.debug("view_change: no change", .{});
636
698
  callback(context);
637
699
  return;
@@ -657,34 +719,32 @@ pub fn SuperBlockType(comptime Storage: type) type {
657
719
  }
658
720
 
659
721
  fn write_staging(superblock: *SuperBlock, context: *Context) void {
660
- assert(context.caller == .format or context.caller == .checkpoint);
722
+ assert(context.caller != .open);
661
723
  assert(context.caller == .format or superblock.opened);
724
+ assert(context.copy == null);
725
+ assert(context.vsr_state.?.internally_consistent());
662
726
  assert(superblock.queue_head == context);
663
727
  assert(superblock.queue_tail == null);
728
+ assert(superblock.working.vsr_state.would_be_updated_by(context.vsr_state.?));
664
729
 
665
- superblock.write_staging_encode_manifest();
666
- superblock.write_staging_encode_free_set();
667
- superblock.write_staging_encode_client_table();
668
-
669
- superblock.writing.* = superblock.staging.*;
670
- superblock.writing.set_checksum();
671
-
672
- assert(superblock.writing.sequence == superblock.working.sequence + 1);
673
- assert(superblock.writing.parent == superblock.working.checksum);
674
-
675
- superblock.staging.sequence = superblock.writing.sequence + 1;
676
- superblock.staging.parent = superblock.writing.checksum;
677
-
678
- assert(superblock.writing.manifest_checksum == superblock.staging.manifest_checksum);
679
- assert(superblock.writing.free_set_checksum == superblock.staging.free_set_checksum);
680
- assert(superblock.writing.client_table_checksum == superblock.staging.client_table_checksum);
681
-
682
- assert(superblock.writing.manifest_size == superblock.staging.manifest_size);
683
- assert(superblock.writing.free_set_size == superblock.staging.free_set_size);
684
- assert(superblock.writing.client_table_size == superblock.staging.client_table_size);
730
+ superblock.staging.* = superblock.working.*;
731
+ superblock.staging.sequence = superblock.staging.sequence + 1;
732
+ superblock.staging.parent = superblock.staging.checksum;
733
+ superblock.staging.vsr_state.update(context.vsr_state.?);
734
+
735
+ if (context.caller != .view_change) {
736
+ superblock.write_staging_encode_manifest();
737
+ superblock.write_staging_encode_free_set();
738
+ superblock.write_staging_encode_client_table();
739
+ }
740
+ superblock.staging.set_checksum();
685
741
 
686
- context.copy = starting_copy_for_sequence(superblock.writing.sequence);
687
- superblock.write_manifest(context);
742
+ context.copy = 0;
743
+ if (context.caller == .view_change) {
744
+ superblock.write_sector(context);
745
+ } else {
746
+ superblock.write_manifest(context);
747
+ }
688
748
  }
689
749
 
690
750
  fn write_staging_encode_manifest(superblock: *SuperBlock) void {
@@ -710,6 +770,8 @@ pub fn SuperBlockType(comptime Storage: type) type {
710
770
  if (superblock.free_set.highest_address_acquired()) |address| {
711
771
  staging.size += address * config.block_size;
712
772
  }
773
+ assert(staging.size >= data_file_size_min);
774
+ assert(staging.size <= staging.size_max);
713
775
 
714
776
  staging.free_set_size = @intCast(u32, superblock.free_set.encode(target));
715
777
  staging.free_set_checksum = vsr.checksum(target[0..staging.free_set_size]);
@@ -723,54 +785,25 @@ pub fn SuperBlockType(comptime Storage: type) type {
723
785
  staging.client_table_checksum = vsr.checksum(target[0..staging.client_table_size]);
724
786
  }
725
787
 
726
- fn write_view_change(superblock: *SuperBlock, context: *Context) void {
727
- assert(context.caller == .view_change);
728
- assert(superblock.opened);
729
- assert(superblock.queue_head == context);
730
- assert(superblock.queue_tail == null);
731
- assert(context.vsr_state.internally_consistent());
732
- assert(meta.eql(superblock.working.vsr_state, superblock.staging.vsr_state));
733
- assert(superblock.working.vsr_state.would_be_updated_by(context.vsr_state));
734
-
735
- superblock.writing.* = superblock.working.*;
736
-
737
- // We cannot increment the sequence number when writing only the superblock sector as
738
- // this would write the sector to another copy set with different superblock trailers.
739
- // Instead, we increment twice so that the sector remains in the same copy set.
740
- superblock.writing.sequence += 2;
741
- assert(superblock.writing.parent == superblock.working.parent);
742
-
743
- superblock.writing.vsr_state.update(context.vsr_state);
744
- superblock.staging.vsr_state.update(context.vsr_state);
745
-
746
- superblock.writing.set_checksum();
747
-
748
- superblock.staging.sequence = superblock.writing.sequence + 1;
749
- superblock.staging.parent = superblock.writing.checksum;
750
-
751
- context.copy = starting_copy_for_sequence(superblock.writing.sequence);
752
- superblock.write_sector(context);
753
- }
754
-
755
788
  fn write_manifest(superblock: *SuperBlock, context: *Context) void {
756
789
  assert(superblock.queue_head == context);
757
790
 
758
- const size = vsr.sector_ceil(superblock.writing.manifest_size);
791
+ const size = vsr.sector_ceil(superblock.staging.manifest_size);
759
792
  assert(size <= superblock_trailer_manifest_size_max);
760
793
 
761
794
  const buffer = superblock.manifest_buffer[0..size];
762
- const offset = offset_manifest(context.copy, superblock.writing.sequence);
795
+ const offset = Layout.offset_manifest(context.copy.?);
763
796
 
764
- mem.set(u8, buffer[superblock.writing.manifest_size..], 0); // Zero sector padding.
797
+ mem.set(u8, buffer[superblock.staging.manifest_size..], 0); // Zero sector padding.
765
798
 
766
- assert(superblock.writing.manifest_checksum == vsr.checksum(
767
- superblock.manifest_buffer[0..superblock.writing.manifest_size],
799
+ assert(superblock.staging.manifest_checksum == vsr.checksum(
800
+ superblock.manifest_buffer[0..superblock.staging.manifest_size],
768
801
  ));
769
802
 
770
803
  log.debug("{s}: write_manifest: checksum={x} size={} offset={}", .{
771
804
  @tagName(context.caller),
772
- superblock.writing.manifest_checksum,
773
- superblock.writing.manifest_size,
805
+ superblock.staging.manifest_checksum,
806
+ superblock.staging.manifest_size,
774
807
  offset,
775
808
  });
776
809
 
@@ -798,22 +831,22 @@ pub fn SuperBlockType(comptime Storage: type) type {
798
831
  fn write_free_set(superblock: *SuperBlock, context: *Context) void {
799
832
  assert(superblock.queue_head == context);
800
833
 
801
- const size = vsr.sector_ceil(superblock.writing.free_set_size);
834
+ const size = vsr.sector_ceil(superblock.staging.free_set_size);
802
835
  assert(size <= superblock_trailer_free_set_size_max);
803
836
 
804
837
  const buffer = superblock.free_set_buffer[0..size];
805
- const offset = offset_free_set(context.copy, superblock.writing.sequence);
838
+ const offset = Layout.offset_free_set(context.copy.?);
806
839
 
807
- mem.set(u8, buffer[superblock.writing.free_set_size..], 0); // Zero sector padding.
840
+ mem.set(u8, buffer[superblock.staging.free_set_size..], 0); // Zero sector padding.
808
841
 
809
- assert(superblock.writing.free_set_checksum == vsr.checksum(
810
- superblock.free_set_buffer[0..superblock.writing.free_set_size],
842
+ assert(superblock.staging.free_set_checksum == vsr.checksum(
843
+ superblock.free_set_buffer[0..superblock.staging.free_set_size],
811
844
  ));
812
845
 
813
846
  log.debug("{s}: write_free_set: checksum={x} size={} offset={}", .{
814
847
  @tagName(context.caller),
815
- superblock.writing.free_set_checksum,
816
- superblock.writing.free_set_size,
848
+ superblock.staging.free_set_checksum,
849
+ superblock.staging.free_set_size,
817
850
  offset,
818
851
  });
819
852
 
@@ -841,22 +874,22 @@ pub fn SuperBlockType(comptime Storage: type) type {
841
874
  fn write_client_table(superblock: *SuperBlock, context: *Context) void {
842
875
  assert(superblock.queue_head == context);
843
876
 
844
- const size = vsr.sector_ceil(superblock.writing.client_table_size);
877
+ const size = vsr.sector_ceil(superblock.staging.client_table_size);
845
878
  assert(size <= superblock_trailer_client_table_size_max);
846
879
 
847
880
  const buffer = superblock.client_table_buffer[0..size];
848
- const offset = offset_client_table(context.copy, superblock.writing.sequence);
881
+ const offset = Layout.offset_client_table(context.copy.?);
849
882
 
850
- mem.set(u8, buffer[superblock.writing.client_table_size..], 0); // Zero sector padding.
883
+ mem.set(u8, buffer[superblock.staging.client_table_size..], 0); // Zero sector padding.
851
884
 
852
- assert(superblock.writing.client_table_checksum == vsr.checksum(
853
- superblock.client_table_buffer[0..superblock.writing.client_table_size],
885
+ assert(superblock.staging.client_table_checksum == vsr.checksum(
886
+ superblock.client_table_buffer[0..superblock.staging.client_table_size],
854
887
  ));
855
888
 
856
889
  log.debug("{s}: write_client_table: checksum={x} size={} offset={}", .{
857
890
  @tagName(context.caller),
858
- superblock.writing.client_table_checksum,
859
- superblock.writing.client_table_size,
891
+ superblock.staging.client_table_checksum,
892
+ superblock.staging.client_table_size,
860
893
  offset,
861
894
  });
862
895
 
@@ -884,42 +917,42 @@ pub fn SuperBlockType(comptime Storage: type) type {
884
917
  fn write_sector(superblock: *SuperBlock, context: *Context) void {
885
918
  assert(superblock.queue_head == context);
886
919
 
887
- // We either update the working superblock for a checkpoint (+1) or a view change (+2):
888
- assert(superblock.writing.sequence == superblock.working.sequence + 1 or
889
- superblock.writing.sequence == superblock.working.sequence + 2);
890
-
891
- // The staging superblock should always be one ahead, with VSR state in sync:
892
- assert(superblock.staging.sequence == superblock.writing.sequence + 1);
893
- assert(superblock.staging.parent == superblock.writing.checksum);
894
- assert(meta.eql(superblock.staging.vsr_state, superblock.writing.vsr_state));
920
+ // We update the working superblock for a checkpoint/format/view_change:
921
+ // open() does not update the working superblock, since it only writes to repair.
922
+ if (context.caller == .open) {
923
+ assert(superblock.staging.sequence == superblock.working.sequence);
924
+ } else {
925
+ assert(superblock.staging.sequence == superblock.working.sequence + 1);
926
+ assert(superblock.staging.parent == superblock.working.checksum);
927
+ }
895
928
 
896
929
  // The superblock cluster and replica should never change once formatted:
897
- assert(superblock.writing.cluster == superblock.working.cluster);
898
- assert(superblock.writing.cluster == superblock.staging.cluster);
899
- assert(superblock.writing.replica == superblock.working.replica);
900
- assert(superblock.writing.replica == superblock.staging.replica);
930
+ assert(superblock.staging.cluster == superblock.working.cluster);
931
+ assert(superblock.staging.replica == superblock.working.replica);
932
+
933
+ assert(superblock.staging.size >= data_file_size_min);
934
+ assert(superblock.staging.size <= superblock.staging.size_max);
901
935
 
902
- assert(context.copy < superblock_copies_max);
903
- assert(context.copy >= starting_copy_for_sequence(superblock.writing.sequence));
904
- assert(context.copy <= stopping_copy_for_sequence(superblock.writing.sequence));
905
- superblock.writing.copy = context.copy;
936
+ assert(context.copy.? < config.superblock_copies);
937
+ superblock.staging.copy = context.copy.?;
906
938
 
907
939
  // Updating the copy number should not affect the checksum, which was previously set:
908
- assert(superblock.writing.valid_checksum());
940
+ assert(superblock.staging.valid_checksum());
909
941
 
910
- const buffer = mem.asBytes(superblock.writing);
911
- const offset = superblock_size * context.copy;
942
+ const buffer = mem.asBytes(superblock.staging);
943
+ const offset = Layout.offset_sector(context.copy.?);
912
944
 
913
- log.debug("{s}: write_sector: checksum={x} sequence={} copy={} size={} offset={}", .{
945
+ log.debug("{}: {s}: write_sector: checksum={x} sequence={} copy={} size={} offset={}", .{
946
+ superblock.staging.replica,
914
947
  @tagName(context.caller),
915
- superblock.writing.checksum,
916
- superblock.writing.sequence,
917
- context.copy,
948
+ superblock.staging.checksum,
949
+ superblock.staging.sequence,
950
+ context.copy.?,
918
951
  buffer.len,
919
952
  offset,
920
953
  });
921
954
 
922
- superblock.assert_bounds(offset, buffer.len + superblock_trailer_size_max);
955
+ superblock.assert_bounds(offset, buffer.len);
923
956
 
924
957
  superblock.storage.write_sectors(
925
958
  write_sector_callback,
@@ -933,25 +966,24 @@ pub fn SuperBlockType(comptime Storage: type) type {
933
966
  fn write_sector_callback(write: *Storage.Write) void {
934
967
  const context = @fieldParentPtr(Context, "write", write);
935
968
  const superblock = context.superblock;
969
+ const copy = context.copy.?;
936
970
 
937
971
  assert(superblock.queue_head == context);
938
972
 
939
- assert(context.copy < superblock_copies_max);
940
- assert(context.copy >= starting_copy_for_sequence(superblock.writing.sequence));
941
- assert(context.copy <= stopping_copy_for_sequence(superblock.writing.sequence));
942
- assert(context.copy == superblock.writing.copy);
973
+ assert(copy < config.superblock_copies);
974
+ assert(copy == superblock.staging.copy);
943
975
 
944
- if (context.copy == stopping_copy_for_sequence(superblock.writing.sequence)) {
945
- if (context.caller == .format and superblock.writing.sequence < 2) {
946
- assert(superblock.writing.sequence != 0);
976
+ if (context.caller == .open) {
977
+ context.copy = null;
978
+ superblock.repair(context);
979
+ return;
980
+ }
947
981
 
948
- superblock.working.* = superblock.writing.*;
949
- superblock.write_staging(context);
950
- } else {
951
- superblock.read_working(context);
952
- }
982
+ if (copy + 1 == config.superblock_copies) {
983
+ context.copy = null;
984
+ superblock.read_working(context, .verify);
953
985
  } else {
954
- context.copy += 1;
986
+ context.copy = copy + 1;
955
987
 
956
988
  switch (context.caller) {
957
989
  .open => unreachable,
@@ -961,34 +993,42 @@ pub fn SuperBlockType(comptime Storage: type) type {
961
993
  }
962
994
  }
963
995
 
964
- fn read_working(superblock: *SuperBlock, context: *Context) void {
996
+ fn read_working(
997
+ superblock: *SuperBlock,
998
+ context: *Context,
999
+ threshold: Quorums.Threshold,
1000
+ ) void {
965
1001
  assert(superblock.queue_head == context);
1002
+ assert(context.copy == null);
1003
+ assert(context.read_threshold == null);
966
1004
 
967
1005
  // We do not submit reads in parallel, as while this would shave off 1ms, it would also
968
1006
  // increase the risk that a single fault applies to more reads due to temporal locality.
969
1007
  // This would make verification reads more flaky when we do experience a read fault.
970
1008
  // See "An Analysis of Data Corruption in the Storage Stack".
971
1009
 
972
- context.copy = 0; // Read all copies across all copy sets.
1010
+ context.copy = 0;
1011
+ context.read_threshold = threshold;
973
1012
  for (superblock.reading) |*copy| copy.* = undefined;
974
1013
  superblock.read_sector(context);
975
1014
  }
976
1015
 
977
1016
  fn read_sector(superblock: *SuperBlock, context: *Context) void {
978
1017
  assert(superblock.queue_head == context);
979
- assert(context.copy < superblock_copies_max);
1018
+ assert(context.copy.? < config.superblock_copies);
1019
+ assert(context.read_threshold != null);
980
1020
 
981
- const buffer = mem.asBytes(&superblock.reading[context.copy]);
982
- const offset = superblock_size * context.copy;
1021
+ const buffer = mem.asBytes(&superblock.reading[context.copy.?]);
1022
+ const offset = Layout.offset_sector(context.copy.?);
983
1023
 
984
1024
  log.debug("{s}: read_sector: copy={} size={} offset={}", .{
985
1025
  @tagName(context.caller),
986
- context.copy,
1026
+ context.copy.?,
987
1027
  buffer.len,
988
1028
  offset,
989
1029
  });
990
1030
 
991
- superblock.assert_bounds(offset, buffer.len + superblock_trailer_size_max);
1031
+ superblock.assert_bounds(offset, buffer.len);
992
1032
 
993
1033
  superblock.storage.read_sectors(
994
1034
  read_sector_callback,
@@ -1002,96 +1042,109 @@ pub fn SuperBlockType(comptime Storage: type) type {
1002
1042
  fn read_sector_callback(read: *Storage.Read) void {
1003
1043
  const context = @fieldParentPtr(Context, "read", read);
1004
1044
  const superblock = context.superblock;
1045
+ const threshold = context.read_threshold.?;
1005
1046
 
1006
1047
  assert(superblock.queue_head == context);
1007
1048
 
1008
- assert(context.copy < superblock_copies_max);
1009
- if (context.copy == superblock_copies_max - 1) {
1010
- const threshold = threshold_for_caller(context.caller);
1011
-
1012
- if (superblock.quorums.working(superblock.reading, threshold)) |working| {
1013
- switch (context.caller) {
1014
- .format, .checkpoint, .view_change => {
1015
- if (working.checksum != superblock.writing.checksum) {
1016
- @panic("superblock failed verification after writing");
1017
- }
1018
- assert(working.equal(superblock.writing));
1019
- assert(superblock.staging.sequence == working.sequence + 1);
1020
- assert(superblock.staging.parent == working.checksum);
1021
- },
1022
- .open => {
1023
- superblock.staging.* = working.*;
1024
- superblock.staging.sequence = working.sequence + 1;
1025
- superblock.staging.parent = working.checksum;
1026
- },
1027
- }
1049
+ assert(context.copy.? < config.superblock_copies);
1050
+ if (context.copy.? + 1 != config.superblock_copies) {
1051
+ context.copy = context.copy.? + 1;
1052
+ superblock.read_sector(context);
1053
+ return;
1054
+ }
1028
1055
 
1029
- if (context.caller == .format) {
1030
- assert(working.sequence == 2);
1031
- // TODO Assert working.size.
1032
- assert(working.manifest_size == 0);
1033
- assert(working.free_set_size == 8);
1034
- assert(working.vsr_state.commit_min == 0);
1035
- assert(working.vsr_state.commit_max == 0);
1036
- assert(working.vsr_state.view_normal == 0);
1037
- assert(working.vsr_state.view == 0);
1038
- } else if (context.caller == .checkpoint) {
1039
- superblock.free_set.checkpoint();
1056
+ context.read_threshold = null;
1057
+ context.copy = null;
1058
+
1059
+ if (superblock.quorums.working(superblock.reading, threshold)) |quorum| {
1060
+ assert(quorum.valid);
1061
+ assert(quorum.copies.count() >= threshold.count());
1062
+
1063
+ const working = quorum.sector;
1064
+ if (threshold == .verify) {
1065
+ if (working.checksum != superblock.staging.checksum) {
1066
+ @panic("superblock failed verification after writing");
1040
1067
  }
1068
+ assert(working.equal(superblock.staging));
1069
+ }
1041
1070
 
1042
- superblock.working.* = working.*;
1043
- log.debug(
1044
- "{s}: installed working superblock: checksum={x} sequence={} cluster={} " ++
1045
- "replica={} size={} commit_min={} commit_max={} view_normal={} view={}",
1046
- .{
1047
- @tagName(context.caller),
1048
- superblock.working.checksum,
1049
- superblock.working.sequence,
1050
- superblock.working.cluster,
1051
- superblock.working.replica,
1052
- superblock.working.size,
1053
- superblock.working.vsr_state.commit_min,
1054
- superblock.working.vsr_state.commit_max,
1055
- superblock.working.vsr_state.view_normal,
1056
- superblock.working.vsr_state.view,
1057
- },
1058
- );
1059
-
1060
- if (context.caller == .open) {
1061
- context.copy = starting_copy_for_sequence(superblock.working.sequence);
1062
- superblock.read_manifest(context);
1063
- } else {
1064
- // TODO Consider calling TRIM() on Grid's free suffix after checkpointing.
1071
+ if (context.caller == .format) {
1072
+ assert(working.sequence == 1);
1073
+ assert(working.size == data_file_size_min);
1074
+ assert(working.manifest_size == 0);
1075
+ assert(working.free_set_size == 8);
1076
+ assert(working.client_table_size == 4);
1077
+ assert(working.vsr_state.commit_min_checksum ==
1078
+ vsr.Header.root_prepare(working.cluster).checksum);
1079
+ assert(working.vsr_state.commit_min == 0);
1080
+ assert(working.vsr_state.commit_max == 0);
1081
+ assert(working.vsr_state.view_normal == 0);
1082
+ assert(working.vsr_state.view == 0);
1083
+ } else if (context.caller == .checkpoint) {
1084
+ superblock.free_set.checkpoint();
1085
+ }
1086
+
1087
+ superblock.working.* = working.*;
1088
+ superblock.staging.* = working.*;
1089
+ log.debug(
1090
+ "{s}: installed working superblock: checksum={x} sequence={} cluster={} " ++
1091
+ "replica={} size={} " ++
1092
+ "commit_min_checksum={} commit_min={} commit_max={} " ++
1093
+ "view_normal={} view={}",
1094
+ .{
1095
+ @tagName(context.caller),
1096
+ superblock.working.checksum,
1097
+ superblock.working.sequence,
1098
+ superblock.working.cluster,
1099
+ superblock.working.replica,
1100
+ superblock.working.size,
1101
+ superblock.working.vsr_state.commit_min_checksum,
1102
+ superblock.working.vsr_state.commit_min,
1103
+ superblock.working.vsr_state.commit_max,
1104
+ superblock.working.vsr_state.view_normal,
1105
+ superblock.working.vsr_state.view,
1106
+ },
1107
+ );
1108
+
1109
+ if (context.caller == .open) {
1110
+ if (context.repairs) |_| {
1111
+ // We just verified that the repair completed.
1112
+ assert(threshold == .verify);
1065
1113
  superblock.release(context);
1114
+ } else {
1115
+ assert(threshold == .open);
1116
+ context.copy = 0;
1117
+ context.repairs = quorum.repairs();
1118
+ superblock.read_manifest(context);
1066
1119
  }
1067
- } else |err| switch (err) {
1068
- error.NotFound => @panic("superblock not found"),
1069
- error.QuorumLost => @panic("superblock quorum lost"),
1070
- error.ParentNotFound => @panic("superblock parent not found"),
1071
- error.ParentQuorumLost => @panic("superblock parent quorum lost"),
1072
- error.VSRStateNotMonotonic => @panic("superblock vsr state not monotonic"),
1073
- error.SequenceNotMonotonic => @panic("superblock sequence not monotonic"),
1120
+ } else {
1121
+ // TODO Consider calling TRIM() on Grid's free suffix after checkpointing.
1122
+ superblock.release(context);
1074
1123
  }
1075
- } else {
1076
- context.copy += 1;
1077
- superblock.read_sector(context);
1124
+ } else |err| switch (err) {
1125
+ error.Fork => @panic("superblock forked"),
1126
+ error.NotFound => @panic("superblock not found"),
1127
+ error.QuorumLost => @panic("superblock quorum lost"),
1128
+ error.ParentNotConnected => @panic("superblock parent not connected"),
1129
+ error.ParentSkipped => @panic("superblock parent superseded"),
1130
+ error.VSRStateNotMonotonic => @panic("superblock vsr state not monotonic"),
1078
1131
  }
1079
1132
  }
1080
1133
 
1081
1134
  fn read_manifest(superblock: *SuperBlock, context: *Context) void {
1082
1135
  assert(context.caller == .open);
1083
1136
  assert(superblock.queue_head == context);
1084
- assert(context.copy < superblock_copies_max);
1137
+ assert(context.copy.? < config.superblock_copies);
1085
1138
 
1086
1139
  const size = vsr.sector_ceil(superblock.working.manifest_size);
1087
1140
  assert(size <= superblock_trailer_manifest_size_max);
1088
1141
 
1089
1142
  const buffer = superblock.manifest_buffer[0..size];
1090
- const offset = offset_manifest(context.copy, superblock.working.sequence);
1143
+ const offset = Layout.offset_manifest(context.copy.?);
1091
1144
 
1092
1145
  log.debug("{s}: read_manifest: copy={} size={} offset={}", .{
1093
1146
  @tagName(context.caller),
1094
- context.copy,
1147
+ context.copy.?,
1095
1148
  buffer.len,
1096
1149
  offset,
1097
1150
  });
@@ -1115,6 +1168,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
1115
1168
  fn read_manifest_callback(read: *Storage.Read) void {
1116
1169
  const context = @fieldParentPtr(Context, "read", read);
1117
1170
  const superblock = context.superblock;
1171
+ const copy = context.copy.?;
1118
1172
 
1119
1173
  assert(context.caller == .open);
1120
1174
  assert(superblock.queue_head == context);
@@ -1133,12 +1187,13 @@ pub fn SuperBlockType(comptime Storage: type) type {
1133
1187
  // TODO Repair any impaired copies before we continue.
1134
1188
  // At present, we repair at the next checkpoint.
1135
1189
  // We do not repair padding.
1136
- context.copy = starting_copy_for_sequence(superblock.working.sequence);
1190
+ context.copy = 0;
1137
1191
  superblock.read_free_set(context);
1138
- } else if (context.copy == stopping_copy_for_sequence(superblock.working.sequence)) {
1192
+ } else if (copy + 1 == config.superblock_copies) {
1139
1193
  @panic("superblock manifest lost");
1140
1194
  } else {
1141
- context.copy += 1;
1195
+ log.debug("open: read_manifest: corrupt copy={}", .{copy});
1196
+ context.copy = copy + 1;
1142
1197
  superblock.read_manifest(context);
1143
1198
  }
1144
1199
  }
@@ -1146,17 +1201,17 @@ pub fn SuperBlockType(comptime Storage: type) type {
1146
1201
  fn read_free_set(superblock: *SuperBlock, context: *Context) void {
1147
1202
  assert(context.caller == .open);
1148
1203
  assert(superblock.queue_head == context);
1149
- assert(context.copy < superblock_copies_max);
1204
+ assert(context.copy.? < config.superblock_copies);
1150
1205
 
1151
1206
  const size = vsr.sector_ceil(superblock.working.free_set_size);
1152
1207
  assert(size <= superblock_trailer_free_set_size_max);
1153
1208
 
1154
1209
  const buffer = superblock.free_set_buffer[0..size];
1155
- const offset = offset_free_set(context.copy, superblock.working.sequence);
1210
+ const offset = Layout.offset_free_set(context.copy.?);
1156
1211
 
1157
1212
  log.debug("{s}: read_free_set: copy={} size={} offset={}", .{
1158
1213
  @tagName(context.caller),
1159
- context.copy,
1214
+ context.copy.?,
1160
1215
  buffer.len,
1161
1216
  offset,
1162
1217
  });
@@ -1180,6 +1235,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
1180
1235
  fn read_free_set_callback(read: *Storage.Read) void {
1181
1236
  const context = @fieldParentPtr(Context, "read", read);
1182
1237
  const superblock = context.superblock;
1238
+ const copy = context.copy.?;
1183
1239
 
1184
1240
  assert(context.caller == .open);
1185
1241
  assert(superblock.queue_head == context);
@@ -1199,10 +1255,11 @@ pub fn SuperBlockType(comptime Storage: type) type {
1199
1255
 
1200
1256
  // TODO Repair any impaired copies before we continue.
1201
1257
  superblock.read_client_table(context);
1202
- } else if (context.copy == stopping_copy_for_sequence(superblock.working.sequence)) {
1258
+ } else if (copy + 1 == config.superblock_copies) {
1203
1259
  @panic("superblock free set lost");
1204
1260
  } else {
1205
- context.copy += 1;
1261
+ log.debug("open: read_free_set: corrupt copy={}", .{copy});
1262
+ context.copy = copy + 1;
1206
1263
  superblock.read_free_set(context);
1207
1264
  }
1208
1265
  }
@@ -1217,17 +1274,17 @@ pub fn SuperBlockType(comptime Storage: type) type {
1217
1274
  fn read_client_table(superblock: *SuperBlock, context: *Context) void {
1218
1275
  assert(context.caller == .open);
1219
1276
  assert(superblock.queue_head == context);
1220
- assert(context.copy < superblock_copies_max);
1277
+ assert(context.copy.? < config.superblock_copies);
1221
1278
 
1222
1279
  const size = vsr.sector_ceil(superblock.working.client_table_size);
1223
1280
  assert(size <= superblock_trailer_client_table_size_max);
1224
1281
 
1225
1282
  const buffer = superblock.client_table_buffer[0..size];
1226
- const offset = offset_client_table(context.copy, superblock.working.sequence);
1283
+ const offset = Layout.offset_client_table(context.copy.?);
1227
1284
 
1228
1285
  log.debug("{s}: read_client_table: copy={} size={} offset={}", .{
1229
1286
  @tagName(context.caller),
1230
- context.copy,
1287
+ context.copy.?,
1231
1288
  buffer.len,
1232
1289
  offset,
1233
1290
  });
@@ -1251,6 +1308,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
1251
1308
  fn read_client_table_callback(read: *Storage.Read) void {
1252
1309
  const context = @fieldParentPtr(Context, "read", read);
1253
1310
  const superblock = context.superblock;
1311
+ const copy = context.copy.?;
1254
1312
 
1255
1313
  assert(context.caller == .open);
1256
1314
  assert(superblock.queue_head == context);
@@ -1266,16 +1324,33 @@ pub fn SuperBlockType(comptime Storage: type) type {
1266
1324
  config.clients_max,
1267
1325
  });
1268
1326
 
1269
- // TODO Repair any impaired copies before we continue.
1270
- superblock.release(context);
1271
- } else if (context.copy == stopping_copy_for_sequence(superblock.working.sequence)) {
1327
+ context.copy = null;
1328
+ superblock.repair(context);
1329
+ } else if (copy + 1 == config.superblock_copies) {
1272
1330
  @panic("superblock client table lost");
1273
1331
  } else {
1274
- context.copy += 1;
1332
+ log.debug("open: read_client_table: corrupt copy={}", .{copy});
1333
+ context.copy = copy + 1;
1275
1334
  superblock.read_client_table(context);
1276
1335
  }
1277
1336
  }
1278
1337
 
1338
+ fn repair(superblock: *SuperBlock, context: *Context) void {
1339
+ assert(context.caller == .open);
1340
+ assert(context.copy == null);
1341
+ assert(superblock.queue_head == context);
1342
+
1343
+ if (context.repairs.?.next()) |repair_copy| {
1344
+ context.copy = repair_copy;
1345
+ log.warn("repair: copy={}", .{repair_copy});
1346
+
1347
+ superblock.staging.* = superblock.working.*;
1348
+ superblock.write_manifest(context);
1349
+ } else {
1350
+ superblock.release(context);
1351
+ }
1352
+ }
1353
+
1279
1354
  fn acquire(superblock: *SuperBlock, context: *Context) void {
1280
1355
  if (superblock.queue_head) |head| {
1281
1356
  // There should be nothing else happening when we format() or open():
@@ -1298,11 +1373,10 @@ pub fn SuperBlockType(comptime Storage: type) type {
1298
1373
  superblock.queue_head = context;
1299
1374
  log.debug("{s}: started", .{@tagName(context.caller)});
1300
1375
 
1301
- switch (context.caller) {
1302
- .format => superblock.write_staging(context),
1303
- .open => superblock.read_working(context),
1304
- .checkpoint => superblock.write_staging(context),
1305
- .view_change => superblock.write_view_change(context),
1376
+ if (context.caller == .open) {
1377
+ superblock.read_working(context, .open);
1378
+ } else {
1379
+ superblock.write_staging(context);
1306
1380
  }
1307
1381
  }
1308
1382
  }
@@ -1312,19 +1386,24 @@ pub fn SuperBlockType(comptime Storage: type) type {
1312
1386
 
1313
1387
  log.debug("{s}: complete", .{@tagName(context.caller)});
1314
1388
 
1315
- if (context.caller == .open) {
1316
- assert(!superblock.opened);
1317
- superblock.opened = true;
1389
+ switch (context.caller) {
1390
+ .format => {},
1391
+ .open => {
1392
+ assert(!superblock.opened);
1393
+ superblock.opened = true;
1318
1394
 
1319
- if (superblock.working.manifest_size > 0) {
1320
- assert(superblock.manifest.count > 0);
1321
- }
1322
- if (superblock.working.free_set_size > @sizeOf(usize)) {
1323
- assert(superblock.free_set.count_acquired() > 0);
1324
- }
1325
- } else if (context.caller == .view_change) {
1326
- assert(meta.eql(superblock.working.vsr_state, context.vsr_state));
1327
- assert(meta.eql(superblock.staging.vsr_state, context.vsr_state));
1395
+ if (superblock.working.manifest_size > 0) {
1396
+ assert(superblock.manifest.count > 0);
1397
+ }
1398
+ // TODO Make the FreeSet encoding format not dependant on the word size.
1399
+ if (superblock.working.free_set_size > @sizeOf(usize)) {
1400
+ assert(superblock.free_set.count_acquired() > 0);
1401
+ }
1402
+ },
1403
+ .checkpoint, .view_change => {
1404
+ assert(meta.eql(superblock.staging.vsr_state, context.vsr_state.?));
1405
+ assert(meta.eql(superblock.working.vsr_state, context.vsr_state.?));
1406
+ },
1328
1407
  }
1329
1408
 
1330
1409
  const queue_tail = superblock.queue_tail;
@@ -1340,40 +1419,6 @@ pub fn SuperBlockType(comptime Storage: type) type {
1340
1419
  assert(offset + size <= superblock.storage_offset + superblock.storage_size);
1341
1420
  }
1342
1421
 
1343
- fn offset_manifest(copy: u8, sequence: u64) u64 {
1344
- assert(copy >= starting_copy_for_sequence(sequence));
1345
- assert(copy <= stopping_copy_for_sequence(sequence));
1346
-
1347
- return superblock_size * copy + @sizeOf(SuperBlockSector);
1348
- }
1349
-
1350
- fn offset_free_set(copy: u8, sequence: u64) u64 {
1351
- assert(copy >= starting_copy_for_sequence(sequence));
1352
- assert(copy <= stopping_copy_for_sequence(sequence));
1353
-
1354
- return superblock_size * copy + @sizeOf(SuperBlockSector) +
1355
- superblock_trailer_manifest_size_max;
1356
- }
1357
-
1358
- fn offset_client_table(copy: u8, sequence: u64) u64 {
1359
- assert(copy >= starting_copy_for_sequence(sequence));
1360
- assert(copy <= stopping_copy_for_sequence(sequence));
1361
-
1362
- return superblock_size * copy + @sizeOf(SuperBlockSector) +
1363
- superblock_trailer_manifest_size_max +
1364
- superblock_trailer_free_set_size_max;
1365
- }
1366
-
1367
- /// Returns the first copy index (inclusive) to be written for a sequence number.
1368
- fn starting_copy_for_sequence(sequence: u64) u8 {
1369
- return config.superblock_copies * @intCast(u8, sequence % 2);
1370
- }
1371
-
1372
- /// Returns the last copy index (inclusive) to be written for a sequence number.
1373
- fn stopping_copy_for_sequence(sequence: u64) u8 {
1374
- return starting_copy_for_sequence(sequence) + config.superblock_copies - 1;
1375
- }
1376
-
1377
1422
  /// We use flexible quorums for even quorums with write quorum > read quorum, for example:
1378
1423
  /// * When writing, we must verify that at least 3/4 copies were written.
1379
1424
  /// * At startup, we must verify that at least 2/4 copies were read.
@@ -1404,217 +1449,25 @@ pub fn SuperBlockType(comptime Storage: type) type {
1404
1449
  };
1405
1450
  }
1406
1451
 
1407
- const Quorums = struct {
1408
- const Quorum = struct {
1409
- sector: *const SuperBlockSector,
1410
- count: QuorumCount = QuorumCount.initEmpty(),
1411
- valid: bool = false,
1412
- };
1413
-
1414
- const QuorumCount = std.StaticBitSet(superblock_copies_max);
1415
-
1416
- array: [superblock_copies_max]Quorum = undefined,
1417
- count: u8 = 0,
1418
-
1419
- pub const Error = error{
1420
- NotFound,
1421
- QuorumLost,
1422
- ParentNotFound,
1423
- ParentQuorumLost,
1424
- SequenceNotMonotonic,
1425
- VSRStateNotMonotonic,
1426
- };
1427
-
1428
- /// Returns the working superblock according to the quorum with the highest sequence number.
1429
- /// Verifies that the highest quorum is connected, that the previous quorum was not lost.
1430
- /// i.e. Both the working and previous quorum must be valid and intact and connected.
1431
- /// Otherwise, we might regress to a previous working superblock.
1432
- pub fn working(
1433
- quorums: *Quorums,
1434
- copies: []SuperBlockSector,
1435
- threshold: u8,
1436
- ) Error!*const SuperBlockSector {
1437
- assert(copies.len == superblock_copies_max);
1438
- assert(threshold >= 2 and threshold <= 5);
1439
-
1440
- quorums.array = undefined;
1441
- quorums.count = 0;
1442
-
1443
- for (copies) |*copy, index| quorums.count_copy(copy, index, threshold);
1444
-
1445
- std.sort.sort(Quorum, quorums.slice(), {}, sort_priority_descending);
1446
-
1447
- for (quorums.slice()) |quorum| {
1448
- if (quorum.count.count() == config.superblock_copies) {
1449
- log.debug("quorum: checksum={x} parent={x} sequence={} count={} valid={}", .{
1450
- quorum.sector.checksum,
1451
- quorum.sector.parent,
1452
- quorum.sector.sequence,
1453
- quorum.count.count(),
1454
- quorum.valid,
1455
- });
1456
- } else {
1457
- log.err("quorum: checksum={x} parent={x} sequence={} count={} valid={}", .{
1458
- quorum.sector.checksum,
1459
- quorum.sector.parent,
1460
- quorum.sector.sequence,
1461
- quorum.count.count(),
1462
- quorum.valid,
1463
- });
1464
- }
1465
- }
1466
-
1467
- // No working copies of any sequence number exist in the superblock storage zone at all.
1468
- if (quorums.slice().len == 0) return error.NotFound;
1469
-
1470
- // At least one copy or quorum exists.
1471
- const b = quorums.slice()[0];
1472
-
1473
- // Verify that the remaining quorums are correctly sorted:
1474
- for (quorums.slice()[1..]) |a| {
1475
- assert(sort_priority_descending({}, b, a));
1476
- assert(a.sector.magic == .superblock);
1477
- assert(a.sector.valid_checksum());
1478
- }
1479
-
1480
- // Even the best copy with the most quorum still has inadequate quorum.
1481
- if (!b.valid) return error.QuorumLost;
1482
-
1483
- // The superblock is only partially formatted, not all copies were written.
1484
- if (b.sector.sequence < 2) return error.NotFound;
1485
-
1486
- // Verify that the parent copy exists:
1487
- for (quorums.slice()[1..]) |a| {
1488
- if (a.sector.cluster != b.sector.cluster) {
1489
- log.err("superblock copy={} has cluster={} instead of {}", .{
1490
- a.sector.copy,
1491
- a.sector.cluster,
1492
- b.sector.cluster,
1493
- });
1494
- } else if (a.sector.replica != b.sector.replica) {
1495
- log.err("superblock copy={} has replica={} instead of {}", .{
1496
- a.sector.copy,
1497
- a.sector.replica,
1498
- b.sector.replica,
1499
- });
1500
- } else if (a.sector.checksum == b.sector.parent) {
1501
- assert(a.sector.checksum != b.sector.checksum);
1502
- assert(a.sector.cluster == b.sector.cluster);
1503
- assert(a.sector.replica == b.sector.replica);
1504
-
1505
- if (!a.valid) {
1506
- return error.ParentQuorumLost;
1507
- } else if (a.sector.sequence >= b.sector.sequence) {
1508
- return error.SequenceNotMonotonic;
1509
- } else if (a.sector.sequence % 2 == b.sector.sequence % 2) {
1510
- // The parent must reside in the alternate copy to guarantee that we are able to
1511
- // detect when the working quorum is lost.
1512
- return error.SequenceNotMonotonic;
1513
- } else if (!a.sector.vsr_state.monotonic(b.sector.vsr_state)) {
1514
- return error.VSRStateNotMonotonic;
1515
- } else {
1516
- assert(b.sector.magic == .superblock);
1517
- assert(b.sector.valid_checksum());
1518
-
1519
- return b.sector;
1520
- }
1521
- }
1522
- }
1523
-
1524
- return error.ParentNotFound;
1452
+ pub const Layout = struct {
1453
+ pub fn offset_sector(copy: u8) u64 {
1454
+ assert(copy < config.superblock_copies);
1455
+ return superblock_copy_size * @as(u64, copy);
1525
1456
  }
1526
1457
 
1527
- fn count_copy(
1528
- quorums: *Quorums,
1529
- copy: *const SuperBlockSector,
1530
- index: usize,
1531
- threshold: u8,
1532
- ) void {
1533
- assert(index < superblock_copies_max);
1534
- assert(threshold >= 2 and threshold <= 5);
1535
-
1536
- if (!copy.valid_checksum()) {
1537
- log.debug("copy: {}/{}: invalid checksum", .{ index, superblock_copies_max });
1538
- return;
1539
- }
1540
-
1541
- if (copy.magic != .superblock) {
1542
- log.debug("copy: {}/{}: not a superblock", .{ index, superblock_copies_max });
1543
- return;
1544
- }
1545
-
1546
- if (copy.copy == index) {
1547
- log.debug("copy: {}/{}: checksum={x} parent={x} sequence={}", .{
1548
- index,
1549
- superblock_copies_max,
1550
- copy.checksum,
1551
- copy.parent,
1552
- copy.sequence,
1553
- });
1554
- } else {
1555
- // If our read was misdirected, we definitely still want to count the copy.
1556
- // We must just be careful to count it idempotently.
1557
- log.err(
1558
- "copy: {}/{}: checksum={x} parent={x} sequence={} misdirected from copy={}",
1559
- .{
1560
- index,
1561
- superblock_copies_max,
1562
- copy.checksum,
1563
- copy.parent,
1564
- copy.sequence,
1565
- copy.copy,
1566
- },
1567
- );
1568
- }
1569
-
1570
- var quorum = quorums.find_or_insert_quorum_for_copy(copy);
1571
- assert(quorum.sector.checksum == copy.checksum);
1572
- assert(quorum.sector.equal(copy));
1573
-
1574
- quorum.count.set(copy.copy);
1575
- assert(quorum.count.isSet(copy.copy));
1576
-
1577
- // In the worst case, all copies may contain divergent forks of the same sequence.
1578
- // However, this should not happen for the same checksum.
1579
- assert(quorum.count.count() <= config.superblock_copies);
1580
-
1581
- quorum.valid = quorum.count.count() >= threshold;
1582
- }
1583
-
1584
- fn find_or_insert_quorum_for_copy(quorums: *Quorums, copy: *const SuperBlockSector) *Quorum {
1585
- assert(copy.magic == .superblock);
1586
- assert(copy.valid_checksum());
1587
-
1588
- for (quorums.array[0..quorums.count]) |*quorum| {
1589
- if (copy.checksum == quorum.sector.checksum) return quorum;
1590
- } else {
1591
- quorums.array[quorums.count] = Quorum{ .sector = copy };
1592
- quorums.count += 1;
1593
-
1594
- return &quorums.array[quorums.count - 1];
1595
- }
1458
+ pub fn offset_manifest(copy: u8) u64 {
1459
+ assert(copy < config.superblock_copies);
1460
+ return offset_sector(copy) + @sizeOf(SuperBlockSector);
1596
1461
  }
1597
1462
 
1598
- fn slice(quorums: *Quorums) []Quorum {
1599
- return quorums.array[0..quorums.count];
1463
+ pub fn offset_free_set(copy: u8) u64 {
1464
+ assert(copy < config.superblock_copies);
1465
+ return offset_manifest(copy) + superblock_trailer_manifest_size_max;
1600
1466
  }
1601
1467
 
1602
- fn sort_priority_descending(_: void, a: Quorum, b: Quorum) bool {
1603
- assert(a.sector.checksum != b.sector.checksum);
1604
- assert(a.sector.magic == .superblock);
1605
- assert(b.sector.magic == .superblock);
1606
-
1607
- if (a.valid and !b.valid) return true;
1608
- if (b.valid and !a.valid) return false;
1609
-
1610
- if (a.sector.sequence > b.sector.sequence) return true;
1611
- if (b.sector.sequence > a.sector.sequence) return false;
1612
-
1613
- if (a.count.count() > b.count.count()) return true;
1614
- if (b.count.count() > a.count.count()) return false;
1615
-
1616
- // The sort order must be stable and deterministic:
1617
- return a.sector.checksum > b.sector.checksum;
1468
+ pub fn offset_client_table(copy: u8) u64 {
1469
+ assert(copy < config.superblock_copies);
1470
+ return offset_free_set(copy) + superblock_trailer_free_set_size_max;
1618
1471
  }
1619
1472
  };
1620
1473
 
@@ -1633,111 +1486,3 @@ test "SuperBlockSector" {
1633
1486
  a.replica += 1;
1634
1487
  try expect(!a.valid_checksum());
1635
1488
  }
1636
-
1637
- // TODO Add unit tests for Quorums.
1638
- // TODO Test invariants and transitions across TestRunner functions.
1639
- // TODO Add a pristine in-memory test storage shim (we currently use real disk).
1640
- const TestStorage = @import("../storage.zig").Storage;
1641
- const TestSuperBlock = SuperBlockType(TestStorage);
1642
-
1643
- const TestRunner = struct {
1644
- superblock: *TestSuperBlock,
1645
- context_format: TestSuperBlock.Context = undefined,
1646
- context_open: TestSuperBlock.Context = undefined,
1647
- context_checkpoint: TestSuperBlock.Context = undefined,
1648
- context_view_change: TestSuperBlock.Context = undefined,
1649
- pending: usize = 0,
1650
-
1651
- fn format(runner: *TestRunner, options: TestSuperBlock.FormatOptions) void {
1652
- runner.pending += 1;
1653
- runner.superblock.format(format_callback, &runner.context_format, options);
1654
- }
1655
-
1656
- fn format_callback(context: *TestSuperBlock.Context) void {
1657
- const runner = @fieldParentPtr(TestRunner, "context_format", context);
1658
- runner.pending -= 1;
1659
- runner.open();
1660
- }
1661
-
1662
- fn open(runner: *TestRunner) void {
1663
- runner.pending += 1;
1664
- runner.superblock.open(open_callback, &runner.context_open);
1665
- }
1666
-
1667
- fn open_callback(context: *TestSuperBlock.Context) void {
1668
- const runner = @fieldParentPtr(TestRunner, "context_open", context);
1669
- runner.pending -= 1;
1670
- runner.checkpoint();
1671
- runner.view_change();
1672
- }
1673
-
1674
- fn view_change(runner: *TestRunner) void {
1675
- runner.pending += 1;
1676
- runner.superblock.view_change(
1677
- view_change_callback,
1678
- &runner.context_view_change,
1679
- .{
1680
- .commit_min = runner.superblock.working.vsr_state.commit_min + 1,
1681
- .commit_max = runner.superblock.working.vsr_state.commit_max + 2,
1682
- .view_normal = runner.superblock.working.vsr_state.view_normal + 3,
1683
- .view = runner.superblock.working.vsr_state.view + 4,
1684
- },
1685
- );
1686
- }
1687
-
1688
- fn view_change_callback(context: *TestSuperBlock.Context) void {
1689
- const runner = @fieldParentPtr(TestRunner, "context_view_change", context);
1690
- runner.pending -= 1;
1691
- runner.checkpoint();
1692
- }
1693
-
1694
- fn checkpoint(runner: *TestRunner) void {
1695
- runner.pending += 1;
1696
- runner.superblock.checkpoint(checkpoint_callback, &runner.context_checkpoint);
1697
- }
1698
-
1699
- fn checkpoint_callback(context: *TestSuperBlock.Context) void {
1700
- const runner = @fieldParentPtr(TestRunner, "context_checkpoint", context);
1701
- runner.pending -= 1;
1702
- }
1703
- };
1704
-
1705
- pub fn main() !void {
1706
- const testing = std.testing;
1707
- const allocator = testing.allocator;
1708
-
1709
- const IO = @import("../io.zig").IO;
1710
- const Storage = @import("../storage.zig").Storage;
1711
-
1712
- const dir_path = ".";
1713
- const dir_fd = os.openZ(dir_path, os.O.CLOEXEC | os.O.RDONLY, 0) catch |err| {
1714
- std.debug.print("failed to open directory '{s}': {}", .{ dir_path, err });
1715
- return;
1716
- };
1717
-
1718
- const cluster = 32;
1719
- const replica = 4;
1720
- const size_max = 512 * 1024 * 1024;
1721
-
1722
- const storage_fd = try Storage.open(dir_fd, "test_superblock", size_max, true);
1723
- defer std.fs.cwd().deleteFile("test_superblock") catch {};
1724
-
1725
- var io = try IO.init(128, 0);
1726
- defer io.deinit();
1727
-
1728
- var storage = try Storage.init(&io, size_max, storage_fd);
1729
- defer storage.deinit();
1730
-
1731
- var superblock = try TestSuperBlock.init(allocator, &storage);
1732
- defer superblock.deinit(allocator);
1733
-
1734
- var runner = TestRunner{ .superblock = &superblock };
1735
-
1736
- runner.format(.{
1737
- .cluster = cluster,
1738
- .replica = replica,
1739
- .size_max = size_max,
1740
- });
1741
-
1742
- while (runner.pending > 0) try io.run_for_ns(100);
1743
- }