tigerbeetle-node 0.11.8 → 0.11.9

This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
Files changed (83)
  1. package/dist/.client.node.sha256 +1 -1
  2. package/package.json +4 -3
  3. package/scripts/build_lib.sh +29 -0
  4. package/src/node.zig +1 -1
  5. package/src/tigerbeetle/scripts/validate_docs.sh +7 -1
  6. package/src/tigerbeetle/src/benchmark.zig +3 -3
  7. package/src/tigerbeetle/src/config.zig +29 -16
  8. package/src/tigerbeetle/src/constants.zig +30 -9
  9. package/src/tigerbeetle/src/ewah.zig +5 -5
  10. package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
  11. package/src/tigerbeetle/src/lsm/binary_search.zig +1 -1
  12. package/src/tigerbeetle/src/lsm/bloom_filter.zig +1 -1
  13. package/src/tigerbeetle/src/lsm/compaction.zig +34 -21
  14. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +85 -103
  15. package/src/tigerbeetle/src/lsm/grid.zig +19 -13
  16. package/src/tigerbeetle/src/lsm/manifest_log.zig +8 -10
  17. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +12 -8
  18. package/src/tigerbeetle/src/lsm/merge_iterator.zig +1 -1
  19. package/src/tigerbeetle/src/lsm/segmented_array.zig +17 -17
  20. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +1 -1
  21. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +1 -1
  22. package/src/tigerbeetle/src/lsm/table.zig +8 -20
  23. package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
  24. package/src/tigerbeetle/src/lsm/table_iterator.zig +3 -3
  25. package/src/tigerbeetle/src/lsm/table_mutable.zig +14 -2
  26. package/src/tigerbeetle/src/lsm/tree.zig +31 -5
  27. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +86 -114
  28. package/src/tigerbeetle/src/message_bus.zig +4 -4
  29. package/src/tigerbeetle/src/message_pool.zig +7 -10
  30. package/src/tigerbeetle/src/ring_buffer.zig +22 -12
  31. package/src/tigerbeetle/src/simulator.zig +360 -214
  32. package/src/tigerbeetle/src/state_machine/auditor.zig +5 -5
  33. package/src/tigerbeetle/src/state_machine/workload.zig +3 -3
  34. package/src/tigerbeetle/src/state_machine.zig +190 -178
  35. package/src/tigerbeetle/src/{util.zig → stdx.zig} +2 -0
  36. package/src/tigerbeetle/src/storage.zig +13 -6
  37. package/src/tigerbeetle/src/{test → testing/cluster}/message_bus.zig +3 -3
  38. package/src/tigerbeetle/src/{test → testing/cluster}/network.zig +46 -22
  39. package/src/tigerbeetle/src/testing/cluster/state_checker.zig +169 -0
  40. package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +202 -0
  41. package/src/tigerbeetle/src/testing/cluster.zig +537 -0
  42. package/src/tigerbeetle/src/{test → testing}/fuzz.zig +0 -0
  43. package/src/tigerbeetle/src/testing/hash_log.zig +66 -0
  44. package/src/tigerbeetle/src/{test → testing}/id.zig +0 -0
  45. package/src/tigerbeetle/src/testing/packet_simulator.zig +365 -0
  46. package/src/tigerbeetle/src/{test → testing}/priority_queue.zig +1 -1
  47. package/src/tigerbeetle/src/testing/reply_sequence.zig +139 -0
  48. package/src/tigerbeetle/src/{test → testing}/state_machine.zig +3 -1
  49. package/src/tigerbeetle/src/testing/storage.zig +754 -0
  50. package/src/tigerbeetle/src/{test → testing}/table.zig +21 -0
  51. package/src/tigerbeetle/src/{test → testing}/time.zig +0 -0
  52. package/src/tigerbeetle/src/tigerbeetle.zig +2 -0
  53. package/src/tigerbeetle/src/tracer.zig +3 -3
  54. package/src/tigerbeetle/src/unit_tests.zig +4 -4
  55. package/src/tigerbeetle/src/vopr.zig +2 -2
  56. package/src/tigerbeetle/src/vsr/client.zig +5 -2
  57. package/src/tigerbeetle/src/vsr/clock.zig +93 -53
  58. package/src/tigerbeetle/src/vsr/journal.zig +29 -14
  59. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +2 -2
  60. package/src/tigerbeetle/src/vsr/replica.zig +1383 -774
  61. package/src/tigerbeetle/src/vsr/replica_format.zig +2 -2
  62. package/src/tigerbeetle/src/vsr/superblock.zig +59 -43
  63. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -7
  64. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +1 -1
  65. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
  66. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +15 -7
  67. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +38 -19
  68. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +1 -1
  69. package/src/tigerbeetle/src/vsr.zig +6 -4
  70. package/src/tigerbeetle/src/demo.zig +0 -132
  71. package/src/tigerbeetle/src/demo_01_create_accounts.zig +0 -35
  72. package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +0 -7
  73. package/src/tigerbeetle/src/demo_03_create_transfers.zig +0 -37
  74. package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +0 -61
  75. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +0 -37
  76. package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +0 -24
  77. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +0 -7
  78. package/src/tigerbeetle/src/test/cluster.zig +0 -352
  79. package/src/tigerbeetle/src/test/conductor.zig +0 -366
  80. package/src/tigerbeetle/src/test/packet_simulator.zig +0 -398
  81. package/src/tigerbeetle/src/test/state_checker.zig +0 -169
  82. package/src/tigerbeetle/src/test/storage.zig +0 -864
  83. package/src/tigerbeetle/src/test/storage_checker.zig +0 -204
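Beyond the simulator and replica changes, two structural moves in this list touch most of the tree: src/util.zig was renamed to src/stdx.zig (entry 35), and the src/test/ helpers moved under src/testing/ (entries 37–51). A hypothetical import-site sketch of that migration — illustrative only, not code taken from the package:

    // 0.11.8 — helpers lived in src/util.zig and src/test/:
    //     const util = @import("util.zig");
    //     const Storage = @import("test/storage.zig").Storage;
    // 0.11.9 — the same imports after the rename and the moves:
    const stdx = @import("stdx.zig");
    const Storage = @import("testing/storage.zig").Storage;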
package/src/tigerbeetle/src/test/storage.zig (deleted)
@@ -1,864 +0,0 @@
- //! In-memory storage, with simulated faults and latency.
- //!
- //!
- //! Fault Injection
- //!
- //! Storage injects faults that the cluster can (i.e. should be able to) recover from.
- //! Each zone can tolerate a different pattern of faults.
- //!
- //! - superblock:
- //!   - One read/write fault is permitted per area (section, manifest, …).
- //!   - An additional fault is permitted at the target of a pending write during a crash.
- //!
- //! - wal_headers, wal_prepares:
- //!   - Read/write faults are distributed between replicas according to FaultyAreas, to ensure
- //!     that at least one replica will have a valid copy to help others repair.
- //!     (See: generate_faulty_wal_areas()).
- //!   - When a replica crashes, it may fault the WAL outside of FaultyAreas.
- //!   - When replica_count=1, its WAL can only be corrupted by a crash, never a read/write.
- //!     (When replica_count=1, there are no other replicas to assist with repair).
- //!
- //! - grid: (TODO: Enable grid faults when grid repair is implemented).
- //!
- const std = @import("std");
- const assert = std.debug.assert;
- const math = std.math;
- const mem = std.mem;
-
- const FIFO = @import("../fifo.zig").FIFO;
- const constants = @import("../constants.zig");
- const vsr = @import("../vsr.zig");
- const superblock = @import("../vsr/superblock.zig");
- const BlockType = @import("../lsm/grid.zig").BlockType;
- const util = @import("../util.zig");
-
- const log = std.log.scoped(.storage);
-
- // TODOs:
- // less than a majority of replicas may have corruption
- // have an option to enable/disable the following corruption types:
- // bitrot
- // misdirected read/write
- // corrupt sector
- // latent sector error
- // - emulate by zeroing sector, as this is how we handle this in the real Storage implementation
- // - likely that surrounding sectors also corrupt
- // - likely that stuff written at the same time is also corrupt even if written to a far away sector
- pub const Storage = struct {
-     /// Options for fault injection during fuzz testing
-     pub const Options = struct {
-         /// Seed for the storage PRNG.
-         seed: u64 = 0,
-
-         /// (Only used for logging.)
-         replica_index: ?u8 = null,
-
-         /// Minimum number of ticks it may take to read data.
-         read_latency_min: u64,
-         /// Average number of ticks it may take to read data. Must be >= read_latency_min.
-         read_latency_mean: u64,
-         /// Minimum number of ticks it may take to write data.
-         write_latency_min: u64,
-         /// Average number of ticks it may take to write data. Must be >= write_latency_min.
-         write_latency_mean: u64,
-
-         /// Chance out of 100 that a read will corrupt a sector, if the target memory is within
-         /// a faulty area of this replica.
-         read_fault_probability: u8 = 0,
-         /// Chance out of 100 that a write will corrupt a sector, if the target memory is within
-         /// a faulty area of this replica.
-         write_fault_probability: u8 = 0,
-         /// Chance out of 100 that a crash will corrupt a sector of a pending write's target,
-         /// if the target memory is within a faulty area of this replica.
-         crash_fault_probability: u8 = 0,
-
-         /// Enable/disable SuperBlock zone faults.
-         faulty_superblock: bool = false,
-
-         // In the WAL, we can't allow storage faults for the same message in a majority of
-         // the replicas as that would make recovery impossible. Instead, we only
-         // allow faults in certain areas which differ between replicas.
-         faulty_wal_areas: ?FaultyAreas = null,
-     };
-
-     /// See usage in Journal.write_sectors() for details.
-     /// TODO: allow testing in both modes.
-     pub const synchronicity: enum {
-         always_synchronous,
-         always_asynchronous,
-     } = .always_asynchronous;
-
-     pub const Read = struct {
-         callback: fn (read: *Storage.Read) void,
-         buffer: []u8,
-         zone: vsr.Zone,
-         /// Relative offset within the zone.
-         offset: u64,
-         /// Tick at which this read is considered "completed" and the callback should be called.
-         done_at_tick: u64,
-
-         fn less_than(context: void, a: *Read, b: *Read) math.Order {
-             _ = context;
-
-             return math.order(a.done_at_tick, b.done_at_tick);
-         }
-     };
-
-     pub const Write = struct {
-         callback: fn (write: *Storage.Write) void,
-         buffer: []const u8,
-         zone: vsr.Zone,
-         /// Relative offset within the zone.
-         offset: u64,
-         /// Tick at which this write is considered "completed" and the callback should be called.
-         done_at_tick: u64,
-
-         fn less_than(context: void, a: *Write, b: *Write) math.Order {
-             _ = context;
-
-             return math.order(a.done_at_tick, b.done_at_tick);
-         }
-     };
-
-     pub const NextTick = struct {
-         next: ?*NextTick = null,
-         callback: fn (next_tick: *NextTick) void,
-     };
-
-     /// Faulty areas are always sized to message_size_max
-     /// If the faulty areas of all replicas are superimposed, the padding between them is always message_size_max.
-     /// For a single replica, the padding between faulty areas depends on the number of other replicas.
-     pub const FaultyAreas = struct {
-         first_offset: u64,
-         period: u64,
-     };
-
-     allocator: mem.Allocator,
-
-     memory: []align(constants.sector_size) u8,
-     /// Set bits correspond to sectors that have ever been written to.
-     memory_written: std.DynamicBitSetUnmanaged,
-     /// Set bits correspond to faulty sectors. The underlying sectors of `memory` is left clean.
-     faults: std.DynamicBitSetUnmanaged,
-
-     size: u64,
-
-     options: Options,
-     prng: std.rand.DefaultPrng,
-
-     /// Whether to enable faults (when false, this supersedes `faulty_wal_areas`).
-     /// This is used to disable faults during the replica's first startup.
-     faulty: bool = true,
-
-     reads: std.PriorityQueue(*Storage.Read, void, Storage.Read.less_than),
-     writes: std.PriorityQueue(*Storage.Write, void, Storage.Write.less_than),
-
-     ticks: u64 = 0,
-     next_tick_queue: FIFO(NextTick) = .{},
-
-     pub fn init(allocator: mem.Allocator, size: u64, options: Storage.Options) !Storage {
-         assert(options.write_latency_mean >= options.write_latency_min);
-         assert(options.read_latency_mean >= options.read_latency_min);
-
-         const memory = try allocator.allocAdvanced(u8, constants.sector_size, size, .exact);
-         errdefer allocator.free(memory);
-         // TODO: random data
-         mem.set(u8, memory, 0);
-
-         var memory_written = try std.DynamicBitSetUnmanaged.initEmpty(
-             allocator,
-             @divExact(size, constants.sector_size),
-         );
-         errdefer memory_written.deinit(allocator);
-
-         var faults = try std.DynamicBitSetUnmanaged.initEmpty(
-             allocator,
-             @divExact(size, constants.sector_size),
-         );
-         errdefer faults.deinit(allocator);
-
-         var reads = std.PriorityQueue(*Storage.Read, void, Storage.Read.less_than).init(allocator, {});
-         errdefer reads.deinit();
-         try reads.ensureTotalCapacity(constants.iops_read_max);
-
-         var writes = std.PriorityQueue(*Storage.Write, void, Storage.Write.less_than).init(allocator, {});
-         errdefer writes.deinit();
-         try writes.ensureTotalCapacity(constants.iops_write_max);
-
-         return Storage{
-             .allocator = allocator,
-             .memory = memory,
-             .memory_written = memory_written,
-             .faults = faults,
-             .size = size,
-             .options = options,
-             .prng = std.rand.DefaultPrng.init(options.seed),
-             .reads = reads,
-             .writes = writes,
-         };
-     }
-
-     /// Cancel any currently in-progress reads/writes.
-     /// Corrupt the target sectors of any in-progress writes.
-     pub fn reset(storage: *Storage) void {
-         while (storage.writes.peek()) |_| {
-             const write = storage.writes.remove();
-             if (switch (write.zone) {
-                 .superblock => !storage.options.faulty_superblock,
-                 // On crash, the WAL may be corrupted outside of the FaultyAreas.
-                 .wal_headers, .wal_prepares => storage.options.faulty_wal_areas == null,
-                 // TODO Enable fault injection for grid.
-                 .grid => true,
-             }) continue;
-
-             if (!storage.x_in_100(storage.options.crash_fault_probability)) continue;
-
-             const sector_min = @divExact(write.zone.offset(write.offset), constants.sector_size);
-             const sector_max = sector_min + @divExact(write.buffer.len, constants.sector_size);
-
-             // Randomly corrupt one of the faulty sectors the operation targeted.
-             // TODO: inject more realistic and varied storage faults as described above.
-             storage.fault_sector(
-                 write.zone,
-                 storage.random_uint_between(usize, sector_min, sector_max),
-             );
-         }
-         assert(storage.writes.len == 0);
-
-         storage.reads.len = 0;
-     }
-
-     pub fn deinit(storage: *Storage, allocator: mem.Allocator) void {
-         assert(storage.next_tick_queue.empty());
-         allocator.free(storage.memory);
-         storage.memory_written.deinit(allocator);
-         storage.faults.deinit(allocator);
-         storage.reads.deinit();
-         storage.writes.deinit();
-     }
-
-     /// Returns the number of bytes that have been written to, assuming that (the simulated)
-     /// `fallocate()` creates a sparse file.
-     pub fn size_used(storage: *const Storage) usize {
-         return storage.memory_written.count() * constants.sector_size;
-     }
-
-     /// Copy state from `origin` to `storage`:
-     ///
-     /// - ticks
-     /// - memory
-     /// - occupied memory
-     /// - faulty sectors
-     /// - reads in-progress
-     /// - writes in-progress
-     ///
-     /// Both instances must have an identical size.
-     pub fn copy(storage: *Storage, origin: *const Storage) void {
-         assert(storage.size == origin.size);
-
-         storage.ticks = origin.ticks;
-         util.copy_disjoint(.exact, u8, storage.memory, origin.memory);
-         storage.memory_written.toggleSet(storage.memory_written);
-         storage.memory_written.toggleSet(origin.memory_written);
-         storage.faults.toggleSet(storage.faults);
-         storage.faults.toggleSet(origin.faults);
-
-         storage.reads.len = 0;
-         for (origin.reads.items[0..origin.reads.len]) |read| {
-             storage.reads.add(read) catch unreachable;
-         }
-
-         storage.writes.len = 0;
-         for (origin.writes.items[0..origin.writes.len]) |write| {
-             storage.writes.add(write) catch unreachable;
-         }
-     }
-
-     pub fn tick(storage: *Storage) void {
-         storage.ticks += 1;
-
-         while (storage.reads.peek()) |read| {
-             if (read.done_at_tick > storage.ticks) break;
-             _ = storage.reads.remove();
-             storage.read_sectors_finish(read);
-         }
-
-         while (storage.writes.peek()) |write| {
-             if (write.done_at_tick > storage.ticks) break;
-             _ = storage.writes.remove();
-             storage.write_sectors_finish(write);
-         }
-
-         var queue = storage.next_tick_queue;
-         storage.next_tick_queue = .{};
-         while (queue.pop()) |next_tick| next_tick.callback(next_tick);
-     }
-
-     pub fn on_next_tick(
-         storage: *Storage,
-         callback: fn (next_tick: *Storage.NextTick) void,
-         next_tick: *Storage.NextTick,
-     ) void {
-         next_tick.* = .{ .callback = callback };
-         storage.next_tick_queue.push(next_tick);
-     }
-
-     /// * Verifies that the read fits within the target sector.
-     /// * Verifies that the read targets sectors that have been written to.
-     pub fn read_sectors(
-         storage: *Storage,
-         callback: fn (read: *Storage.Read) void,
-         read: *Storage.Read,
-         buffer: []u8,
-         zone: vsr.Zone,
-         offset_in_zone: u64,
-     ) void {
-         if (zone.size()) |zone_size| {
-             assert(offset_in_zone + buffer.len <= zone_size);
-         }
-
-         const offset_in_storage = zone.offset(offset_in_zone);
-         storage.verify_bounds_and_alignment(buffer, offset_in_storage);
-
-         {
-             const sector_min = @divExact(offset_in_storage, constants.sector_size);
-             const sector_max = @divExact(offset_in_storage + buffer.len, constants.sector_size);
-             var sector: usize = sector_min;
-             while (sector < sector_max) : (sector += 1) {
-                 assert(storage.memory_written.isSet(sector));
-             }
-         }
-
-         read.* = .{
-             .callback = callback,
-             .buffer = buffer,
-             .zone = zone,
-             .offset = offset_in_zone,
-             .done_at_tick = storage.ticks + storage.read_latency(),
-         };
-
-         // We ensure the capacity is sufficient for constants.iops_read_max in init()
-         storage.reads.add(read) catch unreachable;
-     }
-
-     fn read_sectors_finish(storage: *Storage, read: *Storage.Read) void {
-         const offset_in_storage = read.zone.offset(read.offset);
-         util.copy_disjoint(
-             .exact,
-             u8,
-             read.buffer,
-             storage.memory[offset_in_storage..][0..read.buffer.len],
-         );
-
-         if (storage.x_in_100(storage.options.read_fault_probability)) {
-             storage.fault_faulty_sectors(read.zone, read.offset, read.buffer.len);
-         }
-
-         if (storage.faulty) {
-             // Corrupt faulty sectors.
-             const sector_min = @divExact(offset_in_storage, constants.sector_size);
-             const sector_max = @divExact(offset_in_storage + read.buffer.len, constants.sector_size);
-             var sector: usize = sector_min;
-             while (sector < sector_max) : (sector += 1) {
-                 if (storage.faults.isSet(sector)) {
-                     const faulty_sector_offset = (sector - sector_min) * constants.sector_size;
-                     const faulty_sector_bytes = read.buffer[faulty_sector_offset..][0..constants.sector_size];
-                     storage.prng.random().bytes(faulty_sector_bytes);
-                 }
-             }
-         }
-
-         read.callback(read);
-     }
-
-     pub fn write_sectors(
-         storage: *Storage,
-         callback: fn (write: *Storage.Write) void,
-         write: *Storage.Write,
-         buffer: []const u8,
-         zone: vsr.Zone,
-         offset_in_zone: u64,
-     ) void {
-         if (zone.size()) |zone_size| {
-             assert(offset_in_zone + buffer.len <= zone_size);
-         }
-
-         storage.verify_bounds_and_alignment(buffer, zone.offset(offset_in_zone));
-
-         // Verify that there are no concurrent overlapping writes.
-         var iterator = storage.writes.iterator();
-         while (iterator.next()) |other| {
-             if (other.zone != zone) continue;
-             assert(offset_in_zone + buffer.len <= other.offset or
-                 other.offset + other.buffer.len <= offset_in_zone);
-         }
-
-         switch (zone) {
-             .superblock => storage.verify_write_superblock(buffer, offset_in_zone),
-             .wal_headers => {
-                 for (std.mem.bytesAsSlice(vsr.Header, buffer)) |header| {
-                     storage.verify_write_wal_header(header);
-                 }
-             },
-             else => {},
-         }
-
-         write.* = .{
-             .callback = callback,
-             .buffer = buffer,
-             .zone = zone,
-             .offset = offset_in_zone,
-             .done_at_tick = storage.ticks + storage.write_latency(),
-         };
-
-         // We ensure the capacity is sufficient for constants.iops_write_max in init()
-         storage.writes.add(write) catch unreachable;
-     }
-
-     fn write_sectors_finish(storage: *Storage, write: *Storage.Write) void {
-         const offset_in_storage = write.zone.offset(write.offset);
-         util.copy_disjoint(
-             .exact,
-             u8,
-             storage.memory[offset_in_storage..][0..write.buffer.len],
-             write.buffer,
-         );
-
-         const sector_min = @divExact(offset_in_storage, constants.sector_size);
-         const sector_max = @divExact(offset_in_storage + write.buffer.len, constants.sector_size);
-         var sector: usize = sector_min;
-         while (sector < sector_max) : (sector += 1) {
-             storage.faults.unset(sector);
-             storage.memory_written.set(sector);
-         }
-
-         if (storage.x_in_100(storage.options.write_fault_probability)) {
-             storage.fault_faulty_sectors(write.zone, write.offset, write.buffer.len);
-         }
-
-         write.callback(write);
-     }
-
-     fn read_latency(storage: *Storage) u64 {
-         return storage.latency(storage.options.read_latency_min, storage.options.read_latency_mean);
-     }
-
-     fn write_latency(storage: *Storage) u64 {
-         return storage.latency(storage.options.write_latency_min, storage.options.write_latency_mean);
-     }
-
-     fn latency(storage: *Storage, min: u64, mean: u64) u64 {
-         return min + @floatToInt(u64, @intToFloat(f64, mean - min) * storage.prng.random().floatExp(f64));
-     }
-
-     /// Return true with probability x/100.
-     fn x_in_100(storage: *Storage, x: u8) bool {
-         assert(x <= 100);
-         return x > storage.prng.random().uintLessThan(u8, 100);
-     }
-
-     fn random_uint_between(storage: *Storage, comptime T: type, min: T, max: T) T {
-         return min + storage.prng.random().uintLessThan(T, max - min);
-     }
-
-     /// The return value is a slice into the provided out array.
-     pub fn generate_faulty_wal_areas(
-         prng: std.rand.Random,
-         size: u64,
-         replica_count: u8,
-         out: *[constants.replicas_max]FaultyAreas,
-     ) []FaultyAreas {
-         comptime assert(constants.message_size_max % constants.sector_size == 0);
-         const message_size_max = constants.message_size_max;
-
-         // We need to ensure there is message_size_max fault-free padding
-         // between faulty areas of memory so that a single message
-         // cannot straddle the corruptable areas of a majority of replicas.
-         comptime assert(constants.replicas_max == 6);
-         switch (replica_count) {
-             1 => {
-                 // If there is only one replica in the cluster, storage faults are not recoverable.
-                 out[0] = .{ .first_offset = size, .period = 1 };
-             },
-             2 => {
-                 //  0123456789
-                 // 0X   X   X
-                 // 1  X   X   X
-                 out[0] = .{ .first_offset = 0 * message_size_max, .period = 4 * message_size_max };
-                 out[1] = .{ .first_offset = 2 * message_size_max, .period = 4 * message_size_max };
-             },
-             3 => {
-                 //  0123456789
-                 // 0X     X
-                 // 1  X     X
-                 // 2    X     X
-                 out[0] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
-                 out[1] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
-                 out[2] = .{ .first_offset = 4 * message_size_max, .period = 6 * message_size_max };
-             },
-             4 => {
-                 //  0123456789
-                 // 0X   X   X
-                 // 1X   X   X
-                 // 2  X   X   X
-                 // 3  X   X   X
-                 out[0] = .{ .first_offset = 0 * message_size_max, .period = 4 * message_size_max };
-                 out[1] = .{ .first_offset = 0 * message_size_max, .period = 4 * message_size_max };
-                 out[2] = .{ .first_offset = 2 * message_size_max, .period = 4 * message_size_max };
-                 out[3] = .{ .first_offset = 2 * message_size_max, .period = 4 * message_size_max };
-             },
-             5 => {
-                 //  0123456789
-                 // 0X     X
-                 // 1X     X
-                 // 2  X     X
-                 // 3  X     X
-                 // 4    X     X
-                 out[0] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
-                 out[1] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
-                 out[2] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
-                 out[3] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
-                 out[4] = .{ .first_offset = 4 * message_size_max, .period = 6 * message_size_max };
-             },
-             6 => {
-                 //  0123456789
-                 // 0X     X
-                 // 1X     X
-                 // 2  X     X
-                 // 3  X     X
-                 // 4    X     X
-                 // 5    X     X
-                 out[0] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
-                 out[1] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
-                 out[2] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
-                 out[3] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
-                 out[4] = .{ .first_offset = 4 * message_size_max, .period = 6 * message_size_max };
-                 out[5] = .{ .first_offset = 4 * message_size_max, .period = 6 * message_size_max };
-             },
-             else => unreachable,
-         }
-
-         {
-             // Allow at most `f` faulty replicas to ensure the view change can succeed.
-             // TODO Allow more than `f` faulty replicas when the fault is to the right of the
-             // highest known replica.op (and to the left of the last checkpointed op).
-             const majority = @divFloor(replica_count, 2) + 1;
-             const quorum_replication = std.math.min(constants.quorum_replication_max, majority);
-             const quorum_view_change = std.math.max(
-                 replica_count - quorum_replication + 1,
-                 majority,
-             );
-             var i: usize = quorum_view_change;
-             while (i < replica_count) : (i += 1) {
-                 out[i].first_offset = size;
-             }
-         }
-
-         prng.shuffle(FaultyAreas, out[0..replica_count]);
-         return out[0..replica_count];
-     }
-
-     const SectorRange = struct {
-         min: usize, // inclusive sector index
-         max: usize, // exclusive sector index
-     };
-
-     /// Given an offset and size of a read/write, returns the range of any faulty sectors touched
-     /// by the read/write.
-     fn faulty_sectors(
-         storage: *const Storage,
-         zone: vsr.Zone,
-         offset_in_zone: u64,
-         size: u64,
-     ) ?SectorRange {
-         const offset_in_storage = zone.offset(offset_in_zone);
-
-         if (zone == .superblock) {
-             if (!storage.options.faulty_superblock) return null;
-
-             const target_area = SuperBlockArea.from_offset(offset_in_zone);
-             // This is the maximum number of faults per-area that can be safely injected on a read
-             // or write to the superblock zone.
-             //
-             // For SuperBlockSector, checkpoint() and view_change() require 3/4 valid sectors (1
-             // fault). Trailers are likewise 3/4 + 1 fault — consider if two faults were injected:
-             // 1. `SuperBlock.checkpoint()` for sequence=6.
-             //    - write copy 0, corrupt manifest (fault_count=1)
-             //    - write copy 1, corrupt manifest (fault_count=2) !
-             // 2. Crash. Recover.
-             // 3. `SuperBlock.open()`. The highest valid quorum is sequence=6, but there is no
-             //    valid manifest.
-             const fault_count_max = @divExact(constants.superblock_copies, 2) - 1;
-             assert(fault_count_max >= 1);
-
-             const fault_count = blk: {
-                 var fault_count: usize = 0;
-                 var copy_: u8 = 0;
-                 while (copy_ < constants.superblock_copies) : (copy_ += 1) {
-                     const copy_area = SuperBlockArea{ .group = target_area.group, .copy = copy_ };
-                     const copy_area_offset_zone = copy_area.to_offset();
-                     const copy_area_offset_storage = zone.offset(copy_area_offset_zone);
-                     const copy_area_sector = @divExact(
-                         copy_area_offset_storage,
-                         constants.sector_size,
-                     );
-                     fault_count += @boolToInt(storage.faults.isSet(copy_area_sector));
-                 }
-                 break :blk fault_count;
-             };
-
-             // fault_count may be slightly greater than fault_count_max due to faults added by
-             // `Storage.reset()` (a simulated crash).
-             assert(fault_count <= fault_count_max + 1);
-             if (fault_count >= fault_count_max) return null;
-
-             // Always fault the first sector of the read/write so that we can easily test
-             // `storage.faults` to probe the current `fault_count`.
-             const sector = @divExact(offset_in_storage, constants.sector_size);
-             return SectorRange{
-                 .min = sector,
-                 .max = sector + 1,
-             };
-         }
-
-         if (zone == .wal_headers or zone == .wal_prepares) {
-             const faulty_wal_areas = storage.options.faulty_wal_areas orelse return null;
-             const message_size_max = constants.message_size_max;
-             const period = faulty_wal_areas.period;
-
-             const offset_faulty =
-                 faulty_wal_areas.first_offset + @divFloor(offset_in_storage, period) * period;
-
-             const offset_start = std.math.max(offset_in_storage, offset_faulty);
-             const offset_end = std.math.min(
-                 offset_in_storage + size,
-                 offset_faulty + message_size_max,
-             );
-
-             // The read/write does not touch any faulty sectors.
-             if (offset_start >= offset_end) return null;
-
-             return SectorRange{
-                 .min = @divExact(offset_start, constants.sector_size),
-                 .max = @divExact(offset_end, constants.sector_size),
-             };
-         }
-
-         // TODO Support corruption of the grid.
-         assert(zone == .grid);
-         return null;
-     }
-
-     fn fault_faulty_sectors(storage: *Storage, zone: vsr.Zone, offset_in_zone: u64, size: u64) void {
-         const faulty = storage.faulty_sectors(zone, offset_in_zone, size) orelse return;
-         const target_sector_min = @divExact(zone.offset(offset_in_zone), constants.sector_size);
-         const target_sector_max = target_sector_min + @divExact(size, constants.sector_size);
-         assert(faulty.min < faulty.max);
-         assert(faulty.min >= target_sector_min);
-         assert(faulty.max <= target_sector_max);
-
-         // Randomly corrupt one of the faulty sectors the operation targeted.
-         // TODO: inject more realistic and varied storage faults as described above.
-         storage.fault_sector(zone, storage.random_uint_between(usize, faulty.min, faulty.max));
-     }
-
-     fn fault_sector(storage: *Storage, zone: vsr.Zone, sector: usize) void {
-         storage.faults.set(sector);
-         if (storage.options.replica_index) |replica_index| {
-             log.debug("{}: corrupting sector at zone={} offset={}", .{
-                 replica_index,
-                 zone,
-                 sector * constants.sector_size - zone.offset(0),
-             });
-         }
-     }
-
-     fn verify_bounds_and_alignment(storage: *const Storage, buffer: []const u8, offset: u64) void {
-         assert(buffer.len > 0);
-         assert(offset + buffer.len <= storage.size);
-
-         // Ensure that the read or write is aligned correctly for Direct I/O:
-         // If this is not the case, the underlying syscall will return EINVAL.
-         assert(@mod(@ptrToInt(buffer.ptr), constants.sector_size) == 0);
-         assert(@mod(buffer.len, constants.sector_size) == 0);
-         assert(@mod(offset, constants.sector_size) == 0);
-     }
-
-     /// Each redundant header written must either:
-     /// - match the corresponding (already written) prepare, or
-     /// - be a command=reserved header (due to Journal.remove_entries_from), or
-     /// - match the old redundant header (i.e. no change).
-     /// This last case applies when an in-memory header is changed after the prepare is written
-     /// but before the redundant header is written, so the journal defers the redundant header
-     /// update until after the new prepare has been written.
-     fn verify_write_wal_header(storage: *const Storage, header: vsr.Header) void {
-         // The checksum is zero when writing the header of a faulty prepare.
-         if (header.checksum == 0) return;
-
-         const header_slot = header.op % constants.journal_slot_count;
-         const header_old = storage.wal_headers()[header_slot];
-
-         const prepare_header = storage.wal_prepares()[header_slot].header;
-         const prepare_offset = vsr.Zone.wal_prepares.offset(header_slot * constants.message_size_max);
-         const prepare_sector = @divExact(prepare_offset, constants.sector_size);
-
-         assert(storage.memory_written.isSet(prepare_sector));
-         if (header.command == .prepare) {
-             assert(header.checksum == header_old.checksum or
-                 header.checksum == prepare_header.checksum);
-         } else {
-             assert(header.command == .reserved);
-         }
-     }
-
-     /// When a SuperBlock sector is written, verify:
-     ///
-     /// - There are no other pending writes or reads to the superblock zone.
-     /// - All trailers are written.
-     /// - All trailers' checksums validate.
-     /// - All blocks referenced by the Manifest trailer exist.
-     ///
-     fn verify_write_superblock(storage: *const Storage, buffer: []const u8, offset_in_zone: u64) void {
-         const Layout = superblock.Layout;
-         assert(offset_in_zone < vsr.Zone.superblock.size().?);
-
-         // Ignore trailer writes; only check the superblock sector writes.
-         if (buffer.len != @sizeOf(superblock.SuperBlockSector)) return;
-         var copy_: u8 = 0;
-         while (copy_ < constants.superblock_copies) : (copy_ += 1) {
-             if (Layout.offset_sector(copy_) == offset_in_zone) break;
-         } else return;
-
-         for (storage.reads.items[0..storage.reads.len]) |read| assert(read.zone != .superblock);
-         for (storage.writes.items[0..storage.writes.len]) |write| assert(write.zone != .superblock);
-
-         const sector = mem.bytesAsSlice(superblock.SuperBlockSector, buffer)[0];
-         assert(sector.valid_checksum());
-         assert(sector.vsr_state.internally_consistent());
-         assert(sector.copy == copy_);
-
-         const manifest_offset = vsr.Zone.superblock.offset(Layout.offset_manifest(copy_));
-         const manifest_buffer = storage.memory[manifest_offset..][0..sector.manifest_size];
-         assert(vsr.checksum(manifest_buffer) == sector.manifest_checksum);
-
-         const free_set_offset = vsr.Zone.superblock.offset(Layout.offset_free_set(copy_));
-         const free_set_buffer = storage.memory[free_set_offset..][0..sector.free_set_size];
-         assert(vsr.checksum(free_set_buffer) == sector.free_set_checksum);
-
-         const client_table_offset = vsr.Zone.superblock.offset(Layout.offset_client_table(copy_));
-         const client_table_buffer =
-             storage.memory[client_table_offset..][0..sector.client_table_size];
-         assert(vsr.checksum(client_table_buffer) == sector.client_table_checksum);
-
-         const Manifest = superblock.SuperBlockManifest;
-         var manifest = Manifest.init(
-             storage.allocator,
-             @divExact(
-                 superblock.superblock_trailer_manifest_size_max,
-                 Manifest.BlockReferenceSize,
-             ),
-             @import("../lsm/tree.zig").table_count_max,
-         ) catch unreachable;
-         defer manifest.deinit(storage.allocator);
-
-         manifest.decode(manifest_buffer);
-
-         for (manifest.addresses[0..manifest.count]) |block_address, i| {
-             const block_offset = vsr.Zone.grid.offset((block_address - 1) * constants.block_size);
-             const block_header = mem.bytesAsValue(
-                 vsr.Header,
-                 storage.memory[block_offset..][0..@sizeOf(vsr.Header)],
-             );
-             assert(block_header.op == block_address);
-             assert(block_header.checksum == manifest.checksums[i]);
-             assert(block_header.operation == BlockType.manifest.operation());
-         }
-     }
-
-     pub fn superblock_sector(
-         storage: *const Storage,
-         copy_: u8,
-     ) *const superblock.SuperBlockSector {
-         const offset = vsr.Zone.superblock.offset(superblock.Layout.offset_sector(copy_));
-         const bytes = storage.memory[offset..][0..@sizeOf(superblock.SuperBlockSector)];
-         return mem.bytesAsValue(superblock.SuperBlockSector, bytes);
-     }
-
-     pub fn wal_headers(storage: *const Storage) []const vsr.Header {
-         const offset = vsr.Zone.wal_headers.offset(0);
-         const size = vsr.Zone.wal_headers.size().?;
-         return mem.bytesAsSlice(vsr.Header, storage.memory[offset..][0..size]);
-     }
-
-     const MessageRaw = extern struct {
-         header: vsr.Header,
-         body: [constants.message_size_max - @sizeOf(vsr.Header)]u8,
-
-         comptime {
-             assert(@sizeOf(MessageRaw) * 8 == @bitSizeOf(MessageRaw));
-         }
-     };
-
-     pub fn wal_prepares(storage: *const Storage) []const MessageRaw {
-         const offset = vsr.Zone.wal_prepares.offset(0);
-         const size = vsr.Zone.wal_prepares.size().?;
-         return mem.bytesAsSlice(MessageRaw, storage.memory[offset..][0..size]);
-     }
-
-     pub fn grid_block(
-         storage: *const Storage,
-         address: u64,
-     ) *align(constants.sector_size) [constants.block_size]u8 {
-         assert(address > 0);
-
-         const block_offset = vsr.Zone.grid.offset((address - 1) * constants.block_size);
-         const block_header = mem.bytesToValue(
-             vsr.Header,
-             storage.memory[block_offset..][0..@sizeOf(vsr.Header)],
-         );
-         assert(storage.memory_written.isSet(@divExact(block_offset, constants.sector_size)));
-         assert(block_header.valid_checksum());
-         assert(block_header.size <= constants.block_size);
-
-         return storage.memory[block_offset..][0..constants.block_size];
-     }
- };
-
- const SuperBlockArea = struct {
-     const Group = enum { sector, manifest, free_set, client_table };
-
-     group: Group,
-     copy: u8,
-
-     fn to_offset(self: SuperBlockArea) u64 {
-         return switch (self.group) {
-             .sector => superblock.Layout.offset_sector(self.copy),
-             .manifest => superblock.Layout.offset_manifest(self.copy),
-             .free_set => superblock.Layout.offset_free_set(self.copy),
-             .client_table => superblock.Layout.offset_client_table(self.copy),
-         };
-     }
-
-     fn from_offset(offset: u64) SuperBlockArea {
-         var copy: u8 = 0;
-         while (copy < constants.superblock_copies) : (copy += 1) {
-             for (std.enums.values(Group)) |group| {
-                 const area = SuperBlockArea{ .group = group, .copy = copy };
-                 if (area.to_offset() == offset) return area;
-             }
-         } else unreachable;
-     }
- };
-
- test "SuperBlockArea" {
-     var prng = std.rand.DefaultPrng.init(@intCast(u64, std.time.timestamp()));
-     for (std.enums.values(SuperBlockArea.Group)) |group| {
-         const area_expect = SuperBlockArea{
-             .group = group,
-             .copy = prng.random().uintLessThan(u8, constants.superblock_copies),
-         };
-         const area_actual = SuperBlockArea.from_offset(area_expect.to_offset());
-
-         try std.testing.expectEqual(area_expect, area_actual);
-     }
- }
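For context: the simulated storage removed here was not dropped but reworked — entry 49 above adds src/testing/storage.zig (+754 lines) in its place. The following is a minimal sketch of how a test harness might drive this fault-injecting storage, based only on the signatures visible in the diff above; the storage size, option values, and import paths are illustrative assumptions, not code taken from the package:

    const std = @import("std");
    const constants = @import("constants.zig"); // assumed import path
    const Storage = @import("testing/storage.zig").Storage; // 0.11.9 location

    pub fn example(allocator: std.mem.Allocator) !void {
        const size: u64 = 8 * 1024 * 1024; // illustrative; must be a multiple of sector_size

        // Carve out per-replica faulty WAL areas for a hypothetical 3-replica cluster,
        // so that no single message can straddle a majority's corruptible areas.
        var prng = std.rand.DefaultPrng.init(92);
        var areas: [constants.replicas_max]Storage.FaultyAreas = undefined;
        const faulty = Storage.generate_faulty_wal_areas(prng.random(), size, 3, &areas);

        var storage = try Storage.init(allocator, size, .{
            .seed = 42,
            .replica_index = 0,
            .read_latency_min = 1,
            .read_latency_mean = 5,
            .write_latency_min = 1,
            .write_latency_mean = 5,
            .read_fault_probability = 10, // chance out of 100
            .write_fault_probability = 10,
            .faulty_wal_areas = faulty[0],
        });
        defer storage.deinit(allocator);

        // Reads/writes submitted via read_sectors()/write_sectors() complete (and may
        // be corrupted, within the configured faulty areas) as ticks advance.
        var tick: usize = 0;
        while (tick < 100) : (tick += 1) storage.tick();
    }

Because latency, fault placement, and corruption all derive from the PRNG seeds, a failing cluster test replays deterministically from the same seed — which is what the simulator changes elsewhere in this diff (src/simulator.zig, src/testing/cluster.zig) rely on.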