tigerbeetle-node 0.11.12 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. package/README.md +212 -196
  2. package/dist/bin/aarch64-linux-gnu/client.node +0 -0
  3. package/dist/bin/aarch64-linux-musl/client.node +0 -0
  4. package/dist/bin/aarch64-macos/client.node +0 -0
  5. package/dist/bin/x86_64-linux-gnu/client.node +0 -0
  6. package/dist/bin/x86_64-linux-musl/client.node +0 -0
  7. package/dist/bin/x86_64-macos/client.node +0 -0
  8. package/dist/index.js +33 -1
  9. package/dist/index.js.map +1 -1
  10. package/package-lock.json +66 -0
  11. package/package.json +8 -17
  12. package/src/index.ts +56 -1
  13. package/src/node.zig +10 -9
  14. package/dist/.client.node.sha256 +0 -1
  15. package/scripts/build_lib.sh +0 -61
  16. package/scripts/download_node_headers.sh +0 -32
  17. package/src/tigerbeetle/scripts/benchmark.bat +0 -48
  18. package/src/tigerbeetle/scripts/benchmark.sh +0 -66
  19. package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
  20. package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
  21. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
  22. package/src/tigerbeetle/scripts/install.bat +0 -7
  23. package/src/tigerbeetle/scripts/install.sh +0 -21
  24. package/src/tigerbeetle/scripts/install_zig.bat +0 -113
  25. package/src/tigerbeetle/scripts/install_zig.sh +0 -90
  26. package/src/tigerbeetle/scripts/lint.zig +0 -199
  27. package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
  28. package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -48
  29. package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
  30. package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
  31. package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
  32. package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
  33. package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
  34. package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
  35. package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
  36. package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
  37. package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
  38. package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
  39. package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
  40. package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
  41. package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
  42. package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
  43. package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
  44. package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
  45. package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
  46. package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
  47. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
  48. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
  49. package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
  50. package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
  51. package/src/tigerbeetle/src/benchmark.zig +0 -314
  52. package/src/tigerbeetle/src/config.zig +0 -234
  53. package/src/tigerbeetle/src/constants.zig +0 -436
  54. package/src/tigerbeetle/src/ewah.zig +0 -286
  55. package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
  56. package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
  57. package/src/tigerbeetle/src/fifo.zig +0 -120
  58. package/src/tigerbeetle/src/io/benchmark.zig +0 -213
  59. package/src/tigerbeetle/src/io/darwin.zig +0 -814
  60. package/src/tigerbeetle/src/io/linux.zig +0 -1062
  61. package/src/tigerbeetle/src/io/test.zig +0 -643
  62. package/src/tigerbeetle/src/io/windows.zig +0 -1183
  63. package/src/tigerbeetle/src/io.zig +0 -34
  64. package/src/tigerbeetle/src/iops.zig +0 -107
  65. package/src/tigerbeetle/src/lsm/README.md +0 -308
  66. package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
  67. package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
  68. package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
  69. package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
  70. package/src/tigerbeetle/src/lsm/direction.zig +0 -11
  71. package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
  72. package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
  73. package/src/tigerbeetle/src/lsm/forest.zig +0 -204
  74. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -401
  75. package/src/tigerbeetle/src/lsm/grid.zig +0 -573
  76. package/src/tigerbeetle/src/lsm/groove.zig +0 -972
  77. package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
  78. package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
  79. package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
  80. package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -877
  81. package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
  82. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
  83. package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
  84. package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
  85. package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -378
  86. package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1328
  87. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
  88. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
  89. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
  90. package/src/tigerbeetle/src/lsm/table.zig +0 -1031
  91. package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -203
  92. package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
  93. package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -220
  94. package/src/tigerbeetle/src/lsm/test.zig +0 -438
  95. package/src/tigerbeetle/src/lsm/tree.zig +0 -1193
  96. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -474
  97. package/src/tigerbeetle/src/message_bus.zig +0 -1012
  98. package/src/tigerbeetle/src/message_pool.zig +0 -156
  99. package/src/tigerbeetle/src/ring_buffer.zig +0 -399
  100. package/src/tigerbeetle/src/simulator.zig +0 -569
  101. package/src/tigerbeetle/src/state_machine/auditor.zig +0 -577
  102. package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
  103. package/src/tigerbeetle/src/state_machine.zig +0 -1881
  104. package/src/tigerbeetle/src/static_allocator.zig +0 -65
  105. package/src/tigerbeetle/src/stdx.zig +0 -162
  106. package/src/tigerbeetle/src/storage.zig +0 -393
  107. package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
  108. package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
  109. package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
  110. package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
  111. package/src/tigerbeetle/src/testing/cluster.zig +0 -443
  112. package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
  113. package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
  114. package/src/tigerbeetle/src/testing/id.zig +0 -99
  115. package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -364
  116. package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
  117. package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
  118. package/src/tigerbeetle/src/testing/state_machine.zig +0 -249
  119. package/src/tigerbeetle/src/testing/storage.zig +0 -757
  120. package/src/tigerbeetle/src/testing/table.zig +0 -247
  121. package/src/tigerbeetle/src/testing/time.zig +0 -84
  122. package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
  123. package/src/tigerbeetle/src/time.zig +0 -112
  124. package/src/tigerbeetle/src/tracer.zig +0 -529
  125. package/src/tigerbeetle/src/unit_tests.zig +0 -42
  126. package/src/tigerbeetle/src/vopr.zig +0 -495
  127. package/src/tigerbeetle/src/vsr/README.md +0 -209
  128. package/src/tigerbeetle/src/vsr/client.zig +0 -544
  129. package/src/tigerbeetle/src/vsr/clock.zig +0 -853
  130. package/src/tigerbeetle/src/vsr/journal.zig +0 -2413
  131. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
  132. package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
  133. package/src/tigerbeetle/src/vsr/replica.zig +0 -6381
  134. package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
  135. package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
  136. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
  137. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
  138. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
  139. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
  140. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
  141. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
  142. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
  143. package/src/tigerbeetle/src/vsr.zig +0 -1352
@@ -1,757 +0,0 @@
1
- //! In-memory storage, with simulated faults and latency.
2
- //!
3
- //!
4
- //! Fault Injection
5
- //!
6
- //! Storage injects faults that the cluster can (i.e. should be able to) recover from.
7
- //! Each zone can tolerate a different pattern of faults.
8
- //!
9
- //! - superblock:
10
- //! - One read/write fault is permitted per area (section, manifest, …).
11
- //! - An additional fault is permitted at the target of a pending write during a crash.
12
- //!
13
- //! - wal_headers, wal_prepares:
14
- //! - Read/write faults are distributed between replicas according to ClusterFaultAtlas, to ensure
15
- //! that at least one replica will have a valid copy to help others repair.
16
- //! (See: generate_faulty_wal_areas()).
17
- //! - When a replica crashes, it may fault the WAL outside of ClusterFaultAtlas.
18
- //! - When replica_count=1, its WAL can only be corrupted by a crash, never a read/write.
19
- //! (When replica_count=1, there are no other replicas to assist with repair).
20
- //!
21
- //! - grid: (TODO: Enable grid faults when grid repair is implemented).
22
- //!
23
- const std = @import("std");
24
- const assert = std.debug.assert;
25
- const math = std.math;
26
- const mem = std.mem;
27
-
28
- const FIFO = @import("../fifo.zig").FIFO;
29
- const constants = @import("../constants.zig");
30
- const vsr = @import("../vsr.zig");
31
- const superblock = @import("../vsr/superblock.zig");
32
- const BlockType = @import("../lsm/grid.zig").BlockType;
33
- const stdx = @import("../stdx.zig");
34
- const PriorityQueue = @import("./priority_queue.zig").PriorityQueue;
35
- const fuzz = @import("./fuzz.zig");
36
- const hash_log = @import("./hash_log.zig");
37
-
38
- const log = std.log.scoped(.storage);
39
-
40
- // TODOs:
41
- // less than a majority of replicas may have corruption
42
- // have an option to enable/disable the following corruption types:
43
- // bitrot
44
- // misdirected read/write
45
- // corrupt sector
46
- // latent sector error
47
- // - emulate by zeroing sector, as this is how we handle this in the real Storage implementation
48
- // - likely that surrounding sectors also corrupt
49
- // - likely that stuff written at the same time is also corrupt even if written to a far away sector
50
- pub const Storage = struct {
51
/// Options for fault injection during fuzz testing.
///
/// Latencies are expressed in ticks (they are added to `Storage.ticks` when an operation is
/// queued). Fault probabilities are percentages and must be <= 100 (asserted by `x_in_100()`).
pub const Options = struct {
    /// Seed for the storage PRNG.
    seed: u64 = 0,

    /// Required when `fault_atlas` is set (asserted by `Storage.init()`).
    replica_index: ?u8 = null,

    /// Minimum number of ticks it may take to read data.
    read_latency_min: u64,
    /// Average number of ticks it may take to read data. Must be >= read_latency_min.
    read_latency_mean: u64,
    /// Minimum number of ticks it may take to write data.
    write_latency_min: u64,
    /// Average number of ticks it may take to write data. Must be >= write_latency_min.
    write_latency_mean: u64,

    /// Chance out of 100 that a read will corrupt a sector, if the target memory is within
    /// a faulty area of this replica.
    read_fault_probability: u8 = 0,
    /// Chance out of 100 that a write will corrupt a sector, if the target memory is within
    /// a faulty area of this replica.
    write_fault_probability: u8 = 0,
    /// Chance out of 100 that a crash will corrupt a sector of a pending write's target,
    /// if the target memory is within a faulty area of this replica.
    crash_fault_probability: u8 = 0,

    /// Enable/disable automatic read/write faults.
    /// Does not impact crash faults or manual faults.
    /// When null, `fault_faulty_sectors()` returns without marking any sector faulty.
    fault_atlas: ?*const ClusterFaultAtlas = null,
};
82
-
83
/// See usage in Journal.write_sectors() for details.
/// This storage is declared `.always_asynchronous`: `read_sectors()`/`write_sectors()` only
/// queue the operation, and the callback is invoked later from `tick()`.
/// TODO: allow testing in both modes.
pub const synchronicity: enum {
    always_synchronous,
    always_asynchronous,
} = .always_asynchronous;
89
-
90
/// A pending read. The caller of `read_sectors()` supplies the storage for this struct;
/// `Storage` fills it in and keeps a pointer to it in its `reads` queue until completion.
pub const Read = struct {
    /// Invoked with this `Read` once the read has finished.
    callback: fn (read: *Storage.Read) void,
    /// Destination buffer that receives the bytes read.
    buffer: []u8,
    /// Zone that `offset` is relative to.
    zone: vsr.Zone,
    /// Relative offset within the zone.
    offset: u64,
    /// Tick at which this read is considered "completed" and the callback should be called.
    done_at_tick: u64,

    /// Comparator for the `reads` priority queue: orders reads by their completion tick.
    fn less_than(context: void, a: *Read, b: *Read) math.Order {
        _ = context;

        const a_done = a.done_at_tick;
        const b_done = b.done_at_tick;
        return math.order(a_done, b_done);
    }
};
105
-
106
/// A pending write. The caller of `write_sectors()` supplies the storage for this struct;
/// `Storage` fills it in and keeps a pointer to it in its `writes` queue until completion.
pub const Write = struct {
    /// Invoked with this `Write` once the write has finished.
    callback: fn (write: *Storage.Write) void,
    /// Source bytes; copied into the simulated disk when the write completes.
    buffer: []const u8,
    /// Zone that `offset` is relative to.
    zone: vsr.Zone,
    /// Relative offset within the zone.
    offset: u64,
    /// Tick at which this write is considered "completed" and the callback should be called.
    done_at_tick: u64,

    /// Comparator for the `writes` priority queue: orders writes by their completion tick.
    fn less_than(context: void, a: *Write, b: *Write) math.Order {
        _ = context;

        const a_done = a.done_at_tick;
        const b_done = b.done_at_tick;
        return math.order(a_done, b_done);
    }
};
121
-
122
/// Node for a callback deferred with `on_next_tick()`. The caller supplies the storage for
/// the node; `Storage` links it into `next_tick_queue`.
pub const NextTick = struct {
    /// Link to another queued node (presumably maintained by `FIFO` — confirm in fifo.zig).
    next: ?*NextTick = null,
    /// Invoked from `Storage.tick()` with a pointer to this node.
    callback: fn (next_tick: *NextTick) void,
};
126
-
127
/// Allocator passed to `init()`.
allocator: mem.Allocator,

/// Size of the simulated storage in bytes, as passed to `init()` (the length of `memory`).
size: u64,
options: Options,
/// Source of randomness for latencies, fault rolls, and the contents of corrupted reads.
prng: std.rand.DefaultPrng,

/// The simulated disk contents.
memory: []align(constants.sector_size) u8,
/// Set bits correspond to sectors that have ever been written to.
memory_written: std.DynamicBitSetUnmanaged,
/// Set bits correspond to faulty sectors. The underlying sectors of `memory` are left clean.
faults: std.DynamicBitSetUnmanaged,

/// Whether to enable faults (when false, this supersedes `faulty_wal_areas`).
/// This is used to disable faults during the replica's first startup.
/// While false, reads return the clean bytes even for sectors marked in `faults`.
faulty: bool = true,

/// In-flight reads, ordered by `Read.done_at_tick`.
reads: PriorityQueue(*Storage.Read, void, Storage.Read.less_than),
/// In-flight writes, ordered by `Write.done_at_tick`.
writes: PriorityQueue(*Storage.Write, void, Storage.Write.less_than),

/// Simulated clock; incremented by every call to `tick()`.
ticks: u64 = 0,
/// Callbacks registered with `on_next_tick()`, drained by `tick()`.
next_tick_queue: FIFO(NextTick) = .{},
148
-
149
/// Create a simulated storage of `size` bytes (which must be a multiple of the sector size).
///
/// The memory starts zeroed, with no sector marked as written or faulty. The read/write
/// queues are pre-sized for `constants.iops_read_max` / `constants.iops_write_max` entries.
/// Returns an error if any allocation fails; everything allocated so far is released.
pub fn init(allocator: mem.Allocator, size: u64, options: Storage.Options) !Storage {
    assert(options.write_latency_min <= options.write_latency_mean);
    assert(options.read_latency_min <= options.read_latency_mean);
    // A fault atlas is consulted per replica, so it requires a replica index.
    assert(options.replica_index != null or options.fault_atlas == null);

    const ReadQueue = PriorityQueue(*Storage.Read, void, Storage.Read.less_than);
    const WriteQueue = PriorityQueue(*Storage.Write, void, Storage.Write.less_than);

    const sector_count = @divExact(size, constants.sector_size);

    const memory = try allocator.allocAdvanced(u8, constants.sector_size, size, .exact);
    errdefer allocator.free(memory);
    // TODO: random data
    mem.set(u8, memory, 0);

    var memory_written = try std.DynamicBitSetUnmanaged.initEmpty(allocator, sector_count);
    errdefer memory_written.deinit(allocator);

    var faults = try std.DynamicBitSetUnmanaged.initEmpty(allocator, sector_count);
    errdefer faults.deinit(allocator);

    var reads = ReadQueue.init(allocator, {});
    errdefer reads.deinit();
    try reads.ensureTotalCapacity(constants.iops_read_max);

    var writes = WriteQueue.init(allocator, {});
    errdefer writes.deinit();
    try writes.ensureTotalCapacity(constants.iops_write_max);

    return Storage{
        .allocator = allocator,
        .size = size,
        .options = options,
        .prng = std.rand.DefaultPrng.init(options.seed),
        .memory = memory,
        .memory_written = memory_written,
        .faults = faults,
        .reads = reads,
        .writes = writes,
    };
}
187
-
188
/// Release everything acquired by `init()`.
/// `allocator` is presumably expected to be the allocator that was passed to `init()`.
pub fn deinit(storage: *Storage, allocator: mem.Allocator) void {
    // Tear down in the reverse order of construction.
    storage.writes.deinit();
    storage.reads.deinit();
    storage.faults.deinit(allocator);
    storage.memory_written.deinit(allocator);
    allocator.free(storage.memory);
}
195
-
196
/// Simulate a crash: drop every in-progress read, write and next-tick callback without
/// invoking it. Each dropped write has a `crash_fault_probability` percent chance of
/// leaving one randomly chosen sector of its target range marked faulty.
pub fn reset(storage: *Storage) void {
    while (storage.writes.peek() != null) {
        const write = storage.writes.remove();
        if (storage.x_in_100(storage.options.crash_fault_probability)) {
            // Randomly corrupt one of the faulty sectors the operation targeted.
            // TODO: inject more realistic and varied storage faults as described above.
            const target = SectorRange.from_zone(write.zone, write.offset, write.buffer.len);
            const victim = target.random(storage.prng.random());
            storage.fault_sector(write.zone, victim);
        }
    }
    assert(storage.writes.len == 0);

    storage.reads.len = 0;
    storage.next_tick_queue = .{};
}
213
-
214
/// Number of bytes occupied by sectors that have ever been written, i.e. the size the
/// storage would have if the (simulated) `fallocate()` created a sparse file.
pub fn size_used(storage: *const Storage) usize {
    const sectors_written = storage.memory_written.count();
    return sectors_written * constants.sector_size;
}
219
-
220
/// Overwrite the state of `storage` with the state of `origin`:
///
/// - ticks
/// - memory
/// - occupied memory
/// - faulty sectors
/// - reads in-progress
/// - writes in-progress
///
/// Both instances must have an identical size. The queued `Read`/`Write` pointers are
/// copied as-is, so both storages end up referring to the same operation structs.
pub fn copy(storage: *Storage, origin: *const Storage) void {
    assert(origin.size == storage.size);

    storage.ticks = origin.ticks;
    stdx.copy_disjoint(.exact, u8, storage.memory, origin.memory);

    // Toggling a set with itself clears it; toggling the cleared set with the origin's
    // set then leaves an exact copy of the origin's bits.
    storage.memory_written.toggleSet(storage.memory_written);
    storage.memory_written.toggleSet(origin.memory_written);
    storage.faults.toggleSet(storage.faults);
    storage.faults.toggleSet(origin.faults);

    const pending_reads = origin.reads.items[0..origin.reads.len];
    storage.reads.len = 0;
    for (pending_reads) |pending| {
        storage.reads.add(pending) catch unreachable;
    }

    const pending_writes = origin.writes.items[0..origin.writes.len];
    storage.writes.len = 0;
    for (pending_writes) |pending| {
        storage.writes.add(pending) catch unreachable;
    }
}
250
-
251
/// Advance the simulated clock by one tick, then complete (in this order) every read that
/// is due, every write that is due, and every callback queued via `on_next_tick()`.
pub fn tick(storage: *Storage) void {
    storage.ticks += 1;

    while (storage.reads.peek()) |due| {
        if (due.done_at_tick <= storage.ticks) {
            _ = storage.reads.remove();
            storage.read_sectors_finish(due);
        } else break;
    }

    while (storage.writes.peek()) |due| {
        if (due.done_at_tick <= storage.ticks) {
            _ = storage.writes.remove();
            storage.write_sectors_finish(due);
        } else break;
    }

    while (storage.next_tick_queue.pop()) |deferred| {
        deferred.callback(deferred);
    }
}
270
-
271
/// Queue `callback` to be invoked from `tick()`. `next_tick` is caller-provided storage
/// for the queue node; it is overwritten here.
pub fn on_next_tick(
    storage: *Storage,
    callback: fn (next_tick: *Storage.NextTick) void,
    next_tick: *Storage.NextTick,
) void {
    next_tick.* = Storage.NextTick{
        .next = null,
        .callback = callback,
    };
    storage.next_tick_queue.push(next_tick);
}
279
-
280
/// Schedule a read of `buffer.len` bytes at `offset_in_zone` within `zone`.
///
/// * Asserts that `buffer` is non-empty and sector-aligned (address and length).
/// * Asserts that every targeted sector has previously been written to.
///
/// `callback` is invoked from a later `tick()`, after a randomized latency.
pub fn read_sectors(
    storage: *Storage,
    callback: fn (read: *Storage.Read) void,
    read: *Storage.Read,
    buffer: []u8,
    zone: vsr.Zone,
    offset_in_zone: u64,
) void {
    hash_log.emit_autohash(.{ buffer, zone, offset_in_zone }, .DeepRecursive);

    verify_alignment(buffer);

    var targeted = SectorRange.from_zone(zone, offset_in_zone, buffer.len);
    while (targeted.next()) |sector| {
        assert(storage.memory_written.isSet(sector));
    }

    const done_at_tick = storage.ticks + storage.read_latency();
    read.* = Storage.Read{
        .callback = callback,
        .buffer = buffer,
        .zone = zone,
        .offset = offset_in_zone,
        .done_at_tick = done_at_tick,
    };

    // We ensure the capacity is sufficient for constants.iops_read_max in init()
    storage.reads.add(read) catch unreachable;
}
308
-
309
/// Complete a queued read: copy the stored bytes into the read's buffer, possibly mark a
/// new sector faulty, overwrite every faulty sector of the result with random bytes (only
/// while `storage.faulty` is set), and finally invoke the read's callback.
fn read_sectors_finish(storage: *Storage, read: *Storage.Read) void {
    hash_log.emit_autohash(.{ read.buffer, read.zone, read.offset }, .DeepRecursive);

    const start = read.zone.offset(read.offset);
    const source = storage.memory[start..][0..read.buffer.len];
    stdx.copy_disjoint(.exact, u8, read.buffer, source);

    const inject_fault = storage.x_in_100(storage.options.read_fault_probability);
    if (inject_fault) {
        storage.fault_faulty_sectors(read.zone, read.offset, read.buffer.len);
    }

    if (storage.faulty) {
        // Corrupt faulty sectors.
        var targeted = SectorRange.from_zone(read.zone, read.offset, read.buffer.len);
        const first_sector = targeted.min;
        while (targeted.next()) |sector| {
            if (!storage.faults.isSet(sector)) continue;

            const begin = (sector - first_sector) * constants.sector_size;
            const corrupted = read.buffer[begin..][0..constants.sector_size];
            storage.prng.random().bytes(corrupted);
        }
    }

    read.callback(read);
}
339
-
340
/// Schedule a write of `buffer` at `offset_in_zone` within `zone`.
///
/// * Asserts that `buffer` is non-empty and sector-aligned (address and length).
/// * Asserts that no write already in progress in the same zone overlaps this one.
///
/// `callback` is invoked from a later `tick()`, after a randomized latency.
pub fn write_sectors(
    storage: *Storage,
    callback: fn (write: *Storage.Write) void,
    write: *Storage.Write,
    buffer: []const u8,
    zone: vsr.Zone,
    offset_in_zone: u64,
) void {
    hash_log.emit_autohash(.{ buffer, zone, offset_in_zone }, .DeepRecursive);

    verify_alignment(buffer);

    // Verify that there are no concurrent overlapping writes.
    var in_flight = storage.writes.iterator();
    while (in_flight.next()) |other| {
        if (other.zone == zone) {
            const ends_before_other = offset_in_zone + buffer.len <= other.offset;
            assert(ends_before_other or
                other.offset + other.buffer.len <= offset_in_zone);
        }
    }

    const done_at_tick = storage.ticks + storage.write_latency();
    write.* = Storage.Write{
        .callback = callback,
        .buffer = buffer,
        .zone = zone,
        .offset = offset_in_zone,
        .done_at_tick = done_at_tick,
    };

    // We ensure the capacity is sufficient for constants.iops_write_max in init()
    storage.writes.add(write) catch unreachable;
}
371
-
372
/// Complete a queued write: copy the buffer into the simulated disk, mark every targeted
/// sector as written and no longer faulty, possibly mark a new sector faulty, and finally
/// invoke the write's callback.
fn write_sectors_finish(storage: *Storage, write: *Storage.Write) void {
    hash_log.emit_autohash(.{ write.buffer, write.zone, write.offset }, .DeepRecursive);

    const start = write.zone.offset(write.offset);
    const destination = storage.memory[start..][0..write.buffer.len];
    stdx.copy_disjoint(.exact, u8, destination, write.buffer);

    var targeted = SectorRange.from_zone(write.zone, write.offset, write.buffer.len);
    while (targeted.next()) |sector| {
        storage.memory_written.set(sector);
        // A successful overwrite repairs any earlier fault in this sector.
        storage.faults.unset(sector);
    }

    const inject_fault = storage.x_in_100(storage.options.write_fault_probability);
    if (inject_fault) {
        storage.fault_faulty_sectors(write.zone, write.offset, write.buffer.len);
    }

    write.callback(write);
}
395
-
396
/// Draw a random read latency (in ticks) from the configured read minimum and mean.
fn read_latency(storage: *Storage) u64 {
    const min = storage.options.read_latency_min;
    const mean = storage.options.read_latency_mean;
    return storage.latency(min, mean);
}
399
-
400
/// Draw a random write latency (in ticks) from the configured write minimum and mean.
fn write_latency(storage: *Storage) u64 {
    const min = storage.options.write_latency_min;
    const mean = storage.options.write_latency_mean;
    return storage.latency(min, mean);
}
403
-
404
/// Returns `min` plus a random amount drawn by `fuzz.random_int_exponential()` with the
/// parameter `mean - min`. Requires `mean >= min` (asserted for the options in `init()`).
fn latency(storage: *Storage, min: u64, mean: u64) u64 {
    const excess_mean = mean - min;
    const excess = fuzz.random_int_exponential(storage.prng.random(), u64, excess_mean);
    return min + excess;
}
407
-
408
/// Return true with probability x/100 (never for x=0, always for x=100).
/// Consumes one value from the storage PRNG.
fn x_in_100(storage: *Storage, x: u8) bool {
    assert(x <= 100);
    // `roll` is uniformly distributed over 0..99.
    const roll = storage.prng.random().uintLessThan(u8, 100);
    return roll < x;
}
413
-
414
/// Mark one sector faulty, chosen at random from the sectors that the fault atlas reports
/// as faulty for this replica within the given range of `zone`.
///
/// Does nothing when no fault atlas is configured, when the zone is the grid, or when the
/// atlas reports no faulty sectors for the range.
fn fault_faulty_sectors(storage: *Storage, zone: vsr.Zone, offset_in_zone: u64, size: u64) void {
    if (storage.options.fault_atlas) |atlas| {
        const replica_index = storage.options.replica_index.?;
        const candidates = switch (zone) {
            .superblock => atlas.faulty_superblock(replica_index, offset_in_zone, size),
            .wal_headers => atlas.faulty_wal_headers(replica_index, offset_in_zone, size),
            .wal_prepares => atlas.faulty_wal_prepares(replica_index, offset_in_zone, size),
            .grid => null,
        };

        if (candidates) |faulty_sectors| {
            // Randomly corrupt one of the faulty sectors the operation targeted.
            // TODO: inject more realistic and varied storage faults as described above.
            const victim = faulty_sectors.random(storage.prng.random());
            storage.fault_sector(zone, victim);
        }
    }
}
428
-
429
/// Mark `sector` (an absolute sector index) as faulty. When a replica index is configured,
/// also log the fault with its byte offset relative to the start of `zone`.
fn fault_sector(storage: *Storage, zone: vsr.Zone, sector: usize) void {
    storage.faults.set(sector);

    const replica_index = storage.options.replica_index orelse return;
    const offset_in_zone = sector * constants.sector_size - zone.offset(0);
    log.debug("{}: corrupting sector at zone={} offset={}", .{
        replica_index,
        zone,
        offset_in_zone,
    });
}
439
-
440
/// Returns a view of superblock header copy `copy_`, read directly from the simulated disk
/// (bypassing the read queue, latency and fault injection).
pub fn superblock_header(
    storage: *const Storage,
    copy_: u8,
) *const superblock.SuperBlockHeader {
    const offset_in_zone = superblock.areas.header.offset(copy_);
    const start = vsr.Zone.superblock.offset(offset_in_zone);
    const header_bytes = storage.memory[start..][0..superblock.areas.header.size_max];
    return mem.bytesAsValue(superblock.SuperBlockHeader, header_bytes);
}
448
-
449
/// Returns the whole WAL headers zone, viewed as a slice of headers read directly from the
/// simulated disk (bypassing the read queue, latency and fault injection).
pub fn wal_headers(storage: *const Storage) []const vsr.Header {
    const zone_start = vsr.Zone.wal_headers.offset(0);
    const zone_size = vsr.Zone.wal_headers.size().?;
    const zone_bytes = storage.memory[zone_start..][0..zone_size];
    return mem.bytesAsSlice(vsr.Header, zone_bytes);
}
454
-
455
/// Fixed-size layout of one WAL prepare slot: a header followed by the rest of the
/// message, padded to `constants.message_size_max`. Used to view the prepares zone as a slice.
const MessageRaw = extern struct {
    header: vsr.Header,
    body: [constants.message_size_max - @sizeOf(vsr.Header)]u8,

    comptime {
        // The struct must span exactly one slot...
        assert(@sizeOf(MessageRaw) == constants.message_size_max);
        // ...and contain no padding bits.
        assert(@sizeOf(MessageRaw) * 8 == @bitSizeOf(MessageRaw));
    }
};
464
-
465
/// Returns the whole WAL prepares zone, viewed as a slice of raw message slots read
/// directly from the simulated disk (bypassing the read queue, latency and fault injection).
pub fn wal_prepares(storage: *const Storage) []const MessageRaw {
    const zone_start = vsr.Zone.wal_prepares.offset(0);
    const zone_size = vsr.Zone.wal_prepares.size().?;
    const zone_bytes = storage.memory[zone_start..][0..zone_size];
    return mem.bytesAsSlice(MessageRaw, zone_bytes);
}
470
-
471
/// Returns the grid block stored at `address`, read directly from the simulated disk.
///
/// Addresses start at 1. Asserts that the block's first sector has been written, that the
/// block header's checksum is valid, and that its recorded size fits within a block.
pub fn grid_block(
    storage: *const Storage,
    address: u64,
) *align(constants.sector_size) [constants.block_size]u8 {
    assert(address > 0);

    const offset_in_grid = (address - 1) * constants.block_size;
    const start = vsr.Zone.grid.offset(offset_in_grid);
    const header = mem.bytesToValue(
        vsr.Header,
        storage.memory[start..][0..@sizeOf(vsr.Header)],
    );

    const first_sector = @divExact(start, constants.sector_size);
    assert(storage.memory_written.isSet(first_sector));
    assert(header.valid_checksum());
    assert(header.size <= constants.block_size);

    return storage.memory[start..][0..constants.block_size];
}
488
- };
489
-
490
/// Assert that `buffer` is non-empty and that both its address and its length are
/// multiples of the sector size.
fn verify_alignment(buffer: []const u8) void {
    assert(buffer.len != 0);

    // Ensure that the read or write is aligned correctly for Direct I/O:
    // If this is not the case, the underlying syscall will return EINVAL.
    const address = @ptrToInt(buffer.ptr);
    assert(address % constants.sector_size == 0);
    assert(buffer.len % constants.sector_size == 0);
}
498
-
499
/// Identifies one contiguous region of storage within a zone.
pub const Area = union(enum) {
    /// Copy `copy` of the superblock area `area`.
    superblock: struct { area: superblock.Area, copy: u8 },
    /// Sector number `sector` of the WAL headers zone.
    wal_headers: struct { sector: usize },
    /// Slot number `slot` of the WAL prepares zone (one slot is `message_size_max` bytes).
    wal_prepares: struct { slot: usize },
    /// The grid block at `address` (the block for address N starts at block index N - 1).
    grid: struct { address: u64 },

    /// Returns the range of storage sectors covered by `area`.
    fn sectors(area: Area) SectorRange {
        // The switch is an expression and must be returned; as a bare statement its value
        // would be discarded and the function would have no result.
        return switch (area) {
            // NOTE(review): `@field` needs a comptime-known field-name string, but
            // `data.area` is a runtime `superblock.Area` value. This arm likely needs a
            // comptime dispatch over the `superblock.Area` tags — confirm against
            // `superblock.areas` before relying on it.
            .superblock => |data| SectorRange.from_zone(
                .superblock,
                @field(superblock.areas, data.area).offset(data.copy),
                @field(superblock.areas, data.area).size_max,
            ),
            .wal_headers => |data| SectorRange.from_zone(
                .wal_headers,
                constants.sector_size * data.sector,
                constants.sector_size,
            ),
            .wal_prepares => |data| SectorRange.from_zone(
                .wal_prepares,
                constants.message_size_max * data.slot,
                constants.message_size_max,
            ),
            .grid => |data| SectorRange.from_zone(
                .grid,
                constants.block_size * (data.address - 1),
                constants.block_size,
            ),
        };
    }
};
530
-
531
/// A half-open range `[min, max)` of sector indexes, counted from the start of storage.
/// Also usable as an iterator via `next()`, which consumes the range from the front.
const SectorRange = struct {
    min: usize, // inclusive sector index
    max: usize, // exclusive sector index

    /// Range covering `size` bytes starting at `offset_in_zone` within `zone`.
    fn from_zone(
        zone: vsr.Zone,
        offset_in_zone: u64,
        size: usize,
    ) SectorRange {
        const offset_in_storage = zone.offset(offset_in_zone);
        return from_offset(offset_in_storage, size);
    }

    /// Range covering `size` bytes starting at the absolute byte offset `offset_in_storage`.
    /// Both the start and the end must fall exactly on sector boundaries.
    fn from_offset(offset_in_storage: u64, size: usize) SectorRange {
        const first = @divExact(offset_in_storage, constants.sector_size);
        const end = @divExact(offset_in_storage + size, constants.sector_size);
        return SectorRange{ .min = first, .max = end };
    }

    /// Pick a uniformly random sector index within the range.
    fn random(range: SectorRange, rand: std.rand.Random) usize {
        const sector_count = range.max - range.min;
        return range.min + rand.uintLessThan(usize, sector_count);
    }

    /// Pop and return the first sector index of the range, or null once it is empty.
    fn next(range: *SectorRange) ?usize {
        if (range.min == range.max) return null;

        const current = range.min;
        range.min += 1;
        return current;
    }

    /// Returns the overlap of `a` and `b`, or null when they share no sector.
    fn intersect(a: SectorRange, b: SectorRange) ?SectorRange {
        if (a.max <= b.min or b.max <= a.min) return null;

        const low = std.math.max(a.min, b.min);
        const high = std.math.min(a.max, b.max);
        return SectorRange{ .min = low, .max = high };
    }
};
569
-
570
- /// To ensure the cluster can recover, each header/prepare/block must be valid (not faulty) at
571
- /// a majority of replicas.
572
- ///
573
- /// We can't allow WAL storage faults for the same message in a majority of
574
- /// the replicas as that would make recovery impossible. Instead, we only
575
- /// allow faults in certain areas which differ between replicas.
576
- // TODO Support total superblock corruption, forcing a full state transfer.
577
- pub const ClusterFaultAtlas = struct {
578
- pub const Options = struct {
579
- faulty_superblock: bool, // When false, `faulty_superblock()` always returns null.
580
- faulty_wal_headers: bool, // When false, `faulty_wal_headers()` always returns null.
581
- faulty_wal_prepares: bool, // When false, `faulty_wal_prepares()` always returns null.
582
- // TODO grid
583
- };
584
-
585
- /// This is the maximum number of faults per-trailer-area that can be safely injected on a read
586
- /// or write to the superblock zone.
587
- ///
588
- /// It does not include the additional "torn write" fault injected upon a crash.
589
- ///
590
- /// For SuperBlockHeader, checkpoint() and view_change() require 3/4 valid headers (1
591
- /// fault). Trailers are likewise 3/4 + 1 fault — consider if two faults were injected:
592
- /// 1. `SuperBlock.checkpoint()` for sequence=6.
593
- /// - write copy 0, corrupt manifest (fault_count=1)
594
- /// - write copy 1, corrupt manifest (fault_count=2) !
595
- /// 2. Crash. Recover.
596
- /// 3. `SuperBlock.open()`. The highest valid quorum is sequence=6, but there is no
597
- /// valid manifest.
598
- const superblock_trailer_faults_max = @divExact(constants.superblock_copies, 2) - 1; // Half of the copies, minus one.
599
-
600
- comptime {
601
- assert(superblock_trailer_faults_max >= 1); // At least one fault per trailer area must be injectable.
602
- }
603
-
604
- const CopySet = std.StaticBitSet(constants.superblock_copies); // One bit per superblock copy.
605
- const ReplicaSet = std.StaticBitSet(constants.replicas_max); // One bit per replica.
606
- const headers_per_sector = @divExact(constants.sector_size, @sizeOf(vsr.Header)); // Number of `vsr.Header`s in one sector.
607
- const header_sectors = @divExact(constants.journal_slot_count, headers_per_sector); // Sectors needed for one header per journal slot.
608
-
609
- const FaultySuperBlockAreas = std.enums.EnumArray(superblock.Area, CopySet); // For each superblock area, a set of copies.
610
- const FaultyWALHeaders = std.StaticBitSet(@divExact( // One bit per `sector_size` chunk of `journal_size_headers`.
611
- constants.journal_size_headers,
612
- constants.sector_size,
613
- ));
614
-
615
- options: Options,
616
- faulty_superblock_areas: FaultySuperBlockAreas = // For each area, the copies that `faulty_superblock()` reports as faulty.
617
- FaultySuperBlockAreas.initFill(CopySet.initEmpty()),
618
- faulty_wal_header_sectors: [constants.replicas_max]FaultyWALHeaders = // Indexed by replica; a set bit marks that WAL header sector as faulty.
619
- [_]FaultyWALHeaders{FaultyWALHeaders.initEmpty()} ** constants.replicas_max,
620
-
621
- pub fn init(replica_count: u8, random: std.rand.Random, options: Options) ClusterFaultAtlas { // Uses `random` to choose the faulty superblock copies and WAL header sectors.
622
- // If there is only one replica in the cluster, WAL/Grid faults are not recoverable.
623
- assert(replica_count > 1 or options.faulty_wal_headers == false);
624
- assert(replica_count > 1 or options.faulty_wal_prepares == false);
625
-
626
- var atlas = ClusterFaultAtlas{ .options = options };
627
-
628
- for (&atlas.faulty_superblock_areas.values) |*copies, area| { // `area` is the index into the EnumArray's values.
629
- if (area == @enumToInt(superblock.Area.header)) {
630
- // Only inject read/write faults into trailers, not the header.
631
- // This prevents the quorum from being lost like so:
632
- // - copy₀: B (ok)
633
- // - copy₁: B (torn write)
634
- // - copy₂: A (corrupt)
635
- // - copy₃: A (ok)
636
- } else {
637
- var area_faults: usize = 0;
638
- while (area_faults < superblock_trailer_faults_max) : (area_faults += 1) {
639
- copies.set(random.uintLessThan(usize, constants.superblock_copies)); // Picks may repeat, so fewer than the maximum copies may end up marked.
640
- }
641
- }
642
- }
643
-
644
- // A cluster-of-2 is special-cased to mirror the special case in replica.zig.
645
- // See repair_prepare()/on_nack_prepare().
646
- const quorums = vsr.quorums(replica_count);
647
- const faults_max = if (replica_count == 2) 1 else replica_count - quorums.replication; // Number of replicas marked faulty for each WAL header sector.
648
- assert(faults_max < replica_count);
649
- assert(faults_max > 0 or replica_count == 1);
650
-
651
- var wal_header_sectors = [_]ReplicaSet{ReplicaSet.initEmpty()} ** header_sectors; // Scratch: per header sector, the replicas already marked faulty.
652
- for (wal_header_sectors) |*wal_header_sector, sector| {
653
- while (wal_header_sector.count() < faults_max) { // Keep drawing until `faults_max` distinct replicas are marked for this sector.
654
- const replica_index = random.uintLessThan(u8, replica_count);
655
- wal_header_sector.set(replica_index);
656
- atlas.faulty_wal_header_sectors[replica_index].set(sector);
657
- }
658
- }
659
-
660
- return atlas;
661
- }
662
-
663
- /// Returns the sector range of the specified access when the superblock area it starts at is marked faulty for the copy it falls in; otherwise (or when superblock faults are disabled) returns null.
664
- fn faulty_superblock(
665
- atlas: ClusterFaultAtlas,
666
- replica_index: usize,
667
- offset_in_zone: u64,
668
- size: u64,
669
- ) ?SectorRange {
670
- _ = replica_index; // Unused: `faulty_superblock_areas` is keyed by area and copy, not by replica.
671
- if (!atlas.options.faulty_superblock) return null;
672
-
673
- const copy = @divFloor(offset_in_zone, superblock.superblock_copy_size); // Index of the superblock copy containing the offset.
674
- const offset_in_copy = offset_in_zone % superblock.superblock_copy_size; // Offset relative to the start of that copy.
675
- const area: superblock.Area = switch (offset_in_copy) { // The access must start exactly at the base of an area.
676
- superblock.areas.header.base => .header,
677
- superblock.areas.manifest.base => .manifest,
678
- superblock.areas.free_set.base => .free_set,
679
- superblock.areas.client_table.base => .client_table,
680
- else => unreachable,
681
- };
682
-
683
- if (atlas.faulty_superblock_areas.get(area).isSet(copy)) {
684
- return SectorRange.from_zone(.superblock, offset_in_zone, size);
685
- } else {
686
- return null;
687
- }
688
- }
689
-
690
- /// Returns the first run of faulty WAL header sectors of `replica_index` which intersects the specified range, or null if there is none or WAL header faults are disabled.
691
- fn faulty_wal_headers(
692
- atlas: ClusterFaultAtlas,
693
- replica_index: usize,
694
- offset_in_zone: u64,
695
- size: u64,
696
- ) ?SectorRange {
697
- if (!atlas.options.faulty_wal_headers) return null;
698
- return faulty_sectors(
699
- FaultyWALHeaders.bit_length,
700
- constants.sector_size, // Each bit covers one sector of the headers zone.
701
- .wal_headers,
702
- &atlas.faulty_wal_header_sectors[replica_index],
703
- offset_in_zone,
704
- size,
705
- );
706
- }
707
-
708
- /// Returns the first run of faulty WAL prepare chunks of `replica_index` which intersects the specified range, or null if there is none or WAL prepare faults are disabled.
709
- fn faulty_wal_prepares(
710
- atlas: ClusterFaultAtlas,
711
- replica_index: usize,
712
- offset_in_zone: u64,
713
- size: u64,
714
- ) ?SectorRange {
715
- if (!atlas.options.faulty_wal_prepares) return null;
716
- return faulty_sectors(
717
- FaultyWALHeaders.bit_length,
718
- constants.message_size_max * headers_per_sector, // Each bit covers `headers_per_sector` maximum-size messages.
719
- .wal_prepares,
720
- &atlas.faulty_wal_header_sectors[replica_index], // Reuses the same bitset as `faulty_wal_headers()`.
721
- offset_in_zone,
722
- size,
723
- );
724
- }
725
-
726
- fn faulty_sectors( // Finds the first run of consecutive faulty chunks overlapping the range starting at `offset_in_zone` of length `size`; null if there is none.
727
- comptime chunk_count: usize, // Number of bits in `faulty_chunks`.
728
- comptime chunk_size: usize, // Bytes of the zone covered by each bit.
729
- comptime zone: vsr.Zone, // Zone passed through to `SectorRange.from_zone`.
730
- faulty_chunks: *const std.StaticBitSet(chunk_count),
731
- offset_in_zone: u64,
732
- size: u64,
733
- ) ?SectorRange {
734
- var fault_start: ?usize = null;
735
- var fault_count: usize = 0;
736
-
737
- var chunk: usize = @divFloor(offset_in_zone, chunk_size); // First chunk touched by the range.
738
- while (chunk * chunk_size < offset_in_zone + size) : (chunk += 1) { // Visit every chunk that starts before the end of the range.
739
- if (faulty_chunks.isSet(chunk)) {
740
- if (fault_start == null) fault_start = chunk; // Remember where the faulty run begins.
741
- fault_count += 1;
742
- } else {
743
- if (fault_start != null) break; // The run has ended; any later faulty chunks are not reported.
744
- }
745
- }
746
-
747
- if (fault_start) |start| {
748
- return SectorRange.from_zone(
749
- zone,
750
- chunk_size * start,
751
- chunk_size * fault_count,
752
- ).intersect(SectorRange.from_zone(zone, offset_in_zone, size)).?; // Clip the run to the requested range; `.?` asserts that they overlap.
753
- } else {
754
- return null;
755
- }
756
- }
757
- };