tigerbeetle-node 0.11.13 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. package/README.md +5 -10
  2. package/dist/bin/aarch64-linux-gnu/client.node +0 -0
  3. package/dist/bin/aarch64-linux-musl/client.node +0 -0
  4. package/dist/bin/aarch64-macos/client.node +0 -0
  5. package/dist/bin/x86_64-linux-gnu/client.node +0 -0
  6. package/dist/bin/x86_64-linux-musl/client.node +0 -0
  7. package/dist/bin/x86_64-macos/client.node +0 -0
  8. package/dist/index.js +33 -1
  9. package/dist/index.js.map +1 -1
  10. package/package-lock.json +66 -0
  11. package/package.json +6 -16
  12. package/src/index.ts +56 -1
  13. package/src/node.zig +9 -9
  14. package/dist/.client.node.sha256 +0 -1
  15. package/scripts/build_lib.sh +0 -61
  16. package/scripts/download_node_headers.sh +0 -32
  17. package/src/tigerbeetle/scripts/benchmark.bat +0 -55
  18. package/src/tigerbeetle/scripts/benchmark.sh +0 -66
  19. package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
  20. package/src/tigerbeetle/scripts/fail_on_diff.sh +0 -9
  21. package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
  22. package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +0 -12
  23. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
  24. package/src/tigerbeetle/scripts/install.bat +0 -7
  25. package/src/tigerbeetle/scripts/install.sh +0 -21
  26. package/src/tigerbeetle/scripts/install_zig.bat +0 -113
  27. package/src/tigerbeetle/scripts/install_zig.sh +0 -90
  28. package/src/tigerbeetle/scripts/lint.zig +0 -199
  29. package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
  30. package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -55
  31. package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
  32. package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
  33. package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +0 -9
  34. package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
  35. package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +0 -12
  36. package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
  37. package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
  38. package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
  39. package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
  40. package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
  41. package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
  42. package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
  43. package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
  44. package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
  45. package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
  46. package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
  47. package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
  48. package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
  49. package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
  50. package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
  51. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
  52. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
  53. package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
  54. package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
  55. package/src/tigerbeetle/src/benchmark.zig +0 -336
  56. package/src/tigerbeetle/src/config.zig +0 -233
  57. package/src/tigerbeetle/src/constants.zig +0 -428
  58. package/src/tigerbeetle/src/ewah.zig +0 -286
  59. package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
  60. package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
  61. package/src/tigerbeetle/src/fifo.zig +0 -120
  62. package/src/tigerbeetle/src/io/benchmark.zig +0 -213
  63. package/src/tigerbeetle/src/io/darwin.zig +0 -814
  64. package/src/tigerbeetle/src/io/linux.zig +0 -1071
  65. package/src/tigerbeetle/src/io/test.zig +0 -643
  66. package/src/tigerbeetle/src/io/windows.zig +0 -1183
  67. package/src/tigerbeetle/src/io.zig +0 -34
  68. package/src/tigerbeetle/src/iops.zig +0 -107
  69. package/src/tigerbeetle/src/lsm/README.md +0 -308
  70. package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
  71. package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
  72. package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
  73. package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
  74. package/src/tigerbeetle/src/lsm/direction.zig +0 -11
  75. package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
  76. package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
  77. package/src/tigerbeetle/src/lsm/forest.zig +0 -205
  78. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -450
  79. package/src/tigerbeetle/src/lsm/grid.zig +0 -573
  80. package/src/tigerbeetle/src/lsm/groove.zig +0 -1036
  81. package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
  82. package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
  83. package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
  84. package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -878
  85. package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
  86. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
  87. package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
  88. package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
  89. package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -381
  90. package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1329
  91. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
  92. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
  93. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
  94. package/src/tigerbeetle/src/lsm/table.zig +0 -1009
  95. package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -192
  96. package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
  97. package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -203
  98. package/src/tigerbeetle/src/lsm/test.zig +0 -439
  99. package/src/tigerbeetle/src/lsm/tree.zig +0 -1169
  100. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -479
  101. package/src/tigerbeetle/src/message_bus.zig +0 -1013
  102. package/src/tigerbeetle/src/message_pool.zig +0 -156
  103. package/src/tigerbeetle/src/ring_buffer.zig +0 -399
  104. package/src/tigerbeetle/src/simulator.zig +0 -580
  105. package/src/tigerbeetle/src/state_machine/auditor.zig +0 -578
  106. package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
  107. package/src/tigerbeetle/src/state_machine.zig +0 -2099
  108. package/src/tigerbeetle/src/static_allocator.zig +0 -65
  109. package/src/tigerbeetle/src/stdx.zig +0 -171
  110. package/src/tigerbeetle/src/storage.zig +0 -393
  111. package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
  112. package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
  113. package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
  114. package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
  115. package/src/tigerbeetle/src/testing/cluster.zig +0 -444
  116. package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
  117. package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
  118. package/src/tigerbeetle/src/testing/id.zig +0 -99
  119. package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -374
  120. package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
  121. package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
  122. package/src/tigerbeetle/src/testing/state_machine.zig +0 -250
  123. package/src/tigerbeetle/src/testing/storage.zig +0 -757
  124. package/src/tigerbeetle/src/testing/table.zig +0 -247
  125. package/src/tigerbeetle/src/testing/time.zig +0 -84
  126. package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
  127. package/src/tigerbeetle/src/time.zig +0 -112
  128. package/src/tigerbeetle/src/tracer.zig +0 -529
  129. package/src/tigerbeetle/src/unit_tests.zig +0 -40
  130. package/src/tigerbeetle/src/vopr.zig +0 -495
  131. package/src/tigerbeetle/src/vsr/README.md +0 -209
  132. package/src/tigerbeetle/src/vsr/client.zig +0 -544
  133. package/src/tigerbeetle/src/vsr/clock.zig +0 -855
  134. package/src/tigerbeetle/src/vsr/journal.zig +0 -2415
  135. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
  136. package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
  137. package/src/tigerbeetle/src/vsr/replica.zig +0 -6616
  138. package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
  139. package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
  140. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
  141. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
  142. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
  143. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
  144. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
  145. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
  146. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
  147. package/src/tigerbeetle/src/vsr.zig +0 -1425
@@ -1,757 +0,0 @@
1
- //! In-memory storage, with simulated faults and latency.
2
- //!
3
- //!
4
- //! Fault Injection
5
- //!
6
- //! Storage injects faults that the cluster can (i.e. should be able to) recover from.
7
- //! Each zone can tolerate a different pattern of faults.
8
- //!
9
- //! - superblock:
10
- //! - One read/write fault is permitted per area (section, manifest, …).
11
- //! - An additional fault is permitted at the target of a pending write during a crash.
12
- //!
13
- //! - wal_headers, wal_prepares:
14
- //! - Read/write faults are distributed between replicas according to ClusterFaultAtlas, to ensure
15
- //! that at least one replica will have a valid copy to help others repair.
16
- //! (See: generate_faulty_wal_areas()).
17
- //! - When a replica crashes, it may fault the WAL outside of ClusterFaultAtlas.
18
- //! - When replica_count=1, its WAL can only be corrupted by a crash, never a read/write.
19
- //! (When replica_count=1, there are no other replicas to assist with repair).
20
- //!
21
- //! - grid: (TODO: Enable grid faults when grid repair is implemented).
22
- //!
23
- const std = @import("std");
24
- const assert = std.debug.assert;
25
- const math = std.math;
26
- const mem = std.mem;
27
-
28
- const FIFO = @import("../fifo.zig").FIFO;
29
- const constants = @import("../constants.zig");
30
- const vsr = @import("../vsr.zig");
31
- const superblock = @import("../vsr/superblock.zig");
32
- const BlockType = @import("../lsm/grid.zig").BlockType;
33
- const stdx = @import("../stdx.zig");
34
- const PriorityQueue = @import("./priority_queue.zig").PriorityQueue;
35
- const fuzz = @import("./fuzz.zig");
36
- const hash_log = @import("./hash_log.zig");
37
-
38
- const log = std.log.scoped(.storage);
39
-
40
- // TODOs:
41
- // less than a majority of replicas may have corruption
42
- // have an option to enable/disable the following corruption types:
43
- // bitrot
44
- // misdirected read/write
45
- // corrupt sector
46
- // latent sector error
47
- // - emulate by zeroing sector, as this is how we handle this in the real Storage implementation
48
- // - likely that surrounding sectors also corrupt
49
- // - likely that stuff written at the same time is also corrupt even if written to a far away sector
50
- pub const Storage = struct {
51
- /// Options for fault injection during fuzz testing
52
- pub const Options = struct {
53
- /// Seed for the storage PRNG.
54
- seed: u64 = 0,
55
-
56
- /// Required when `fault_atlas` is set.
57
- replica_index: ?u8 = null,
58
-
59
- /// Minimum number of ticks it may take to read data.
60
- read_latency_min: u64,
61
- /// Average number of ticks it may take to read data. Must be >= read_latency_min.
62
- read_latency_mean: u64,
63
- /// Minimum number of ticks it may take to write data.
64
- write_latency_min: u64,
65
- /// Average number of ticks it may take to write data. Must be >= write_latency_min.
66
- write_latency_mean: u64,
67
-
68
- /// Chance out of 100 that a read will corrupt a sector, if the target memory is within
69
- /// a faulty area of this replica.
70
- read_fault_probability: u8 = 0,
71
- /// Chance out of 100 that a write will corrupt a sector, if the target memory is within
72
- /// a faulty area of this replica.
73
- write_fault_probability: u8 = 0,
74
- /// Chance out of 100 that a crash will corrupt a sector of a pending write's target,
75
- /// if the target memory is within a faulty area of this replica.
76
- crash_fault_probability: u8 = 0,
77
-
78
- /// Enable/disable automatic read/write faults.
79
- /// Does not impact crash faults or manual faults.
80
- fault_atlas: ?*const ClusterFaultAtlas = null,
81
- };
82
-
83
- /// See usage in Journal.write_sectors() for details.
84
- /// TODO: allow testing in both modes.
85
- pub const synchronicity: enum {
86
- always_synchronous,
87
- always_asynchronous,
88
- } = .always_asynchronous;
89
-
90
- pub const Read = struct {
91
- callback: fn (read: *Storage.Read) void,
92
- buffer: []u8,
93
- zone: vsr.Zone,
94
- /// Relative offset within the zone.
95
- offset: u64,
96
- /// Tick at which this read is considered "completed" and the callback should be called.
97
- done_at_tick: u64,
98
-
99
- fn less_than(context: void, a: *Read, b: *Read) math.Order {
100
- _ = context;
101
-
102
- return math.order(a.done_at_tick, b.done_at_tick);
103
- }
104
- };
105
-
106
- pub const Write = struct {
107
- callback: fn (write: *Storage.Write) void,
108
- buffer: []const u8,
109
- zone: vsr.Zone,
110
- /// Relative offset within the zone.
111
- offset: u64,
112
- /// Tick at which this write is considered "completed" and the callback should be called.
113
- done_at_tick: u64,
114
-
115
- fn less_than(context: void, a: *Write, b: *Write) math.Order {
116
- _ = context;
117
-
118
- return math.order(a.done_at_tick, b.done_at_tick);
119
- }
120
- };
121
-
122
- pub const NextTick = struct {
123
- next: ?*NextTick = null,
124
- callback: fn (next_tick: *NextTick) void,
125
- };
126
-
127
- allocator: mem.Allocator,
128
-
129
- size: u64,
130
- options: Options,
131
- prng: std.rand.DefaultPrng,
132
-
133
- memory: []align(constants.sector_size) u8,
134
- /// Set bits correspond to sectors that have ever been written to.
135
- memory_written: std.DynamicBitSetUnmanaged,
136
- /// Set bits correspond to faulty sectors. The underlying sectors of `memory` is left clean.
137
- faults: std.DynamicBitSetUnmanaged,
138
-
139
- /// Whether to enable faults (when false, this supersedes `faulty_wal_areas`).
140
- /// This is used to disable faults during the replica's first startup.
141
- faulty: bool = true,
142
-
143
- reads: PriorityQueue(*Storage.Read, void, Storage.Read.less_than),
144
- writes: PriorityQueue(*Storage.Write, void, Storage.Write.less_than),
145
-
146
- ticks: u64 = 0,
147
- next_tick_queue: FIFO(NextTick) = .{},
148
-
149
- pub fn init(allocator: mem.Allocator, size: u64, options: Storage.Options) !Storage {
150
- assert(options.write_latency_mean >= options.write_latency_min);
151
- assert(options.read_latency_mean >= options.read_latency_min);
152
- assert(options.fault_atlas == null or options.replica_index != null);
153
-
154
- var prng = std.rand.DefaultPrng.init(options.seed);
155
- const sector_count = @divExact(size, constants.sector_size);
156
- const memory = try allocator.allocAdvanced(u8, constants.sector_size, size, .exact);
157
- errdefer allocator.free(memory);
158
- // TODO: random data
159
- mem.set(u8, memory, 0);
160
-
161
- var memory_written = try std.DynamicBitSetUnmanaged.initEmpty(allocator, sector_count);
162
- errdefer memory_written.deinit(allocator);
163
-
164
- var faults = try std.DynamicBitSetUnmanaged.initEmpty(allocator, sector_count);
165
- errdefer faults.deinit(allocator);
166
-
167
- var reads = PriorityQueue(*Storage.Read, void, Storage.Read.less_than).init(allocator, {});
168
- errdefer reads.deinit();
169
- try reads.ensureTotalCapacity(constants.iops_read_max);
170
-
171
- var writes = PriorityQueue(*Storage.Write, void, Storage.Write.less_than).init(allocator, {});
172
- errdefer writes.deinit();
173
- try writes.ensureTotalCapacity(constants.iops_write_max);
174
-
175
- return Storage{
176
- .allocator = allocator,
177
- .size = size,
178
- .options = options,
179
- .prng = prng,
180
- .memory = memory,
181
- .memory_written = memory_written,
182
- .faults = faults,
183
- .reads = reads,
184
- .writes = writes,
185
- };
186
- }
187
-
188
- pub fn deinit(storage: *Storage, allocator: mem.Allocator) void {
189
- allocator.free(storage.memory);
190
- storage.memory_written.deinit(allocator);
191
- storage.faults.deinit(allocator);
192
- storage.reads.deinit();
193
- storage.writes.deinit();
194
- }
195
-
196
- /// Cancel any currently in-progress reads/writes.
197
- /// Corrupt the target sectors of any in-progress writes.
198
- pub fn reset(storage: *Storage) void {
199
- while (storage.writes.peek()) |_| {
200
- const write = storage.writes.remove();
201
- if (!storage.x_in_100(storage.options.crash_fault_probability)) continue;
202
-
203
- // Randomly corrupt one of the faulty sectors the operation targeted.
204
- // TODO: inject more realistic and varied storage faults as described above.
205
- const sectors = SectorRange.from_zone(write.zone, write.offset, write.buffer.len);
206
- storage.fault_sector(write.zone, sectors.random(storage.prng.random()));
207
- }
208
- assert(storage.writes.len == 0);
209
-
210
- storage.reads.len = 0;
211
- storage.next_tick_queue = .{};
212
- }
213
-
214
- /// Returns the number of bytes that have been written to, assuming that (the simulated)
215
- /// `fallocate()` creates a sparse file.
216
- pub fn size_used(storage: *const Storage) usize {
217
- return storage.memory_written.count() * constants.sector_size;
218
- }
219
-
220
- /// Copy state from `origin` to `storage`:
221
- ///
222
- /// - ticks
223
- /// - memory
224
- /// - occupied memory
225
- /// - faulty sectors
226
- /// - reads in-progress
227
- /// - writes in-progress
228
- ///
229
- /// Both instances must have an identical size.
230
- pub fn copy(storage: *Storage, origin: *const Storage) void {
231
- assert(storage.size == origin.size);
232
-
233
- storage.ticks = origin.ticks;
234
- stdx.copy_disjoint(.exact, u8, storage.memory, origin.memory);
235
- storage.memory_written.toggleSet(storage.memory_written);
236
- storage.memory_written.toggleSet(origin.memory_written);
237
- storage.faults.toggleSet(storage.faults);
238
- storage.faults.toggleSet(origin.faults);
239
-
240
- storage.reads.len = 0;
241
- for (origin.reads.items[0..origin.reads.len]) |read| {
242
- storage.reads.add(read) catch unreachable;
243
- }
244
-
245
- storage.writes.len = 0;
246
- for (origin.writes.items[0..origin.writes.len]) |write| {
247
- storage.writes.add(write) catch unreachable;
248
- }
249
- }
250
-
251
- pub fn tick(storage: *Storage) void {
252
- storage.ticks += 1;
253
-
254
- while (storage.reads.peek()) |read| {
255
- if (read.done_at_tick > storage.ticks) break;
256
- _ = storage.reads.remove();
257
- storage.read_sectors_finish(read);
258
- }
259
-
260
- while (storage.writes.peek()) |write| {
261
- if (write.done_at_tick > storage.ticks) break;
262
- _ = storage.writes.remove();
263
- storage.write_sectors_finish(write);
264
- }
265
-
266
- while (storage.next_tick_queue.pop()) |next_tick| {
267
- next_tick.callback(next_tick);
268
- }
269
- }
270
-
271
- pub fn on_next_tick(
272
- storage: *Storage,
273
- callback: fn (next_tick: *Storage.NextTick) void,
274
- next_tick: *Storage.NextTick,
275
- ) void {
276
- next_tick.* = .{ .callback = callback };
277
- storage.next_tick_queue.push(next_tick);
278
- }
279
-
280
- /// * Verifies that the read fits within the target sector.
281
- /// * Verifies that the read targets sectors that have been written to.
282
- pub fn read_sectors(
283
- storage: *Storage,
284
- callback: fn (read: *Storage.Read) void,
285
- read: *Storage.Read,
286
- buffer: []u8,
287
- zone: vsr.Zone,
288
- offset_in_zone: u64,
289
- ) void {
290
- hash_log.emit_autohash(.{ buffer, zone, offset_in_zone }, .DeepRecursive);
291
-
292
- verify_alignment(buffer);
293
-
294
- var sectors = SectorRange.from_zone(zone, offset_in_zone, buffer.len);
295
- while (sectors.next()) |sector| assert(storage.memory_written.isSet(sector));
296
-
297
- read.* = .{
298
- .callback = callback,
299
- .buffer = buffer,
300
- .zone = zone,
301
- .offset = offset_in_zone,
302
- .done_at_tick = storage.ticks + storage.read_latency(),
303
- };
304
-
305
- // We ensure the capacity is sufficient for constants.iops_read_max in init()
306
- storage.reads.add(read) catch unreachable;
307
- }
308
-
309
- fn read_sectors_finish(storage: *Storage, read: *Storage.Read) void {
310
- hash_log.emit_autohash(.{ read.buffer, read.zone, read.offset }, .DeepRecursive);
311
-
312
- const offset_in_storage = read.zone.offset(read.offset);
313
- stdx.copy_disjoint(
314
- .exact,
315
- u8,
316
- read.buffer,
317
- storage.memory[offset_in_storage..][0..read.buffer.len],
318
- );
319
-
320
- if (storage.x_in_100(storage.options.read_fault_probability)) {
321
- storage.fault_faulty_sectors(read.zone, read.offset, read.buffer.len);
322
- }
323
-
324
- if (storage.faulty) {
325
- // Corrupt faulty sectors.
326
- var sectors = SectorRange.from_zone(read.zone, read.offset, read.buffer.len);
327
- const sectors_min = sectors.min;
328
- while (sectors.next()) |sector| {
329
- if (storage.faults.isSet(sector)) {
330
- const faulty_sector_offset = (sector - sectors_min) * constants.sector_size;
331
- const faulty_sector_bytes = read.buffer[faulty_sector_offset..][0..constants.sector_size];
332
- storage.prng.random().bytes(faulty_sector_bytes);
333
- }
334
- }
335
- }
336
-
337
- read.callback(read);
338
- }
339
-
340
- pub fn write_sectors(
341
- storage: *Storage,
342
- callback: fn (write: *Storage.Write) void,
343
- write: *Storage.Write,
344
- buffer: []const u8,
345
- zone: vsr.Zone,
346
- offset_in_zone: u64,
347
- ) void {
348
- hash_log.emit_autohash(.{ buffer, zone, offset_in_zone }, .DeepRecursive);
349
-
350
- verify_alignment(buffer);
351
-
352
- // Verify that there are no concurrent overlapping writes.
353
- var iterator = storage.writes.iterator();
354
- while (iterator.next()) |other| {
355
- if (other.zone != zone) continue;
356
- assert(offset_in_zone + buffer.len <= other.offset or
357
- other.offset + other.buffer.len <= offset_in_zone);
358
- }
359
-
360
- write.* = .{
361
- .callback = callback,
362
- .buffer = buffer,
363
- .zone = zone,
364
- .offset = offset_in_zone,
365
- .done_at_tick = storage.ticks + storage.write_latency(),
366
- };
367
-
368
- // We ensure the capacity is sufficient for constants.iops_write_max in init()
369
- storage.writes.add(write) catch unreachable;
370
- }
371
-
372
- fn write_sectors_finish(storage: *Storage, write: *Storage.Write) void {
373
- hash_log.emit_autohash(.{ write.buffer, write.zone, write.offset }, .DeepRecursive);
374
-
375
- const offset_in_storage = write.zone.offset(write.offset);
376
- stdx.copy_disjoint(
377
- .exact,
378
- u8,
379
- storage.memory[offset_in_storage..][0..write.buffer.len],
380
- write.buffer,
381
- );
382
-
383
- var sectors = SectorRange.from_zone(write.zone, write.offset, write.buffer.len);
384
- while (sectors.next()) |sector| {
385
- storage.faults.unset(sector);
386
- storage.memory_written.set(sector);
387
- }
388
-
389
- if (storage.x_in_100(storage.options.write_fault_probability)) {
390
- storage.fault_faulty_sectors(write.zone, write.offset, write.buffer.len);
391
- }
392
-
393
- write.callback(write);
394
- }
395
-
396
- fn read_latency(storage: *Storage) u64 {
397
- return storage.latency(storage.options.read_latency_min, storage.options.read_latency_mean);
398
- }
399
-
400
- fn write_latency(storage: *Storage) u64 {
401
- return storage.latency(storage.options.write_latency_min, storage.options.write_latency_mean);
402
- }
403
-
404
- fn latency(storage: *Storage, min: u64, mean: u64) u64 {
405
- return min + fuzz.random_int_exponential(storage.prng.random(), u64, mean - min);
406
- }
407
-
408
- /// Return true with probability x/100.
409
- fn x_in_100(storage: *Storage, x: u8) bool {
410
- assert(x <= 100);
411
- return x > storage.prng.random().uintLessThan(u8, 100);
412
- }
413
-
414
- fn fault_faulty_sectors(storage: *Storage, zone: vsr.Zone, offset_in_zone: u64, size: u64) void {
415
- const atlas = storage.options.fault_atlas orelse return;
416
- const replica_index = storage.options.replica_index.?;
417
- const faulty_sectors = switch (zone) {
418
- .superblock => atlas.faulty_superblock(replica_index, offset_in_zone, size),
419
- .wal_headers => atlas.faulty_wal_headers(replica_index, offset_in_zone, size),
420
- .wal_prepares => atlas.faulty_wal_prepares(replica_index, offset_in_zone, size),
421
- .grid => null,
422
- } orelse return;
423
-
424
- // Randomly corrupt one of the faulty sectors the operation targeted.
425
- // TODO: inject more realistic and varied storage faults as described above.
426
- storage.fault_sector(zone, faulty_sectors.random(storage.prng.random()));
427
- }
428
-
429
- fn fault_sector(storage: *Storage, zone: vsr.Zone, sector: usize) void {
430
- storage.faults.set(sector);
431
- if (storage.options.replica_index) |replica_index| {
432
- log.debug("{}: corrupting sector at zone={} offset={}", .{
433
- replica_index,
434
- zone,
435
- sector * constants.sector_size - zone.offset(0),
436
- });
437
- }
438
- }
439
-
440
- pub fn superblock_header(
441
- storage: *const Storage,
442
- copy_: u8,
443
- ) *const superblock.SuperBlockHeader {
444
- const offset = vsr.Zone.superblock.offset(superblock.areas.header.offset(copy_));
445
- const bytes = storage.memory[offset..][0..superblock.areas.header.size_max];
446
- return mem.bytesAsValue(superblock.SuperBlockHeader, bytes);
447
- }
448
-
449
- pub fn wal_headers(storage: *const Storage) []const vsr.Header {
450
- const offset = vsr.Zone.wal_headers.offset(0);
451
- const size = vsr.Zone.wal_headers.size().?;
452
- return mem.bytesAsSlice(vsr.Header, storage.memory[offset..][0..size]);
453
- }
454
-
455
- const MessageRaw = extern struct {
456
- header: vsr.Header,
457
- body: [constants.message_size_max - @sizeOf(vsr.Header)]u8,
458
-
459
- comptime {
460
- assert(@sizeOf(MessageRaw) == constants.message_size_max);
461
- assert(@sizeOf(MessageRaw) * 8 == @bitSizeOf(MessageRaw));
462
- }
463
- };
464
-
465
- pub fn wal_prepares(storage: *const Storage) []const MessageRaw {
466
- const offset = vsr.Zone.wal_prepares.offset(0);
467
- const size = vsr.Zone.wal_prepares.size().?;
468
- return mem.bytesAsSlice(MessageRaw, storage.memory[offset..][0..size]);
469
- }
470
-
471
- pub fn grid_block(
472
- storage: *const Storage,
473
- address: u64,
474
- ) *align(constants.sector_size) [constants.block_size]u8 {
475
- assert(address > 0);
476
-
477
- const block_offset = vsr.Zone.grid.offset((address - 1) * constants.block_size);
478
- const block_header = mem.bytesToValue(
479
- vsr.Header,
480
- storage.memory[block_offset..][0..@sizeOf(vsr.Header)],
481
- );
482
- assert(storage.memory_written.isSet(@divExact(block_offset, constants.sector_size)));
483
- assert(block_header.valid_checksum());
484
- assert(block_header.size <= constants.block_size);
485
-
486
- return storage.memory[block_offset..][0..constants.block_size];
487
- }
488
- };
489
-
490
- fn verify_alignment(buffer: []const u8) void {
491
- assert(buffer.len > 0);
492
-
493
- // Ensure that the read or write is aligned correctly for Direct I/O:
494
- // If this is not the case, the underlying syscall will return EINVAL.
495
- assert(@mod(@ptrToInt(buffer.ptr), constants.sector_size) == 0);
496
- assert(@mod(buffer.len, constants.sector_size) == 0);
497
- }
498
-
499
- pub const Area = union(enum) {
500
- superblock: struct { area: superblock.Area, copy: u8 },
501
- wal_headers: struct { sector: usize },
502
- wal_prepares: struct { slot: usize },
503
- grid: struct { address: u64 },
504
-
505
- fn sectors(area: Area) SectorRange {
506
- switch (area) {
507
- .superblock => |data| SectorRange.from_zone(
508
- .superblock,
509
- @field(superblock.areas, data.area).offset(data.copy),
510
- @field(superblock.areas, data.area).size_max,
511
- ),
512
- .wal_headers => |data| SectorRange.from_zone(
513
- .wal_headers,
514
- constants.sector_size * data.sector,
515
- constants.sector_size,
516
- ),
517
- .wal_prepares => |data| SectorRange.from_zone(
518
- .wal_prepares,
519
- constants.message_size_max * data.slot,
520
- constants.message_size_max,
521
- ),
522
- .grid => |data| SectorRange.from_zone(
523
- .grid,
524
- constants.block_size * (data.address - 1),
525
- constants.block_size,
526
- ),
527
- }
528
- }
529
- };
530
-
531
- const SectorRange = struct {
532
- min: usize, // inclusive sector index
533
- max: usize, // exclusive sector index
534
-
535
- fn from_zone(
536
- zone: vsr.Zone,
537
- offset_in_zone: u64,
538
- size: usize,
539
- ) SectorRange {
540
- return from_offset(zone.offset(offset_in_zone), size);
541
- }
542
-
543
- fn from_offset(offset_in_storage: u64, size: usize) SectorRange {
544
- return .{
545
- .min = @divExact(offset_in_storage, constants.sector_size),
546
- .max = @divExact(offset_in_storage + size, constants.sector_size),
547
- };
548
- }
549
-
550
- fn random(range: SectorRange, rand: std.rand.Random) usize {
551
- return range.min + rand.uintLessThan(usize, range.max - range.min);
552
- }
553
-
554
- fn next(range: *SectorRange) ?usize {
555
- if (range.min == range.max) return null;
556
- defer range.min += 1;
557
- return range.min;
558
- }
559
-
560
- fn intersect(a: SectorRange, b: SectorRange) ?SectorRange {
561
- if (a.max <= b.min) return null;
562
- if (b.max <= a.min) return null;
563
- return SectorRange{
564
- .min = std.math.max(a.min, b.min),
565
- .max = std.math.min(a.max, b.max),
566
- };
567
- }
568
- };
569
-
570
/// To ensure the cluster can recover, each header/prepare/block must be valid (not faulty) at
/// a majority of replicas.
///
/// We can't allow WAL storage faults for the same message in a majority of
/// the replicas as that would make recovery impossible. Instead, we only
/// allow faults in certain areas which differ between replicas.
// TODO Support total superblock corruption, forcing a full state transfer.
pub const ClusterFaultAtlas = struct {
    /// Per-zone toggles controlling where faults may be injected.
    pub const Options = struct {
        faulty_superblock: bool,
        faulty_wal_headers: bool,
        faulty_wal_prepares: bool,
        // TODO grid
    };

    /// This is the maximum number of faults per-trailer-area that can be safely injected on a read
    /// or write to the superblock zone.
    ///
    /// It does not include the additional "torn write" fault injected upon a crash.
    ///
    /// For SuperBlockHeader, checkpoint() and view_change() require 3/4 valid headers (1
    /// fault). Trailers are likewise 3/4 + 1 fault — consider if two faults were injected:
    /// 1. `SuperBlock.checkpoint()` for sequence=6.
    ///    - write copy 0, corrupt manifest (fault_count=1)
    ///    - write copy 1, corrupt manifest (fault_count=2) !
    /// 2. Crash. Recover.
    /// 3. `SuperBlock.open()`. The highest valid quorum is sequence=6, but there is no
    ///    valid manifest.
    const superblock_trailer_faults_max = @divExact(constants.superblock_copies, 2) - 1;

    comptime {
        // The atlas must be able to inject at least one trailer fault to be useful.
        assert(superblock_trailer_faults_max >= 1);
    }

    // One bit per superblock copy.
    const CopySet = std.StaticBitSet(constants.superblock_copies);
    // One bit per replica in the cluster.
    const ReplicaSet = std.StaticBitSet(constants.replicas_max);
    // How many vsr.Headers fit in a single disk sector.
    const headers_per_sector = @divExact(constants.sector_size, @sizeOf(vsr.Header));
    // How many sectors the WAL's header ring occupies.
    const header_sectors = @divExact(constants.journal_slot_count, headers_per_sector);

    // For each superblock area, the set of copies whose copy of that area may be corrupted.
    const FaultySuperBlockAreas = std.enums.EnumArray(superblock.Area, CopySet);
    // One bit per sector of the WAL headers zone.
    const FaultyWALHeaders = std.StaticBitSet(@divExact(
        constants.journal_size_headers,
        constants.sector_size,
    ));

    options: Options,
    faulty_superblock_areas: FaultySuperBlockAreas =
        FaultySuperBlockAreas.initFill(CopySet.initEmpty()),
    // Per replica: which WAL header sectors may be faulty. The same sector choices also
    // drive the faulty prepare ranges — see faulty_wal_prepares().
    faulty_wal_header_sectors: [constants.replicas_max]FaultyWALHeaders =
        [_]FaultyWALHeaders{FaultyWALHeaders.initEmpty()} ** constants.replicas_max,

    /// Randomly choose which superblock copies/areas and which per-replica WAL sectors are
    /// permitted to be corrupted, such that no single message can be faulty at enough
    /// replicas to make recovery impossible.
    pub fn init(replica_count: u8, random: std.rand.Random, options: Options) ClusterFaultAtlas {
        // If there is only one replica in the cluster, WAL/Grid faults are not recoverable.
        assert(replica_count > 1 or options.faulty_wal_headers == false);
        assert(replica_count > 1 or options.faulty_wal_prepares == false);

        var atlas = ClusterFaultAtlas{ .options = options };

        // Pick up to superblock_trailer_faults_max faulty copies for each trailer area.
        // (Duplicate random picks are possible, so fewer copies may end up set.)
        for (&atlas.faulty_superblock_areas.values) |*copies, area| {
            if (area == @enumToInt(superblock.Area.header)) {
                // Only inject read/write faults into trailers, not the header.
                // This prevents the quorum from being lost like so:
                // - copy₀: B (ok)
                // - copy₁: B (torn write)
                // - copy₂: A (corrupt)
                // - copy₃: A (ok)
            } else {
                var area_faults: usize = 0;
                while (area_faults < superblock_trailer_faults_max) : (area_faults += 1) {
                    copies.set(random.uintLessThan(usize, constants.superblock_copies));
                }
            }
        }

        // A cluster-of-2 is special-cased to mirror the special case in replica.zig.
        // See repair_prepare()/on_nack_prepare().
        const quorums = vsr.quorums(replica_count);
        const faults_max = if (replica_count == 2) 1 else replica_count - quorums.replication;
        assert(faults_max < replica_count);
        assert(faults_max > 0 or replica_count == 1);

        // For each WAL header sector, pick exactly faults_max distinct replicas that may
        // corrupt it, leaving a replication quorum of intact replicas for every sector.
        var wal_header_sectors = [_]ReplicaSet{ReplicaSet.initEmpty()} ** header_sectors;
        for (wal_header_sectors) |*wal_header_sector, sector| {
            while (wal_header_sector.count() < faults_max) {
                const replica_index = random.uintLessThan(u8, replica_count);
                wal_header_sector.set(replica_index);
                atlas.faulty_wal_header_sectors[replica_index].set(sector);
            }
        }

        return atlas;
    }

    /// Returns a range of faulty sectors which intersect the specified range.
    fn faulty_superblock(
        atlas: ClusterFaultAtlas,
        replica_index: usize,
        offset_in_zone: u64,
        size: u64,
    ) ?SectorRange {
        // Superblock fault choices are shared by all replicas (the per-area copy sets are
        // cluster-wide), so the replica index is unused.
        _ = replica_index;
        if (!atlas.options.faulty_superblock) return null;

        const copy = @divFloor(offset_in_zone, superblock.superblock_copy_size);
        const offset_in_copy = offset_in_zone % superblock.superblock_copy_size;
        // The access must begin exactly at an area's base offset within the copy;
        // any other offset hits `unreachable`.
        const area: superblock.Area = switch (offset_in_copy) {
            superblock.areas.header.base => .header,
            superblock.areas.manifest.base => .manifest,
            superblock.areas.free_set.base => .free_set,
            superblock.areas.client_table.base => .client_table,
            else => unreachable,
        };

        // Fault the whole access iff this (area, copy) pair was chosen at init().
        if (atlas.faulty_superblock_areas.get(area).isSet(copy)) {
            return SectorRange.from_zone(.superblock, offset_in_zone, size);
        } else {
            return null;
        }
    }

    /// Returns a range of faulty sectors which intersect the specified range.
    fn faulty_wal_headers(
        atlas: ClusterFaultAtlas,
        replica_index: usize,
        offset_in_zone: u64,
        size: u64,
    ) ?SectorRange {
        if (!atlas.options.faulty_wal_headers) return null;
        // In the headers zone, one faulty-sector bit covers exactly one sector.
        return faulty_sectors(
            FaultyWALHeaders.bit_length,
            constants.sector_size,
            .wal_headers,
            &atlas.faulty_wal_header_sectors[replica_index],
            offset_in_zone,
            size,
        );
    }

    /// Returns a range of faulty sectors which intersect the specified range.
    fn faulty_wal_prepares(
        atlas: ClusterFaultAtlas,
        replica_index: usize,
        offset_in_zone: u64,
        size: u64,
    ) ?SectorRange {
        if (!atlas.options.faulty_wal_prepares) return null;
        // Reuses the header-sector fault bitmap: one header sector holds headers_per_sector
        // slots, and in the prepares zone those same slots span
        // headers_per_sector * message_size_max bytes — hence the larger chunk size.
        return faulty_sectors(
            FaultyWALHeaders.bit_length,
            constants.message_size_max * headers_per_sector,
            .wal_prepares,
            &atlas.faulty_wal_header_sectors[replica_index],
            offset_in_zone,
            size,
        );
    }

    /// Scan `faulty_chunks` for the first contiguous run of faulty chunks that overlaps
    /// [offset_in_zone, offset_in_zone + size), and return that run clamped to the
    /// queried range. Returns null when no faulty chunk overlaps the range.
    fn faulty_sectors(
        comptime chunk_count: usize,
        comptime chunk_size: usize,
        comptime zone: vsr.Zone,
        faulty_chunks: *const std.StaticBitSet(chunk_count),
        offset_in_zone: u64,
        size: u64,
    ) ?SectorRange {
        var fault_start: ?usize = null;
        var fault_count: usize = 0;

        var chunk: usize = @divFloor(offset_in_zone, chunk_size);
        while (chunk * chunk_size < offset_in_zone + size) : (chunk += 1) {
            if (faulty_chunks.isSet(chunk)) {
                if (fault_start == null) fault_start = chunk;
                fault_count += 1;
            } else {
                // Only the first contiguous faulty run is reported; stop at the first gap.
                if (fault_start != null) break;
            }
        }

        if (fault_start) |start| {
            // Non-null by construction: the run both starts before the end of the queried
            // range (loop condition) and ends after its start (start ≥ first chunk).
            return SectorRange.from_zone(
                zone,
                chunk_size * start,
                chunk_size * fault_count,
            ).intersect(SectorRange.from_zone(zone, offset_in_zone, size)).?;
        } else {
            return null;
        }
    }
};