tigerbeetle-node 0.9.0 → 0.9.143
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +580 -179
- package/dist/benchmark.js +44 -36
- package/dist/benchmark.js.map +1 -1
- package/dist/bin/aarch64-linux-gnu/client.node +0 -0
- package/dist/bin/aarch64-linux-musl/client.node +0 -0
- package/dist/bin/aarch64-macos/client.node +0 -0
- package/dist/bin/x86_64-linux-gnu/client.node +0 -0
- package/dist/bin/x86_64-linux-musl/client.node +0 -0
- package/dist/bin/x86_64-macos/client.node +0 -0
- package/dist/bin/x86_64-windows/client.node +0 -0
- package/dist/bindings.d.ts +141 -0
- package/dist/bindings.js +112 -0
- package/dist/bindings.js.map +1 -0
- package/dist/index.d.ts +2 -125
- package/dist/index.js +51 -101
- package/dist/index.js.map +1 -1
- package/dist/test.js +68 -54
- package/dist/test.js.map +1 -1
- package/package-lock.json +26 -0
- package/package.json +13 -22
- package/src/benchmark.ts +58 -49
- package/src/bindings.ts +631 -0
- package/src/index.ts +71 -163
- package/src/node.zig +169 -148
- package/src/test.ts +71 -57
- package/src/translate.zig +19 -36
- package/scripts/download_node_headers.sh +0 -25
- package/src/tigerbeetle/scripts/benchmark.bat +0 -46
- package/src/tigerbeetle/scripts/benchmark.sh +0 -55
- package/src/tigerbeetle/scripts/install.sh +0 -6
- package/src/tigerbeetle/scripts/install_zig.bat +0 -109
- package/src/tigerbeetle/scripts/install_zig.sh +0 -84
- package/src/tigerbeetle/scripts/lint.zig +0 -199
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -39
- package/src/tigerbeetle/scripts/vopr.bat +0 -48
- package/src/tigerbeetle/scripts/vopr.sh +0 -33
- package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
- package/src/tigerbeetle/src/benchmark.zig +0 -290
- package/src/tigerbeetle/src/cli.zig +0 -244
- package/src/tigerbeetle/src/config.zig +0 -239
- package/src/tigerbeetle/src/demo.zig +0 -125
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +0 -35
- package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +0 -7
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +0 -24
- package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +0 -61
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +0 -37
- package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +0 -24
- package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +0 -7
- package/src/tigerbeetle/src/fifo.zig +0 -104
- package/src/tigerbeetle/src/io/benchmark.zig +0 -213
- package/src/tigerbeetle/src/io/darwin.zig +0 -793
- package/src/tigerbeetle/src/io/linux.zig +0 -1038
- package/src/tigerbeetle/src/io/test.zig +0 -643
- package/src/tigerbeetle/src/io/windows.zig +0 -1161
- package/src/tigerbeetle/src/io.zig +0 -34
- package/src/tigerbeetle/src/main.zig +0 -144
- package/src/tigerbeetle/src/message_bus.zig +0 -1000
- package/src/tigerbeetle/src/message_pool.zig +0 -142
- package/src/tigerbeetle/src/ring_buffer.zig +0 -289
- package/src/tigerbeetle/src/simulator.zig +0 -417
- package/src/tigerbeetle/src/state_machine.zig +0 -2470
- package/src/tigerbeetle/src/storage.zig +0 -308
- package/src/tigerbeetle/src/test/cluster.zig +0 -351
- package/src/tigerbeetle/src/test/message_bus.zig +0 -93
- package/src/tigerbeetle/src/test/network.zig +0 -179
- package/src/tigerbeetle/src/test/packet_simulator.zig +0 -387
- package/src/tigerbeetle/src/test/state_checker.zig +0 -145
- package/src/tigerbeetle/src/test/state_machine.zig +0 -76
- package/src/tigerbeetle/src/test/storage.zig +0 -438
- package/src/tigerbeetle/src/test/time.zig +0 -84
- package/src/tigerbeetle/src/tigerbeetle.zig +0 -222
- package/src/tigerbeetle/src/time.zig +0 -113
- package/src/tigerbeetle/src/unit_tests.zig +0 -14
- package/src/tigerbeetle/src/vsr/client.zig +0 -505
- package/src/tigerbeetle/src/vsr/clock.zig +0 -812
- package/src/tigerbeetle/src/vsr/journal.zig +0 -2293
- package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
- package/src/tigerbeetle/src/vsr/replica.zig +0 -5015
- package/src/tigerbeetle/src/vsr.zig +0 -1017
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
const std = @import("std");
|
|
2
|
-
const assert = std.debug.assert;
|
|
3
|
-
const mem = std.mem;
|
|
4
|
-
|
|
5
|
-
const config = @import("../config.zig");
|
|
6
|
-
|
|
7
|
-
const Cluster = @import("cluster.zig").Cluster;
|
|
8
|
-
const Network = @import("network.zig").Network;
|
|
9
|
-
const StateMachine = @import("state_machine.zig").StateMachine;
|
|
10
|
-
|
|
11
|
-
const message_pool = @import("../message_pool.zig");
|
|
12
|
-
const MessagePool = message_pool.MessagePool;
|
|
13
|
-
const Message = MessagePool.Message;
|
|
14
|
-
|
|
15
|
-
const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
|
|
16
|
-
|
|
17
|
-
const RequestQueue = RingBuffer(u128, config.client_request_queue_max);
|
|
18
|
-
const StateTransitions = std.AutoHashMap(u128, u64);
|
|
19
|
-
|
|
20
|
-
const log = std.log.scoped(.state_checker);
|
|
21
|
-
|
|
22
|
-
pub const StateChecker = struct {
|
|
23
|
-
/// Indexed by client index as used by Cluster.
|
|
24
|
-
client_requests: [config.clients_max]RequestQueue =
|
|
25
|
-
[_]RequestQueue{.{}} ** config.clients_max,
|
|
26
|
-
|
|
27
|
-
/// Indexed by replica index.
|
|
28
|
-
state_machine_states: [config.replicas_max]u128,
|
|
29
|
-
|
|
30
|
-
history: StateTransitions,
|
|
31
|
-
|
|
32
|
-
/// The highest cannonical state reached by the cluster.
|
|
33
|
-
state: u128,
|
|
34
|
-
|
|
35
|
-
/// The number of times the cannonical state has been advanced.
|
|
36
|
-
transitions: u64 = 0,
|
|
37
|
-
|
|
38
|
-
pub fn init(allocator: mem.Allocator, cluster: *Cluster) !StateChecker {
|
|
39
|
-
const state = cluster.state_machines[0].state;
|
|
40
|
-
|
|
41
|
-
var state_machine_states: [config.replicas_max]u128 = undefined;
|
|
42
|
-
for (cluster.state_machines) |state_machine, i| {
|
|
43
|
-
assert(state_machine.state == state);
|
|
44
|
-
state_machine_states[i] = state_machine.state;
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
var history = StateTransitions.init(allocator);
|
|
48
|
-
errdefer history.deinit();
|
|
49
|
-
|
|
50
|
-
var state_checker = StateChecker{
|
|
51
|
-
.state_machine_states = state_machine_states,
|
|
52
|
-
.history = history,
|
|
53
|
-
.state = state,
|
|
54
|
-
};
|
|
55
|
-
|
|
56
|
-
try state_checker.history.putNoClobber(state, state_checker.transitions);
|
|
57
|
-
|
|
58
|
-
return state_checker;
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
pub fn deinit(state_checker: *StateChecker) void {
|
|
62
|
-
state_checker.history.deinit();
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
pub fn check_state(state_checker: *StateChecker, replica: u8) void {
|
|
66
|
-
const cluster = @fieldParentPtr(Cluster, "state_checker", state_checker);
|
|
67
|
-
|
|
68
|
-
const a = state_checker.state_machine_states[replica];
|
|
69
|
-
const b = cluster.state_machines[replica].state;
|
|
70
|
-
|
|
71
|
-
if (b == a) return;
|
|
72
|
-
state_checker.state_machine_states[replica] = b;
|
|
73
|
-
|
|
74
|
-
// If some other replica has already reached this state, then it will be in the history:
|
|
75
|
-
if (state_checker.history.get(b)) |transition| {
|
|
76
|
-
// A replica may transition more than once to the same state, for example, when
|
|
77
|
-
// restarting after a crash and replaying the log. The more important invariant is that
|
|
78
|
-
// the cluster as a whole may not transition to the same state more than once, and once
|
|
79
|
-
// transitioned may not regress.
|
|
80
|
-
log.info(
|
|
81
|
-
"{d:0>4}/{d:0>4} {x:0>32} > {x:0>32} {}",
|
|
82
|
-
.{ transition, state_checker.transitions, a, b, replica },
|
|
83
|
-
);
|
|
84
|
-
return;
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
// The replica has transitioned to state `b` that is not yet in the history.
|
|
88
|
-
// Check if this is a valid new state based on all currently inflight client requests.
|
|
89
|
-
for (state_checker.client_requests) |*queue| {
|
|
90
|
-
if (queue.head_ptr()) |input| {
|
|
91
|
-
if (b == StateMachine.hash(state_checker.state, std.mem.asBytes(input))) {
|
|
92
|
-
const transitions_executed = state_checker.history.get(a).?;
|
|
93
|
-
if (transitions_executed < state_checker.transitions) {
|
|
94
|
-
@panic("replica skipped interim transitions");
|
|
95
|
-
} else {
|
|
96
|
-
assert(transitions_executed == state_checker.transitions);
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
state_checker.state = b;
|
|
100
|
-
state_checker.transitions += 1;
|
|
101
|
-
|
|
102
|
-
log.info(" {d:0>4} {x:0>32} > {x:0>32} {}", .{
|
|
103
|
-
state_checker.transitions,
|
|
104
|
-
a,
|
|
105
|
-
b,
|
|
106
|
-
replica,
|
|
107
|
-
});
|
|
108
|
-
|
|
109
|
-
state_checker.history.putNoClobber(b, state_checker.transitions) catch {
|
|
110
|
-
@panic("state checker unable to allocate memory for history.put()");
|
|
111
|
-
};
|
|
112
|
-
|
|
113
|
-
// As soon as we reach a valid state we must pop the inflight request.
|
|
114
|
-
// We cannot wait until the client receives the reply because that would allow
|
|
115
|
-
// the inflight request to be used to reach other states in the interim.
|
|
116
|
-
// We must therefore use our own queue rather than the clients' queues.
|
|
117
|
-
_ = queue.pop();
|
|
118
|
-
return;
|
|
119
|
-
}
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
@panic("replica transitioned to an invalid state");
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
pub fn convergence(state_checker: *StateChecker) bool {
|
|
127
|
-
const cluster = @fieldParentPtr(Cluster, "state_checker", state_checker);
|
|
128
|
-
|
|
129
|
-
const a = state_checker.state_machine_states[0];
|
|
130
|
-
for (state_checker.state_machine_states[1..cluster.options.replica_count]) |b| {
|
|
131
|
-
if (b != a) return false;
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
const transitions_executed = state_checker.history.get(a).?;
|
|
135
|
-
if (transitions_executed < state_checker.transitions) {
|
|
136
|
-
// Cluster reached convergence but on a regressed state.
|
|
137
|
-
// A replica reached the transition limit, crashed, then repaired.
|
|
138
|
-
return false;
|
|
139
|
-
} else {
|
|
140
|
-
assert(transitions_executed == state_checker.transitions);
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
return true;
|
|
144
|
-
}
|
|
145
|
-
};
|
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
const std = @import("std");
|
|
2
|
-
const assert = std.debug.assert;
|
|
3
|
-
|
|
4
|
-
const log = std.log.scoped(.state_machine);
|
|
5
|
-
|
|
6
|
-
pub const StateMachine = struct {
|
|
7
|
-
pub const Operation = enum(u8) {
|
|
8
|
-
/// Operations reserved by VR protocol (for all state machines):
|
|
9
|
-
reserved,
|
|
10
|
-
root,
|
|
11
|
-
register,
|
|
12
|
-
|
|
13
|
-
hash,
|
|
14
|
-
};
|
|
15
|
-
|
|
16
|
-
state: u128,
|
|
17
|
-
prepare_timestamp: u64 = 0,
|
|
18
|
-
commit_timestamp: u64 = 0,
|
|
19
|
-
|
|
20
|
-
pub fn init(seed: u64) StateMachine {
|
|
21
|
-
return .{ .state = hash(0, std.mem.asBytes(&seed)) };
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
pub fn prepare(
|
|
25
|
-
state_machine: *StateMachine,
|
|
26
|
-
operation: Operation,
|
|
27
|
-
input: []u8,
|
|
28
|
-
) u64 {
|
|
29
|
-
_ = operation;
|
|
30
|
-
_ = input;
|
|
31
|
-
|
|
32
|
-
return state_machine.prepare_timestamp;
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
pub fn commit(
|
|
36
|
-
state_machine: *StateMachine,
|
|
37
|
-
client: u128,
|
|
38
|
-
operation: Operation,
|
|
39
|
-
input: []const u8,
|
|
40
|
-
output: []u8,
|
|
41
|
-
) usize {
|
|
42
|
-
switch (operation) {
|
|
43
|
-
.reserved, .root => unreachable,
|
|
44
|
-
.register => return 0,
|
|
45
|
-
|
|
46
|
-
// TODO: instead of always using the first 32 bytes of the output
|
|
47
|
-
// buffer, get tricky and use a random but deterministic slice
|
|
48
|
-
// of it, filling the rest with 0s.
|
|
49
|
-
.hash => {
|
|
50
|
-
// Fold the input into our current state, creating a hash chain.
|
|
51
|
-
// Hash the input with the client ID since small inputs may collide across clients.
|
|
52
|
-
const client_input = hash(client, input);
|
|
53
|
-
const new_state = hash(state_machine.state, std.mem.asBytes(&client_input));
|
|
54
|
-
|
|
55
|
-
log.debug("state={x} input={x} input.len={} new state={x}", .{
|
|
56
|
-
state_machine.state,
|
|
57
|
-
client_input,
|
|
58
|
-
input.len,
|
|
59
|
-
new_state,
|
|
60
|
-
});
|
|
61
|
-
|
|
62
|
-
state_machine.state = new_state;
|
|
63
|
-
std.mem.copy(u8, output, std.mem.asBytes(&state_machine.state));
|
|
64
|
-
return @sizeOf(@TypeOf(state_machine.state));
|
|
65
|
-
},
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
pub fn hash(state: u128, input: []const u8) u128 {
|
|
70
|
-
var key: [32]u8 = [_]u8{0} ** 32;
|
|
71
|
-
std.mem.copy(u8, key[0..16], std.mem.asBytes(&state));
|
|
72
|
-
var target: [32]u8 = undefined;
|
|
73
|
-
std.crypto.hash.Blake3.hash(input, &target, .{ .key = key });
|
|
74
|
-
return @bitCast(u128, target[0..16].*);
|
|
75
|
-
}
|
|
76
|
-
};
|
|
@@ -1,438 +0,0 @@
|
|
|
1
|
-
const std = @import("std");
|
|
2
|
-
const assert = std.debug.assert;
|
|
3
|
-
const math = std.math;
|
|
4
|
-
const mem = std.mem;
|
|
5
|
-
|
|
6
|
-
const config = @import("../config.zig");
|
|
7
|
-
const vsr = @import("../vsr.zig");
|
|
8
|
-
|
|
9
|
-
const log = std.log.scoped(.storage);
|
|
10
|
-
|
|
11
|
-
// TODOs:
|
|
12
|
-
// less than a majority of replicas may have corruption
|
|
13
|
-
// have an option to enable/disable the following corruption types:
|
|
14
|
-
// bitrot
|
|
15
|
-
// misdirected read/write
|
|
16
|
-
// corrupt sector
|
|
17
|
-
// latent sector error
|
|
18
|
-
// - emulate by zeroing sector, as this is how we handle this in the real Storage implementation
|
|
19
|
-
// - likely that surrounding sectors also corrupt
|
|
20
|
-
// - likely that stuff written at the same time is also corrupt even if written to a far away sector
|
|
21
|
-
pub const Storage = struct {
|
|
22
|
-
/// Options for fault injection during fuzz testing
|
|
23
|
-
pub const Options = struct {
|
|
24
|
-
/// Seed for the storage PRNG
|
|
25
|
-
seed: u64,
|
|
26
|
-
|
|
27
|
-
/// Minimum number of ticks it may take to read data.
|
|
28
|
-
read_latency_min: u64,
|
|
29
|
-
/// Average number of ticks it may take to read data. Must be >= read_latency_min.
|
|
30
|
-
read_latency_mean: u64,
|
|
31
|
-
/// Minimum number of ticks it may take to write data.
|
|
32
|
-
write_latency_min: u64,
|
|
33
|
-
/// Average number of ticks it may take to write data. Must be >= write_latency_min.
|
|
34
|
-
write_latency_mean: u64,
|
|
35
|
-
|
|
36
|
-
/// Chance out of 100 that a read will return incorrect data, if the target memory is within
|
|
37
|
-
/// the faulty area of this replica.
|
|
38
|
-
read_fault_probability: u8,
|
|
39
|
-
/// Chance out of 100 that a read will return incorrect data, if the target memory is within
|
|
40
|
-
/// the faulty area of this replica.
|
|
41
|
-
write_fault_probability: u8,
|
|
42
|
-
};
|
|
43
|
-
|
|
44
|
-
/// See usage in Journal.write_sectors() for details.
|
|
45
|
-
/// TODO: allow testing in both modes.
|
|
46
|
-
pub const synchronicity: enum {
|
|
47
|
-
always_synchronous,
|
|
48
|
-
always_asynchronous,
|
|
49
|
-
} = .always_asynchronous;
|
|
50
|
-
|
|
51
|
-
pub const Read = struct {
|
|
52
|
-
callback: fn (read: *Storage.Read) void,
|
|
53
|
-
buffer: []u8,
|
|
54
|
-
offset: u64,
|
|
55
|
-
/// Tick at which this read is considered "completed" and the callback should be called.
|
|
56
|
-
done_at_tick: u64,
|
|
57
|
-
|
|
58
|
-
fn less_than(context: void, a: *Read, b: *Read) math.Order {
|
|
59
|
-
_ = context;
|
|
60
|
-
|
|
61
|
-
return math.order(a.done_at_tick, b.done_at_tick);
|
|
62
|
-
}
|
|
63
|
-
};
|
|
64
|
-
|
|
65
|
-
pub const Write = struct {
|
|
66
|
-
callback: fn (write: *Storage.Write) void,
|
|
67
|
-
buffer: []const u8,
|
|
68
|
-
offset: u64,
|
|
69
|
-
/// Tick at which this write is considered "completed" and the callback should be called.
|
|
70
|
-
done_at_tick: u64,
|
|
71
|
-
|
|
72
|
-
fn less_than(context: void, a: *Write, b: *Write) math.Order {
|
|
73
|
-
_ = context;
|
|
74
|
-
|
|
75
|
-
return math.order(a.done_at_tick, b.done_at_tick);
|
|
76
|
-
}
|
|
77
|
-
};
|
|
78
|
-
|
|
79
|
-
/// Faulty areas are always sized to message_size_max
|
|
80
|
-
/// If the faulty areas of all replicas are superimposed, the padding between them is always message_size_max.
|
|
81
|
-
/// For a single replica, the padding between faulty areas depends on the number of other replicas.
|
|
82
|
-
pub const FaultyAreas = struct {
|
|
83
|
-
first_offset: u64,
|
|
84
|
-
period: u64,
|
|
85
|
-
};
|
|
86
|
-
|
|
87
|
-
memory: []align(config.sector_size) u8,
|
|
88
|
-
size: u64,
|
|
89
|
-
/// Set bits correspond to faulty sectors. The underlying sectors of `memory` is left clean.
|
|
90
|
-
faults: std.DynamicBitSetUnmanaged,
|
|
91
|
-
|
|
92
|
-
options: Options,
|
|
93
|
-
replica_index: u8,
|
|
94
|
-
prng: std.rand.DefaultPrng,
|
|
95
|
-
|
|
96
|
-
// We can't allow storage faults for the same message in a majority of
|
|
97
|
-
// the replicas as that would make recovery impossible. Instead, we only
|
|
98
|
-
// allow faults in certain areas which differ between replicas.
|
|
99
|
-
faulty_areas: FaultyAreas,
|
|
100
|
-
/// Whether to enable faults (when false, this supersedes `faulty_areas`).
|
|
101
|
-
/// This is used to disable faults during the replica's first startup.
|
|
102
|
-
faulty: bool = true,
|
|
103
|
-
|
|
104
|
-
reads: std.PriorityQueue(*Storage.Read, void, Storage.Read.less_than),
|
|
105
|
-
writes: std.PriorityQueue(*Storage.Write, void, Storage.Write.less_than),
|
|
106
|
-
|
|
107
|
-
ticks: u64 = 0,
|
|
108
|
-
|
|
109
|
-
pub fn init(
|
|
110
|
-
allocator: mem.Allocator,
|
|
111
|
-
size: u64,
|
|
112
|
-
options: Storage.Options,
|
|
113
|
-
replica_index: u8,
|
|
114
|
-
faulty_areas: FaultyAreas,
|
|
115
|
-
) !Storage {
|
|
116
|
-
assert(options.write_latency_mean >= options.write_latency_min);
|
|
117
|
-
assert(options.read_latency_mean >= options.read_latency_min);
|
|
118
|
-
|
|
119
|
-
const memory = try allocator.allocAdvanced(u8, config.sector_size, size, .exact);
|
|
120
|
-
errdefer allocator.free(memory);
|
|
121
|
-
// TODO: random data
|
|
122
|
-
mem.set(u8, memory, 0);
|
|
123
|
-
|
|
124
|
-
var faults = try std.DynamicBitSetUnmanaged.initEmpty(
|
|
125
|
-
allocator,
|
|
126
|
-
@divExact(size, config.sector_size),
|
|
127
|
-
);
|
|
128
|
-
errdefer faults.deinit(allocator);
|
|
129
|
-
|
|
130
|
-
var reads = std.PriorityQueue(*Storage.Read, void, Storage.Read.less_than).init(allocator, {});
|
|
131
|
-
errdefer reads.deinit();
|
|
132
|
-
try reads.ensureTotalCapacity(config.io_depth_read);
|
|
133
|
-
|
|
134
|
-
var writes = std.PriorityQueue(*Storage.Write, void, Storage.Write.less_than).init(allocator, {});
|
|
135
|
-
errdefer writes.deinit();
|
|
136
|
-
try writes.ensureTotalCapacity(config.io_depth_write);
|
|
137
|
-
|
|
138
|
-
return Storage{
|
|
139
|
-
.memory = memory,
|
|
140
|
-
.size = size,
|
|
141
|
-
.faults = faults,
|
|
142
|
-
.options = options,
|
|
143
|
-
.replica_index = replica_index,
|
|
144
|
-
.prng = std.rand.DefaultPrng.init(options.seed),
|
|
145
|
-
.faulty_areas = faulty_areas,
|
|
146
|
-
.reads = reads,
|
|
147
|
-
.writes = writes,
|
|
148
|
-
};
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
/// Cancel any currently in progress reads/writes but leave the stored data untouched.
|
|
152
|
-
pub fn reset(storage: *Storage) void {
|
|
153
|
-
while (storage.writes.peek()) |write| {
|
|
154
|
-
_ = storage.writes.remove();
|
|
155
|
-
storage.fault_sectors(write.offset, write.buffer.len);
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
storage.reads.len = 0;
|
|
159
|
-
assert(storage.writes.len == 0);
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
pub fn deinit(storage: *Storage, allocator: mem.Allocator) void {
|
|
163
|
-
allocator.free(storage.memory);
|
|
164
|
-
storage.faults.deinit(allocator);
|
|
165
|
-
storage.reads.deinit();
|
|
166
|
-
storage.writes.deinit();
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
pub fn tick(storage: *Storage) void {
|
|
170
|
-
storage.ticks += 1;
|
|
171
|
-
|
|
172
|
-
while (storage.reads.peek()) |read| {
|
|
173
|
-
if (read.done_at_tick > storage.ticks) break;
|
|
174
|
-
_ = storage.reads.remove();
|
|
175
|
-
storage.read_sectors_finish(read);
|
|
176
|
-
}
|
|
177
|
-
|
|
178
|
-
while (storage.writes.peek()) |write| {
|
|
179
|
-
if (write.done_at_tick > storage.ticks) break;
|
|
180
|
-
_ = storage.writes.remove();
|
|
181
|
-
storage.write_sectors_finish(write);
|
|
182
|
-
}
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
pub fn read_sectors(
|
|
186
|
-
storage: *Storage,
|
|
187
|
-
callback: fn (read: *Storage.Read) void,
|
|
188
|
-
read: *Storage.Read,
|
|
189
|
-
buffer: []u8,
|
|
190
|
-
offset: u64,
|
|
191
|
-
) void {
|
|
192
|
-
storage.assert_bounds_and_alignment(buffer, offset);
|
|
193
|
-
|
|
194
|
-
read.* = .{
|
|
195
|
-
.callback = callback,
|
|
196
|
-
.buffer = buffer,
|
|
197
|
-
.offset = offset,
|
|
198
|
-
.done_at_tick = storage.ticks + storage.read_latency(),
|
|
199
|
-
};
|
|
200
|
-
|
|
201
|
-
// We ensure the capacity is sufficient for config.io_depth_read in init()
|
|
202
|
-
storage.reads.add(read) catch unreachable;
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
fn read_sectors_finish(storage: *Storage, read: *Storage.Read) void {
|
|
206
|
-
mem.copy(u8, read.buffer, storage.memory[read.offset..][0..read.buffer.len]);
|
|
207
|
-
|
|
208
|
-
if (storage.x_in_100(storage.options.read_fault_probability)) {
|
|
209
|
-
storage.fault_sectors(read.offset, read.buffer.len);
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
if (storage.faulty) {
|
|
213
|
-
// Corrupt faulty sectors.
|
|
214
|
-
const sector_min = @divExact(read.offset, config.sector_size);
|
|
215
|
-
var sector: usize = 0;
|
|
216
|
-
while (sector < @divExact(read.buffer.len, config.sector_size)) : (sector += 1) {
|
|
217
|
-
if (storage.faults.isSet(sector_min + sector)) {
|
|
218
|
-
const faulty_sector_offset = sector * config.sector_size;
|
|
219
|
-
const faulty_sector_bytes = read.buffer[faulty_sector_offset..][0..config.sector_size];
|
|
220
|
-
storage.prng.random().bytes(faulty_sector_bytes);
|
|
221
|
-
}
|
|
222
|
-
}
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
read.callback(read);
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
pub fn write_sectors(
|
|
229
|
-
storage: *Storage,
|
|
230
|
-
callback: fn (write: *Storage.Write) void,
|
|
231
|
-
write: *Storage.Write,
|
|
232
|
-
buffer: []const u8,
|
|
233
|
-
offset: u64,
|
|
234
|
-
) void {
|
|
235
|
-
storage.assert_bounds_and_alignment(buffer, offset);
|
|
236
|
-
|
|
237
|
-
// Verify that there are no concurrent overlapping writes.
|
|
238
|
-
var iterator = storage.writes.iterator();
|
|
239
|
-
while (iterator.next()) |other| {
|
|
240
|
-
assert(offset + buffer.len <= other.offset or
|
|
241
|
-
other.offset + other.buffer.len <= offset);
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
write.* = .{
|
|
245
|
-
.callback = callback,
|
|
246
|
-
.buffer = buffer,
|
|
247
|
-
.offset = offset,
|
|
248
|
-
.done_at_tick = storage.ticks + storage.write_latency(),
|
|
249
|
-
};
|
|
250
|
-
|
|
251
|
-
// We ensure the capacity is sufficient for config.io_depth_write in init()
|
|
252
|
-
storage.writes.add(write) catch unreachable;
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
fn write_sectors_finish(storage: *Storage, write: *Storage.Write) void {
|
|
256
|
-
mem.copy(u8, storage.memory[write.offset..][0..write.buffer.len], write.buffer);
|
|
257
|
-
|
|
258
|
-
{
|
|
259
|
-
const sector_min = @divExact(write.offset, config.sector_size);
|
|
260
|
-
const sector_max = @divExact(write.offset + write.buffer.len, config.sector_size);
|
|
261
|
-
var sector: usize = sector_min;
|
|
262
|
-
while (sector < sector_max) : (sector += 1) storage.faults.unset(sector);
|
|
263
|
-
}
|
|
264
|
-
|
|
265
|
-
if (storage.x_in_100(storage.options.write_fault_probability)) {
|
|
266
|
-
storage.fault_sectors(write.offset, write.buffer.len);
|
|
267
|
-
}
|
|
268
|
-
write.callback(write);
|
|
269
|
-
}
|
|
270
|
-
|
|
271
|
-
fn assert_bounds_and_alignment(storage: *const Storage, buffer: []const u8, offset: u64) void {
|
|
272
|
-
assert(buffer.len > 0);
|
|
273
|
-
assert(offset + buffer.len <= storage.size);
|
|
274
|
-
|
|
275
|
-
// Ensure that the read or write is aligned correctly for Direct I/O:
|
|
276
|
-
// If this is not the case, the underlying syscall will return EINVAL.
|
|
277
|
-
assert(@mod(@ptrToInt(buffer.ptr), config.sector_size) == 0);
|
|
278
|
-
assert(@mod(buffer.len, config.sector_size) == 0);
|
|
279
|
-
assert(@mod(offset, config.sector_size) == 0);
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
fn read_latency(storage: *Storage) u64 {
|
|
283
|
-
return storage.latency(storage.options.read_latency_min, storage.options.read_latency_mean);
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
fn write_latency(storage: *Storage) u64 {
|
|
287
|
-
return storage.latency(storage.options.write_latency_min, storage.options.write_latency_mean);
|
|
288
|
-
}
|
|
289
|
-
|
|
290
|
-
fn latency(storage: *Storage, min: u64, mean: u64) u64 {
|
|
291
|
-
return min + @floatToInt(u64, @intToFloat(f64, mean - min) * storage.prng.random().floatExp(f64));
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
/// Return true with probability x/100.
|
|
295
|
-
fn x_in_100(storage: *Storage, x: u8) bool {
|
|
296
|
-
assert(x <= 100);
|
|
297
|
-
return x > storage.prng.random().uintLessThan(u8, 100);
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
fn random_uint_between(storage: *Storage, comptime T: type, min: T, max: T) T {
|
|
301
|
-
return min + storage.prng.random().uintLessThan(T, max - min);
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
/// The return value is a slice into the provided out array.
|
|
305
|
-
pub fn generate_faulty_areas(
|
|
306
|
-
prng: std.rand.Random,
|
|
307
|
-
size: u64,
|
|
308
|
-
replica_count: u8,
|
|
309
|
-
out: *[config.replicas_max]FaultyAreas,
|
|
310
|
-
) []FaultyAreas {
|
|
311
|
-
comptime assert(config.message_size_max % config.sector_size == 0);
|
|
312
|
-
const message_size_max = config.message_size_max;
|
|
313
|
-
|
|
314
|
-
// We need to ensure there is message_size_max fault-free padding
|
|
315
|
-
// between faulty areas of memory so that a single message
|
|
316
|
-
// cannot straddle the corruptable areas of a majority of replicas.
|
|
317
|
-
comptime assert(config.replicas_max == 6);
|
|
318
|
-
switch (replica_count) {
|
|
319
|
-
1 => {
|
|
320
|
-
// If there is only one replica in the cluster, storage faults are not recoverable.
|
|
321
|
-
out[0] = .{ .first_offset = size, .period = 1 };
|
|
322
|
-
},
|
|
323
|
-
2 => {
|
|
324
|
-
// 0123456789
|
|
325
|
-
// 0X X X
|
|
326
|
-
// 1 X X X
|
|
327
|
-
out[0] = .{ .first_offset = 0 * message_size_max, .period = 4 * message_size_max };
|
|
328
|
-
out[1] = .{ .first_offset = 2 * message_size_max, .period = 4 * message_size_max };
|
|
329
|
-
},
|
|
330
|
-
3 => {
|
|
331
|
-
// 0123456789
|
|
332
|
-
// 0X X
|
|
333
|
-
// 1 X X
|
|
334
|
-
// 2 X X
|
|
335
|
-
out[0] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
|
|
336
|
-
out[1] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
|
|
337
|
-
out[2] = .{ .first_offset = 4 * message_size_max, .period = 6 * message_size_max };
|
|
338
|
-
},
|
|
339
|
-
4 => {
|
|
340
|
-
// 0123456789
|
|
341
|
-
// 0X X X
|
|
342
|
-
// 1X X X
|
|
343
|
-
// 2 X X X
|
|
344
|
-
// 3 X X X
|
|
345
|
-
out[0] = .{ .first_offset = 0 * message_size_max, .period = 4 * message_size_max };
|
|
346
|
-
out[1] = .{ .first_offset = 0 * message_size_max, .period = 4 * message_size_max };
|
|
347
|
-
out[2] = .{ .first_offset = 2 * message_size_max, .period = 4 * message_size_max };
|
|
348
|
-
out[3] = .{ .first_offset = 2 * message_size_max, .period = 4 * message_size_max };
|
|
349
|
-
},
|
|
350
|
-
5 => {
|
|
351
|
-
// 0123456789
|
|
352
|
-
// 0X X
|
|
353
|
-
// 1X X
|
|
354
|
-
// 2 X X
|
|
355
|
-
// 3 X X
|
|
356
|
-
// 4 X X
|
|
357
|
-
out[0] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
|
|
358
|
-
out[1] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
|
|
359
|
-
out[2] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
|
|
360
|
-
out[3] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
|
|
361
|
-
out[4] = .{ .first_offset = 4 * message_size_max, .period = 6 * message_size_max };
|
|
362
|
-
},
|
|
363
|
-
6 => {
|
|
364
|
-
// 0123456789
|
|
365
|
-
// 0X X
|
|
366
|
-
// 1X X
|
|
367
|
-
// 2 X X
|
|
368
|
-
// 3 X X
|
|
369
|
-
// 4 X X
|
|
370
|
-
// 5 X X
|
|
371
|
-
out[0] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
|
|
372
|
-
out[1] = .{ .first_offset = 0 * message_size_max, .period = 6 * message_size_max };
|
|
373
|
-
out[2] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
|
|
374
|
-
out[3] = .{ .first_offset = 2 * message_size_max, .period = 6 * message_size_max };
|
|
375
|
-
out[4] = .{ .first_offset = 4 * message_size_max, .period = 6 * message_size_max };
|
|
376
|
-
out[5] = .{ .first_offset = 4 * message_size_max, .period = 6 * message_size_max };
|
|
377
|
-
},
|
|
378
|
-
else => unreachable,
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
{
|
|
382
|
-
// Allow at most `f` faulty replicas to ensure the view change can succeed.
|
|
383
|
-
// TODO Allow more than `f` faulty replicas when the fault is to the right of the
|
|
384
|
-
// highest known replica.op (and to the left of the last checkpointed op).
|
|
385
|
-
const majority = @divFloor(replica_count, 2) + 1;
|
|
386
|
-
const quorum_replication = std.math.min(config.quorum_replication_max, majority);
|
|
387
|
-
const quorum_view_change = std.math.max(
|
|
388
|
-
replica_count - quorum_replication + 1,
|
|
389
|
-
majority,
|
|
390
|
-
);
|
|
391
|
-
var i: usize = quorum_view_change;
|
|
392
|
-
while (i < replica_count) : (i += 1) {
|
|
393
|
-
out[i].first_offset = size;
|
|
394
|
-
}
|
|
395
|
-
}
|
|
396
|
-
|
|
397
|
-
prng.shuffle(FaultyAreas, out[0..replica_count]);
|
|
398
|
-
return out[0..replica_count];
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
const SectorRange = struct {
|
|
402
|
-
min: usize, // inclusive sector index
|
|
403
|
-
max: usize, // exclusive sector index
|
|
404
|
-
};
|
|
405
|
-
|
|
406
|
-
/// Given an offset and size of a read/write, returns the range of any faulty sectors touched
|
|
407
|
-
/// by the read/write.
|
|
408
|
-
fn faulty_sectors(storage: *const Storage, offset: u64, size: u64) ?SectorRange {
|
|
409
|
-
assert(size <= config.message_size_max);
|
|
410
|
-
const message_size_max = config.message_size_max;
|
|
411
|
-
const period = storage.faulty_areas.period;
|
|
412
|
-
|
|
413
|
-
const faulty_offset = storage.faulty_areas.first_offset + (offset / period) * period;
|
|
414
|
-
|
|
415
|
-
const start = std.math.max(offset, faulty_offset);
|
|
416
|
-
const end = std.math.min(offset + size, faulty_offset + message_size_max);
|
|
417
|
-
|
|
418
|
-
// The read/write does not touch any faulty sectors.
|
|
419
|
-
if (start >= end) return null;
|
|
420
|
-
|
|
421
|
-
return SectorRange{
|
|
422
|
-
.min = @divExact(start, config.sector_size),
|
|
423
|
-
.max = @divExact(end, config.sector_size),
|
|
424
|
-
};
|
|
425
|
-
}
|
|
426
|
-
|
|
427
|
-
fn fault_sectors(storage: *Storage, offset: u64, size: u64) void {
|
|
428
|
-
const faulty = storage.faulty_sectors(offset, size) orelse return;
|
|
429
|
-
// Randomly corrupt one of the faulty sectors the operation targeted.
|
|
430
|
-
// TODO: inject more realistic and varied storage faults as described above.
|
|
431
|
-
const faulty_sector = storage.random_uint_between(usize, faulty.min, faulty.max);
|
|
432
|
-
log.info("corrupting sector {} by replica {}", .{
|
|
433
|
-
faulty_sector,
|
|
434
|
-
storage.replica_index,
|
|
435
|
-
});
|
|
436
|
-
storage.faults.set(faulty_sector);
|
|
437
|
-
}
|
|
438
|
-
};
|