tigerbeetle-node 0.11.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -3
- package/src/tigerbeetle/scripts/fuzz_loop.sh +1 -1
- package/src/tigerbeetle/scripts/pre-commit.sh +2 -2
- package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
- package/src/tigerbeetle/src/benchmark.zig +25 -11
- package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
- package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
- package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -256
- package/src/tigerbeetle/src/c/tb_client.h +18 -4
- package/src/tigerbeetle/src/c/tb_client.zig +88 -26
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
- package/src/tigerbeetle/src/c/test.zig +371 -1
- package/src/tigerbeetle/src/cli.zig +36 -6
- package/src/tigerbeetle/src/config.zig +10 -1
- package/src/tigerbeetle/src/demo.zig +2 -1
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
- package/src/tigerbeetle/src/ewah.zig +11 -33
- package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
- package/src/tigerbeetle/src/lsm/README.md +97 -3
- package/src/tigerbeetle/src/lsm/compaction.zig +32 -7
- package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +34 -32
- package/src/tigerbeetle/src/lsm/grid.zig +39 -21
- package/src/tigerbeetle/src/lsm/groove.zig +1 -0
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +3 -3
- package/src/tigerbeetle/src/lsm/level_iterator.zig +1 -1
- package/src/tigerbeetle/src/lsm/manifest.zig +13 -0
- package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -49
- package/src/tigerbeetle/src/lsm/manifest_log.zig +173 -335
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +1 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +24 -15
- package/src/tigerbeetle/src/lsm/table.zig +32 -20
- package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
- package/src/tigerbeetle/src/lsm/table_iterator.zig +4 -5
- package/src/tigerbeetle/src/lsm/test.zig +13 -2
- package/src/tigerbeetle/src/lsm/tree.zig +45 -7
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +36 -32
- package/src/tigerbeetle/src/main.zig +55 -2
- package/src/tigerbeetle/src/message_bus.zig +18 -7
- package/src/tigerbeetle/src/message_pool.zig +8 -2
- package/src/tigerbeetle/src/ring_buffer.zig +7 -3
- package/src/tigerbeetle/src/simulator.zig +38 -11
- package/src/tigerbeetle/src/state_machine.zig +47 -22
- package/src/tigerbeetle/src/test/accounting/workload.zig +9 -5
- package/src/tigerbeetle/src/test/cluster.zig +15 -33
- package/src/tigerbeetle/src/test/conductor.zig +2 -1
- package/src/tigerbeetle/src/test/network.zig +45 -19
- package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
- package/src/tigerbeetle/src/test/state_checker.zig +5 -7
- package/src/tigerbeetle/src/test/storage.zig +453 -110
- package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
- package/src/tigerbeetle/src/tigerbeetle.zig +1 -0
- package/src/tigerbeetle/src/unit_tests.zig +6 -1
- package/src/tigerbeetle/src/util.zig +97 -11
- package/src/tigerbeetle/src/vopr.zig +2 -1
- package/src/tigerbeetle/src/vsr/client.zig +8 -3
- package/src/tigerbeetle/src/vsr/journal.zig +280 -202
- package/src/tigerbeetle/src/vsr/replica.zig +169 -31
- package/src/tigerbeetle/src/vsr/superblock.zig +356 -629
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -6
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +414 -151
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +44 -9
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
- package/src/tigerbeetle/src/vsr.zig +19 -5
- package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
- package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
- package/src/tigerbeetle/src/vopr_hub/README.md +0 -58
- package/src/tigerbeetle/src/vopr_hub/SETUP.md +0 -199
- package/src/tigerbeetle/src/vopr_hub/go.mod +0 -3
- package/src/tigerbeetle/src/vopr_hub/main.go +0 -1022
- package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +0 -3
- package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +0 -403
|
@@ -35,15 +35,16 @@ pub fn main() !void {
|
|
|
35
35
|
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
|
|
36
36
|
defer arena.deinit();
|
|
37
37
|
|
|
38
|
+
const allocator = arena.allocator();
|
|
38
39
|
var i: usize = 0;
|
|
39
40
|
var bitsets: [samples][]usize = undefined;
|
|
40
41
|
var bitsets_encoded: [samples][]align(@alignOf(usize)) u8 = undefined;
|
|
41
42
|
var bitsets_decoded: [samples][]usize = undefined;
|
|
42
43
|
var bitset_lengths: [samples]usize = undefined;
|
|
43
44
|
while (i < samples) : (i += 1) {
|
|
44
|
-
bitsets[i] = try make_bitset(
|
|
45
|
-
bitsets_encoded[i] = try
|
|
46
|
-
bitsets_decoded[i] = try
|
|
45
|
+
bitsets[i] = try make_bitset(allocator, config);
|
|
46
|
+
bitsets_encoded[i] = try allocator.alignedAlloc(u8, @alignOf(usize), ewah.encode_size_max(bitsets[0].len));
|
|
47
|
+
bitsets_decoded[i] = try allocator.alloc(usize, config.words);
|
|
47
48
|
}
|
|
48
49
|
|
|
49
50
|
// Benchmark encoding.
|
|
@@ -96,16 +97,14 @@ pub fn main() !void {
|
|
|
96
97
|
}
|
|
97
98
|
}
|
|
98
99
|
|
|
99
|
-
fn make_bitset(allocator:
|
|
100
|
+
fn make_bitset(allocator: std.mem.Allocator, config: BitSetConfig) ![]usize {
|
|
100
101
|
var words = try allocator.alloc(usize, config.words);
|
|
101
102
|
var w: usize = 0;
|
|
102
|
-
var run: bool = true;
|
|
103
103
|
var literal: usize = 1;
|
|
104
104
|
while (w < words.len) : (w += 1) {
|
|
105
|
-
const
|
|
106
|
-
const
|
|
107
|
-
const
|
|
108
|
-
const run_bit = prng.random.boolean();
|
|
105
|
+
const run_length = prng.random().uintLessThan(usize, 2 * config.run_length_e);
|
|
106
|
+
const literals_length = prng.random().uintLessThan(usize, 2 * config.literals_length_e);
|
|
107
|
+
const run_bit = prng.random().boolean();
|
|
109
108
|
|
|
110
109
|
const run_end = std.math.min(w + run_length, words.len);
|
|
111
110
|
while (w < run_end) : (w += 1) {
|
|
@@ -25,7 +25,7 @@ A tree is a hierarchy of in-memory and on-disk tables. There are three categorie
|
|
|
25
25
|
- The mutable table's contents are periodically moved to the immutable table,
|
|
26
26
|
where they are stored while being flushed to level `0`.
|
|
27
27
|
- Level `0` … level `config.lsm_levels - 1` each contain an exponentially increasing number of
|
|
28
|
-
on-disk tables.
|
|
28
|
+
immutable on-disk tables.
|
|
29
29
|
- Each tree has as many as `config.lsm_growth_factor ^ (level + 1)` tables per level.
|
|
30
30
|
(`config.lsm_growth_factor` is typically 8).
|
|
31
31
|
- Within a given level and snapshot, the tables' key ranges are [disjoint](manifest_level.zig).
|
|
@@ -47,7 +47,7 @@ The first half of the bar compacts even levels while the latter compacts odd lev
|
|
|
47
47
|
Mutable table changes are sorted and compacted into the immutable table.
|
|
48
48
|
The immutable table is compacted into level 0 during the odd level half of the bar.
|
|
49
49
|
|
|
50
|
-
At any given point, there are at most
|
|
50
|
+
At any given point, there are at most `⌈levels/2⌉` compactions running concurrently.
|
|
51
51
|
The source level is denoted as `level_a` and the target level as `level_b`.
|
|
52
52
|
The last level in the LSM tree has no target level so it is never a source level.
|
|
53
53
|
Each compaction compacts a [single table](#table-selection) from `level_a` into all tables in
|
|
@@ -55,7 +55,7 @@ Each compaction compacts a [single table](#table-selection) from `level_a` into
|
|
|
55
55
|
|
|
56
56
|
Invariants:
|
|
57
57
|
* At the end of every beat, there is space in mutable table for the next beat.
|
|
58
|
-
* The manifest is compacted at the end of every beat.
|
|
58
|
+
* The manifest log is compacted at the end of every beat.
|
|
59
59
|
* The compactions' output tables are not [visible](#snapshots-and-compaction) until the compaction has finished.
|
|
60
60
|
|
|
61
61
|
1. First half-bar, first beat ("first beat"):
|
|
@@ -63,15 +63,20 @@ Invariants:
|
|
|
63
63
|
* Allow the per-level table limits to overflow if needed (for example, if we may compact a table
|
|
64
64
|
from level `A` to level `B`, where level `B` is already full).
|
|
65
65
|
* Start compactions from even levels that have reached their table limit.
|
|
66
|
+
* Acquire reservations from the Free Set for all blocks (upper-bound) that will be written
|
|
67
|
+
during this half-bar.
|
|
66
68
|
|
|
67
69
|
2. First half-bar, last beat:
|
|
68
70
|
* Finish ticking any incomplete even-level compactions.
|
|
69
71
|
* Assert on callback completion that all compactions are complete.
|
|
72
|
+
* Release reservations from the Free Set.
|
|
70
73
|
|
|
71
74
|
3. Second half-bar, first beat ("middle beat"):
|
|
72
75
|
* Assert no compactions are currently running.
|
|
73
76
|
* Start compactions from odd levels that have reached their table limit.
|
|
74
77
|
* Compact the immutable table if it contains any sorted values (it might be empty).
|
|
78
|
+
* Acquire reservations from the Free Set for all blocks (upper-bound) that will be written
|
|
79
|
+
during this half-bar.
|
|
75
80
|
|
|
76
81
|
4. Second half-bar, last beat:
|
|
77
82
|
* Finish ticking any incomplete odd-level and immutable table compactions.
|
|
@@ -79,6 +84,7 @@ Invariants:
|
|
|
79
84
|
* Assert on callback completion that no level's table count overflows.
|
|
80
85
|
* Flush, clear, and sort mutable table values into immutable table for next bar.
|
|
81
86
|
* Remove input tables that are invisible to all current and persisted snapshots.
|
|
87
|
+
* Release reservations from the Free Set.
|
|
82
88
|
|
|
83
89
|
### Compaction Selection Policy
|
|
84
90
|
|
|
@@ -212,3 +218,91 @@ At the end of the last beat of the compaction bar (`23`):
|
|
|
212
218
|
- Updates from ops `0→B` (`0…15`) are on disk.
|
|
213
219
|
- Updates from ops `B→C` (`16…23`) are moved from the mutable table to the immutable table.
|
|
214
220
|
- `tree.lookup_snapshot_max` is `x` when committing op `x` (for `x ∈ {24,25,…}`).
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
## Manifest
|
|
224
|
+
|
|
225
|
+
The manifest is a tree's index of table locations and metadata.
|
|
226
|
+
(Not to be confused with the [SuperBlock Manifest](../vsr/README.md#manifest)).
|
|
227
|
+
|
|
228
|
+
Each manifest has two components:
|
|
229
|
+
- a single [`ManifestLog`](#manifest-log) shared by all levels, and
|
|
230
|
+
- one [`ManifestLevel`](#manifest-level) for each on-disk level.
|
|
231
|
+
|
|
232
|
+
### Manifest Log
|
|
233
|
+
|
|
234
|
+
The manifest log is an on-disk log of all updates to the tree's table index.
|
|
235
|
+
|
|
236
|
+
The manifest log tracks:
|
|
237
|
+
|
|
238
|
+
- tables created as compaction output
|
|
239
|
+
- tables updated as compaction input (modifying their `snapshot_max`)
|
|
240
|
+
- tables moved between levels by compaction
|
|
241
|
+
- tables deleted after compaction
|
|
242
|
+
|
|
243
|
+
Updates are accumulated in-memory before being flushed:
|
|
244
|
+
|
|
245
|
+
- incrementally during compaction, or
|
|
246
|
+
- in their entirety during checkpoint.
|
|
247
|
+
|
|
248
|
+
The manifest log is periodically compacted to remove older entries that have been superseded by
|
|
249
|
+
newer entries. For example, if a table is created and later deleted, manifest log compaction
|
|
250
|
+
will eventually remove any reference to the table from the log blocks.
|
|
251
|
+
|
|
252
|
+
All manifest log blocks are tracked in the superblock manifest.
|
|
253
|
+
|
|
254
|
+
### Manifest Level
|
|
255
|
+
|
|
256
|
+
A `ManifestLevel` is an in-memory collection of the table metadata for a single level of a tree.
|
|
257
|
+
|
|
258
|
+
For a given level and snapshot, there may be gaps in the key ranges of the visible tables,
|
|
259
|
+
but the key ranges are disjoint.
|
|
260
|
+
|
|
261
|
+
Manifest levels are queried for tables at a target snapshot and within a key range.
|
|
262
|
+
|
|
263
|
+
#### Example
|
|
264
|
+
|
|
265
|
+
Given the `ManifestLevel` tables (with values chosen for visualization, not realism):
|
|
266
|
+
|
|
267
|
+
label A B C D E F G H I J K L M
|
|
268
|
+
key_min 0 4 12 16 4 8 12 26 4 25 4 16 24
|
|
269
|
+
key_max 3 11 15 19 7 11 15 27 7 27 11 19 27
|
|
270
|
+
snapshot_min 1 1 1 1 3 3 3 3 5 5 7 7 7
|
|
271
|
+
snapshot_max 9 3 3 7 5 7 9 5 7 7 9 9 9
|
|
272
|
+
|
|
273
|
+
A level's tables can be visualized in 2D as a partitioned rectangle:
|
|
274
|
+
|
|
275
|
+
0 1 2
|
|
276
|
+
0 4 8 2 6 0 4 8
|
|
277
|
+
9┌───┬───────┬───┬───┬───┬───┐
|
|
278
|
+
│ │ K │ │ L │###│ M │
|
|
279
|
+
7│ ├───┬───┤ ├───┤###└┬──┤
|
|
280
|
+
│ │ I │ │ G │ │####│ J│
|
|
281
|
+
5│ A ├───┤ F │ │ │####└┬─┤
|
|
282
|
+
│ │ E │ │ │ D │#####│H│
|
|
283
|
+
3│ ├───┴───┼───┤ │#####└─┤
|
|
284
|
+
│ │ B │ C │ │#######│
|
|
285
|
+
1└───┴───────┴───┴───┴───────┘
|
|
286
|
+
|
|
287
|
+
Example iterations:
|
|
288
|
+
|
|
289
|
+
visibility snapshots direction key_min key_max tables
|
|
290
|
+
visible 2 ascending 0 28 A, B, C, D
|
|
291
|
+
visible 4 ascending 0 28 A, E, F, G, D, H
|
|
292
|
+
visible 6 descending 12 28 J, D, G
|
|
293
|
+
visible 8 ascending 0 28 A, K, G, L, M
|
|
294
|
+
invisible 2, 4, 6 ascending 0 28 K, L, M
|
|
295
|
+
|
|
296
|
+
Legend:
|
|
297
|
+
|
|
298
|
+
- `#` represents a gap — no tables cover these keys during the snapshot.
|
|
299
|
+
- The horizontal axis represents the key range.
|
|
300
|
+
- The vertical axis represents the snapshot range.
|
|
301
|
+
- Each rectangle is a table within the manifest level.
|
|
302
|
+
- The sides of each rectangle depict:
|
|
303
|
+
- left: `table.key_min` (the diagram is inclusive, and the `table.key_min` is inclusive)
|
|
304
|
+
- right: `table.key_max` (the diagram is EXCLUSIVE, but the `table.key_max` is INCLUSIVE)
|
|
305
|
+
- bottom: `table.snapshot_min` (inclusive)
|
|
306
|
+
- top: `table.snapshot_max` (inclusive)
|
|
307
|
+
- (Not depicted: tables may have `table.key_min == table.key_max`.)
|
|
308
|
+
- (Not depicted: the newest set of tables would have `table.snapshot_max == maxInt(u64)`.)
|
|
@@ -85,7 +85,7 @@ pub fn CompactionType(
|
|
|
85
85
|
);
|
|
86
86
|
|
|
87
87
|
const MergeStreamSelector = struct {
|
|
88
|
-
fn peek(compaction: *const Compaction, stream_id: u32) error{Empty, Drained}!Key {
|
|
88
|
+
fn peek(compaction: *const Compaction, stream_id: u32) error{ Empty, Drained }!Key {
|
|
89
89
|
return switch (stream_id) {
|
|
90
90
|
0 => compaction.iterator_a.peek(),
|
|
91
91
|
1 => compaction.iterator_b.peek(),
|
|
@@ -121,6 +121,7 @@ pub fn CompactionType(
|
|
|
121
121
|
};
|
|
122
122
|
|
|
123
123
|
grid: *Grid,
|
|
124
|
+
grid_reservation: Grid.Reservation,
|
|
124
125
|
range: Manifest.CompactionRange,
|
|
125
126
|
|
|
126
127
|
/// `op_min` is the first op/beat of this compaction's half-bar.
|
|
@@ -152,6 +153,8 @@ pub fn CompactionType(
|
|
|
152
153
|
level_b: u8,
|
|
153
154
|
level_a_input: ?TableInfo,
|
|
154
155
|
|
|
156
|
+
tables_output_count: usize = 0,
|
|
157
|
+
|
|
155
158
|
pub fn init(allocator: mem.Allocator) !Compaction {
|
|
156
159
|
var iterator_a = try IteratorA.init(allocator);
|
|
157
160
|
errdefer iterator_a.deinit(allocator);
|
|
@@ -165,6 +168,7 @@ pub fn CompactionType(
|
|
|
165
168
|
return Compaction{
|
|
166
169
|
// Assigned by start()
|
|
167
170
|
.grid = undefined,
|
|
171
|
+
.grid_reservation = undefined,
|
|
168
172
|
.range = undefined,
|
|
169
173
|
.op_min = undefined,
|
|
170
174
|
.drop_tombstones = undefined,
|
|
@@ -227,6 +231,20 @@ pub fn CompactionType(
|
|
|
227
231
|
|
|
228
232
|
compaction.* = .{
|
|
229
233
|
.grid = grid,
|
|
234
|
+
// Reserve enough blocks to write our output tables in the worst case, where:
|
|
235
|
+
// - no tombstones are dropped,
|
|
236
|
+
// - no values are overwritten,
|
|
237
|
+
// - and all tables are full.
|
|
238
|
+
//
|
|
239
|
+
// We must reserve before doing any async work so that the block acquisition order
|
|
240
|
+
// is deterministic (relative to other concurrent compactions).
|
|
241
|
+
// TODO The replica must stop accepting requests if it runs out of blocks/capacity,
|
|
242
|
+
// rather than panicking here.
|
|
243
|
+
// TODO(Compaction Pacing): Reserve smaller increments, at the start of each beat.
|
|
244
|
+
// (And likewise release the reservation at the end of each beat, instead of at the
|
|
245
|
+
// end of each half-bar).
|
|
246
|
+
// TODO(Move Table) Don't reserve these when we just move the table to the next level.
|
|
247
|
+
.grid_reservation = grid.reserve(range.table_count * Table.block_count_max).?,
|
|
230
248
|
.range = range,
|
|
231
249
|
.op_min = op_min,
|
|
232
250
|
.drop_tombstones = drop_tombstones,
|
|
@@ -303,12 +321,12 @@ pub fn CompactionType(
|
|
|
303
321
|
// This is safe; iterator_b makes a copy of the block before calling us.
|
|
304
322
|
const grid = compaction.grid;
|
|
305
323
|
for (Table.index_data_addresses_used(index_block)) |address| {
|
|
306
|
-
grid.
|
|
324
|
+
grid.release(address);
|
|
307
325
|
}
|
|
308
326
|
for (Table.index_filter_addresses_used(index_block)) |address| {
|
|
309
|
-
grid.
|
|
327
|
+
grid.release(address);
|
|
310
328
|
}
|
|
311
|
-
grid.
|
|
329
|
+
grid.release(Table.index_block_address(index_block));
|
|
312
330
|
}
|
|
313
331
|
|
|
314
332
|
pub fn compact_tick(compaction: *Compaction, callback: Callback) void {
|
|
@@ -441,7 +459,7 @@ pub fn CompactionType(
|
|
|
441
459
|
{
|
|
442
460
|
compaction.table_builder.data_block_finish(.{
|
|
443
461
|
.cluster = compaction.grid.superblock.working.cluster,
|
|
444
|
-
.address = compaction.grid.acquire(),
|
|
462
|
+
.address = compaction.grid.acquire(compaction.grid_reservation),
|
|
445
463
|
});
|
|
446
464
|
|
|
447
465
|
// Mark the finished data block as writable for the next compact_tick() call.
|
|
@@ -457,7 +475,7 @@ pub fn CompactionType(
|
|
|
457
475
|
{
|
|
458
476
|
compaction.table_builder.filter_block_finish(.{
|
|
459
477
|
.cluster = compaction.grid.superblock.working.cluster,
|
|
460
|
-
.address = compaction.grid.acquire(),
|
|
478
|
+
.address = compaction.grid.acquire(compaction.grid_reservation),
|
|
461
479
|
});
|
|
462
480
|
|
|
463
481
|
// Mark the finished filter block as writable for the next compact_tick() call.
|
|
@@ -473,7 +491,7 @@ pub fn CompactionType(
|
|
|
473
491
|
{
|
|
474
492
|
const table = compaction.table_builder.index_block_finish(.{
|
|
475
493
|
.cluster = compaction.grid.superblock.working.cluster,
|
|
476
|
-
.address = compaction.grid.acquire(),
|
|
494
|
+
.address = compaction.grid.acquire(compaction.grid_reservation),
|
|
477
495
|
.snapshot_min = snapshot_min_for_table_output(compaction.op_min),
|
|
478
496
|
// TODO(Persistent Snapshots) set snapshot_max to the minimum snapshot_max of
|
|
479
497
|
// all the (original) input tables.
|
|
@@ -484,6 +502,9 @@ pub fn CompactionType(
|
|
|
484
502
|
compaction.index.block = compaction.table_builder.index_block;
|
|
485
503
|
assert(!compaction.index.writable);
|
|
486
504
|
compaction.index.writable = true;
|
|
505
|
+
|
|
506
|
+
compaction.tables_output_count += 1;
|
|
507
|
+
assert(compaction.tables_output_count <= compaction.range.table_count);
|
|
487
508
|
}
|
|
488
509
|
}
|
|
489
510
|
|
|
@@ -540,6 +561,10 @@ pub fn CompactionType(
|
|
|
540
561
|
assert(compaction.io_pending == 0);
|
|
541
562
|
assert(compaction.merge_done);
|
|
542
563
|
|
|
564
|
+
// TODO(Beat Pacing) This should really be where the compaction callback is invoked,
|
|
565
|
+
// but currently that can occur multiple times per beat.
|
|
566
|
+
compaction.grid.forfeit(compaction.grid_reservation);
|
|
567
|
+
|
|
543
568
|
compaction.status = .idle;
|
|
544
569
|
compaction.merge_done = false;
|
|
545
570
|
}
|
|
@@ -2,9 +2,9 @@ const std = @import("std");
|
|
|
2
2
|
const assert = std.debug.assert;
|
|
3
3
|
const math = std.math;
|
|
4
4
|
|
|
5
|
-
const
|
|
5
|
+
const binary_search_keys_raw = @import("./binary_search.zig").binary_search_keys_raw;
|
|
6
|
+
const binary_search_values_raw = @import("./binary_search.zig").binary_search_values_raw;
|
|
6
7
|
const eytzinger = @import("./eytzinger.zig").eytzinger;
|
|
7
|
-
const perf = @import("./benchmarks/perf.zig");
|
|
8
8
|
|
|
9
9
|
const GiB = 1 << 30;
|
|
10
10
|
const searches = 500_000;
|
|
@@ -53,7 +53,7 @@ pub fn main() !void {
|
|
|
53
53
|
defer arena.deinit();
|
|
54
54
|
|
|
55
55
|
const blob_size = GiB;
|
|
56
|
-
var blob = try arena.allocator.alloc(u8, blob_size);
|
|
56
|
+
var blob = try arena.allocator().alloc(u8, blob_size);
|
|
57
57
|
|
|
58
58
|
inline for (kv_types) |kv| {
|
|
59
59
|
inline for (values_per_page) |values_count, v| {
|
|
@@ -65,13 +65,13 @@ pub fn main() !void {
|
|
|
65
65
|
.keys_count = keys_count,
|
|
66
66
|
.values_count = values_count,
|
|
67
67
|
.searches = searches,
|
|
68
|
-
}, blob,
|
|
68
|
+
}, blob, prng.random());
|
|
69
69
|
}
|
|
70
70
|
}
|
|
71
71
|
}
|
|
72
72
|
}
|
|
73
73
|
|
|
74
|
-
fn run_benchmark(comptime layout: Layout, blob: []u8, random:
|
|
74
|
+
fn run_benchmark(comptime layout: Layout, blob: []u8, random: std.rand.Random) !void {
|
|
75
75
|
assert(blob.len == layout.blob_size);
|
|
76
76
|
const Eytzinger = eytzinger(layout.keys_count - 1, layout.values_count);
|
|
77
77
|
const V = Value(layout);
|
|
@@ -88,7 +88,7 @@ fn run_benchmark(comptime layout: Layout, blob: []u8, random: *std.rand.Random)
|
|
|
88
88
|
|
|
89
89
|
// Generate 1GiB worth of 24KiB pages.
|
|
90
90
|
var blob_alloc = std.heap.FixedBufferAllocator.init(blob);
|
|
91
|
-
var pages = try blob_alloc.allocator.alloc(Page, page_count);
|
|
91
|
+
var pages = try blob_alloc.allocator().alloc(Page, page_count);
|
|
92
92
|
random.bytes(std.mem.sliceAsBytes(pages));
|
|
93
93
|
for (pages) |*page| {
|
|
94
94
|
for (page.values) |*value, i| value.key = i;
|
|
@@ -105,7 +105,17 @@ fn run_benchmark(comptime layout: Layout, blob: []u8, random: *std.rand.Random)
|
|
|
105
105
|
const target = value_picker[v % value_picker.len];
|
|
106
106
|
const page = &pages[page_index];
|
|
107
107
|
const bounds = Eytzinger.search_values(K, V, V.key_compare, &page.keys, &page.values, target);
|
|
108
|
-
const hit = bounds[
|
|
108
|
+
const hit = bounds[
|
|
109
|
+
binary_search_values_raw(
|
|
110
|
+
K,
|
|
111
|
+
V,
|
|
112
|
+
V.key_from_value,
|
|
113
|
+
V.key_compare,
|
|
114
|
+
bounds,
|
|
115
|
+
target,
|
|
116
|
+
.{},
|
|
117
|
+
)
|
|
118
|
+
];
|
|
109
119
|
|
|
110
120
|
assert(hit.key == target);
|
|
111
121
|
if (i % pages.len == 0) v += 1;
|
|
@@ -136,7 +146,17 @@ fn run_benchmark(comptime layout: Layout, blob: []u8, random: *std.rand.Random)
|
|
|
136
146
|
while (i < layout.searches) : (i += 1) {
|
|
137
147
|
const target = value_picker[v % value_picker.len];
|
|
138
148
|
const page = &pages[page_picker[i % page_picker.len]];
|
|
139
|
-
const hit = page.values[
|
|
149
|
+
const hit = page.values[
|
|
150
|
+
binary_search_values_raw(
|
|
151
|
+
K,
|
|
152
|
+
V,
|
|
153
|
+
V.key_from_value,
|
|
154
|
+
V.key_compare,
|
|
155
|
+
page.values[0..],
|
|
156
|
+
target,
|
|
157
|
+
.{},
|
|
158
|
+
)
|
|
159
|
+
];
|
|
140
160
|
|
|
141
161
|
assert(hit.key == target);
|
|
142
162
|
if (i % pages.len == 0) v += 1;
|
|
@@ -182,14 +202,10 @@ fn Value(comptime layout: Layout) type {
|
|
|
182
202
|
assert(@sizeOf(Self) == layout.value_size);
|
|
183
203
|
}
|
|
184
204
|
|
|
185
|
-
inline fn key_from_value(self: Self) Key {
|
|
205
|
+
inline fn key_from_value(self: *const Self) Key {
|
|
186
206
|
return self.key;
|
|
187
207
|
}
|
|
188
208
|
|
|
189
|
-
inline fn key_from_key(x: Key) Key {
|
|
190
|
-
return x;
|
|
191
|
-
}
|
|
192
|
-
|
|
193
209
|
inline fn key_compare(a: Key, b: Key) math.Order {
|
|
194
210
|
return math.order(a, b);
|
|
195
211
|
}
|
|
@@ -206,9 +222,7 @@ const BenchmarkResult = struct {
|
|
|
206
222
|
branch_misses: usize,
|
|
207
223
|
};
|
|
208
224
|
|
|
209
|
-
const PERF =
|
|
210
|
-
const perf_event_attr = perf.perf_event_attr;
|
|
211
|
-
const perf_event_open = perf.perf_event_open;
|
|
225
|
+
const PERF = std.os.linux.PERF;
|
|
212
226
|
const perf_counters = [_]PERF.COUNT.HW{
|
|
213
227
|
PERF.COUNT.HW.CPU_CYCLES,
|
|
214
228
|
PERF.COUNT.HW.INSTRUCTIONS,
|
|
@@ -223,10 +237,9 @@ const Benchmark = struct {
|
|
|
223
237
|
perf_fds: [perf_counters.len]std.os.fd_t,
|
|
224
238
|
|
|
225
239
|
fn begin() !Benchmark {
|
|
226
|
-
const flags = PERF.FLAG.FD_NO_GROUP;
|
|
227
240
|
var perf_fds = [1]std.os.fd_t{-1} ** perf_counters.len;
|
|
228
241
|
for (perf_counters) |counter, i| {
|
|
229
|
-
var attr: perf_event_attr = .{
|
|
242
|
+
var attr: std.os.linux.perf_event_attr = .{
|
|
230
243
|
.type = PERF.TYPE.HARDWARE,
|
|
231
244
|
.config = @enumToInt(counter),
|
|
232
245
|
.flags = .{
|
|
@@ -235,7 +248,7 @@ const Benchmark = struct {
|
|
|
235
248
|
.exclude_hv = true,
|
|
236
249
|
},
|
|
237
250
|
};
|
|
238
|
-
perf_fds[i] = try perf_event_open(&attr, 0, -1, perf_fds[0], PERF.FLAG.FD_CLOEXEC);
|
|
251
|
+
perf_fds[i] = try std.os.perf_event_open(&attr, 0, -1, perf_fds[0], PERF.FLAG.FD_CLOEXEC);
|
|
239
252
|
}
|
|
240
253
|
const err = std.os.linux.ioctl(perf_fds[0], PERF.EVENT_IOC.ENABLE, PERF.IOC_FLAG_GROUP);
|
|
241
254
|
if (err == -1) return error.Unexpected;
|
|
@@ -274,7 +287,7 @@ const Benchmark = struct {
|
|
|
274
287
|
};
|
|
275
288
|
|
|
276
289
|
// shuffle([0,1,…,n-1])
|
|
277
|
-
fn shuffled_index(comptime n: usize, rand:
|
|
290
|
+
fn shuffled_index(comptime n: usize, rand: std.rand.Random) [n]usize {
|
|
278
291
|
var indices: [n]usize = undefined;
|
|
279
292
|
for (indices) |*i, j| i.* = j;
|
|
280
293
|
rand.shuffle(usize, indices[0..]);
|
|
@@ -307,7 +320,7 @@ fn binary_search_keys(
|
|
|
307
320
|
assert(keys.len == layout.keys_count);
|
|
308
321
|
assert(values.len == layout.values_count);
|
|
309
322
|
|
|
310
|
-
const key_index =
|
|
323
|
+
const key_index = binary_search_keys_raw(Key, compare_keys, keys, key, .{});
|
|
311
324
|
const key_stride = layout.values_count / layout.keys_count;
|
|
312
325
|
const high = key_index * key_stride;
|
|
313
326
|
if (key_index < keys.len and keys[key_index] == key) {
|
|
@@ -38,7 +38,10 @@ const Environment = struct {
|
|
|
38
38
|
const cluster = 32;
|
|
39
39
|
const replica = 4;
|
|
40
40
|
// TODO Is this appropriate for the number of fuzz_ops we want to run?
|
|
41
|
-
const size_max = vsr.Zone.superblock.size().? +
|
|
41
|
+
const size_max = vsr.Zone.superblock.size().? +
|
|
42
|
+
vsr.Zone.wal_headers.size().? +
|
|
43
|
+
vsr.Zone.wal_prepares.size().? +
|
|
44
|
+
1024 * 1024 * 1024;
|
|
42
45
|
|
|
43
46
|
const node_count = 1024;
|
|
44
47
|
// This is the smallest size that set_associative_cache will allow us.
|
|
@@ -80,6 +83,7 @@ const Environment = struct {
|
|
|
80
83
|
forest: Forest,
|
|
81
84
|
// We need @fieldParentPtr() of forest, so we can't use an optional Forest.
|
|
82
85
|
forest_exists: bool,
|
|
86
|
+
checkpoint_op: ?u64 = null,
|
|
83
87
|
|
|
84
88
|
fn init(env: *Environment, storage: *Storage) !void {
|
|
85
89
|
env.state = .uninit;
|
|
@@ -184,7 +188,8 @@ const Environment = struct {
|
|
|
184
188
|
env.change_state(.forest_compacting, .forest_open);
|
|
185
189
|
}
|
|
186
190
|
|
|
187
|
-
pub fn checkpoint(env: *Environment) void {
|
|
191
|
+
pub fn checkpoint(env: *Environment, op: u64) void {
|
|
192
|
+
env.checkpoint_op = op - config.lsm_batch_multiple;
|
|
188
193
|
env.change_state(.forest_open, .forest_checkpointing);
|
|
189
194
|
env.forest.checkpoint(forest_checkpoint_callback);
|
|
190
195
|
env.tick_until_state_change(.forest_checkpointing, .superblock_checkpointing);
|
|
@@ -194,7 +199,14 @@ const Environment = struct {
|
|
|
194
199
|
fn forest_checkpoint_callback(forest: *Forest) void {
|
|
195
200
|
const env = @fieldParentPtr(@This(), "forest", forest);
|
|
196
201
|
env.change_state(.forest_checkpointing, .superblock_checkpointing);
|
|
197
|
-
env.superblock.checkpoint(superblock_checkpoint_callback, &env.superblock_context
|
|
202
|
+
env.superblock.checkpoint(superblock_checkpoint_callback, &env.superblock_context, .{
|
|
203
|
+
.commit_min_checksum = env.superblock.working.vsr_state.commit_min_checksum + 1,
|
|
204
|
+
.commit_min = env.checkpoint_op.?,
|
|
205
|
+
.commit_max = env.checkpoint_op.? + 1,
|
|
206
|
+
.view_normal = 0,
|
|
207
|
+
.view = 0,
|
|
208
|
+
});
|
|
209
|
+
env.checkpoint_op = null;
|
|
198
210
|
}
|
|
199
211
|
|
|
200
212
|
fn superblock_checkpoint_callback(superblock_context: *SuperBlock.Context) void {
|
|
@@ -236,17 +248,17 @@ const Environment = struct {
|
|
|
236
248
|
|
|
237
249
|
for (fuzz_ops) |fuzz_op, fuzz_op_index| {
|
|
238
250
|
log.debug("Running fuzz_ops[{}/{}] == {}", .{ fuzz_op_index, fuzz_ops.len, fuzz_op });
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
251
|
+
const storage_size_used = storage.size_used();
|
|
252
|
+
log.debug("storage.size_used = {}/{}", .{ storage_size_used, storage.size });
|
|
253
|
+
const model_size = model.count() * @sizeOf(Account);
|
|
254
|
+
log.debug("space_amplification = {d:.2}", .{
|
|
255
|
+
@intToFloat(f64, storage_size_used) / @intToFloat(f64, model_size),
|
|
256
|
+
});
|
|
244
257
|
// Apply fuzz_op to the forest and the model.
|
|
245
258
|
switch (fuzz_op) {
|
|
246
259
|
.compact => |compact| {
|
|
247
260
|
env.compact(compact.op);
|
|
248
|
-
if (compact.checkpoint)
|
|
249
|
-
env.checkpoint();
|
|
261
|
+
if (compact.checkpoint) env.checkpoint(compact.op);
|
|
250
262
|
},
|
|
251
263
|
.put_account => |account| {
|
|
252
264
|
env.forest.grooves.accounts.put(&account);
|
|
@@ -274,27 +286,9 @@ const Environment = struct {
|
|
|
274
286
|
}
|
|
275
287
|
};
|
|
276
288
|
|
|
277
|
-
pub fn run_fuzz_ops(fuzz_ops: []const FuzzOp) !void {
|
|
289
|
+
pub fn run_fuzz_ops(storage_options: Storage.Options, fuzz_ops: []const FuzzOp) !void {
|
|
278
290
|
// Init mocked storage.
|
|
279
|
-
var storage = try Storage.init(
|
|
280
|
-
allocator,
|
|
281
|
-
Environment.size_max,
|
|
282
|
-
Storage.Options{
|
|
283
|
-
// We don't apply storage faults yet, so this seed doesn't matter.
|
|
284
|
-
.seed = 0xdeadbeef,
|
|
285
|
-
.read_latency_min = 0,
|
|
286
|
-
.read_latency_mean = 0,
|
|
287
|
-
.write_latency_min = 0,
|
|
288
|
-
.write_latency_mean = 0,
|
|
289
|
-
.read_fault_probability = 0,
|
|
290
|
-
.write_fault_probability = 0,
|
|
291
|
-
},
|
|
292
|
-
0,
|
|
293
|
-
.{
|
|
294
|
-
.first_offset = 0,
|
|
295
|
-
.period = 0,
|
|
296
|
-
},
|
|
297
|
-
);
|
|
291
|
+
var storage = try Storage.init(allocator, Environment.size_max, storage_options);
|
|
298
292
|
defer storage.deinit(allocator);
|
|
299
293
|
|
|
300
294
|
try Environment.format(&storage);
|
|
@@ -354,6 +348,7 @@ pub fn generate_fuzz_ops(random: std.rand.Random) ![]const FuzzOp {
|
|
|
354
348
|
const checkpoint =
|
|
355
349
|
// Can only checkpoint on the last beat of the bar.
|
|
356
350
|
compact_op % config.lsm_batch_multiple == config.lsm_batch_multiple - 1 and
|
|
351
|
+
compact_op > config.lsm_batch_multiple and
|
|
357
352
|
// Checkpoint at roughly the same rate as log wraparound.
|
|
358
353
|
random.uintLessThan(usize, Environment.compacts_per_checkpoint) == 0;
|
|
359
354
|
break :compact FuzzOp{
|
|
@@ -402,11 +397,18 @@ pub fn generate_fuzz_ops(random: std.rand.Random) ![]const FuzzOp {
|
|
|
402
397
|
pub fn main() !void {
|
|
403
398
|
const fuzz_args = try fuzz.parse_fuzz_args(allocator);
|
|
404
399
|
var rng = std.rand.DefaultPrng.init(fuzz_args.seed);
|
|
400
|
+
const random = rng.random();
|
|
405
401
|
|
|
406
|
-
const fuzz_ops = try generate_fuzz_ops(
|
|
402
|
+
const fuzz_ops = try generate_fuzz_ops(random);
|
|
407
403
|
defer allocator.free(fuzz_ops);
|
|
408
404
|
|
|
409
|
-
try run_fuzz_ops(
|
|
405
|
+
try run_fuzz_ops(Storage.Options{
|
|
406
|
+
.seed = random.int(u64),
|
|
407
|
+
.read_latency_min = 0,
|
|
408
|
+
.read_latency_mean = 0 + fuzz.random_int_exponential(random, u64, 20),
|
|
409
|
+
.write_latency_min = 0,
|
|
410
|
+
.write_latency_mean = 0 + fuzz.random_int_exponential(random, u64, 20),
|
|
411
|
+
}, fuzz_ops);
|
|
410
412
|
|
|
411
413
|
log.info("Passed!", .{});
|
|
412
414
|
}
|