tigerbeetle-node 0.10.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +302 -101
- package/dist/index.d.ts +70 -72
- package/dist/index.js +70 -72
- package/dist/index.js.map +1 -1
- package/package.json +9 -8
- package/scripts/download_node_headers.sh +14 -7
- package/src/index.ts +6 -10
- package/src/node.zig +6 -3
- package/src/tigerbeetle/scripts/benchmark.sh +4 -4
- package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
- package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
- package/src/tigerbeetle/scripts/install.sh +19 -4
- package/src/tigerbeetle/scripts/install_zig.bat +5 -1
- package/src/tigerbeetle/scripts/install_zig.sh +24 -14
- package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
- package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
- package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
- package/src/tigerbeetle/src/benchmark.zig +29 -13
- package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
- package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
- package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -257
- package/src/tigerbeetle/src/c/tb_client.h +118 -84
- package/src/tigerbeetle/src/c/tb_client.zig +88 -23
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
- package/src/tigerbeetle/src/c/test.zig +371 -1
- package/src/tigerbeetle/src/cli.zig +37 -7
- package/src/tigerbeetle/src/config.zig +58 -17
- package/src/tigerbeetle/src/demo.zig +5 -2
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
- package/src/tigerbeetle/src/ewah.zig +11 -33
- package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
- package/src/tigerbeetle/src/io/linux.zig +1 -1
- package/src/tigerbeetle/src/lsm/README.md +308 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +376 -397
- package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
- package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
- package/src/tigerbeetle/src/lsm/forest.zig +21 -447
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +414 -0
- package/src/tigerbeetle/src/lsm/grid.zig +170 -76
- package/src/tigerbeetle/src/lsm/groove.zig +197 -133
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
- package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
- package/src/tigerbeetle/src/lsm/manifest.zig +93 -180
- package/src/tigerbeetle/src/lsm/manifest_level.zig +161 -454
- package/src/tigerbeetle/src/lsm/manifest_log.zig +243 -356
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +65 -76
- package/src/tigerbeetle/src/lsm/segmented_array.zig +580 -251
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
- package/src/tigerbeetle/src/lsm/table.zig +115 -68
- package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
- package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -17
- package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
- package/src/tigerbeetle/src/lsm/test.zig +61 -56
- package/src/tigerbeetle/src/lsm/tree.zig +450 -407
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +461 -0
- package/src/tigerbeetle/src/main.zig +83 -8
- package/src/tigerbeetle/src/message_bus.zig +20 -9
- package/src/tigerbeetle/src/message_pool.zig +22 -19
- package/src/tigerbeetle/src/ring_buffer.zig +7 -3
- package/src/tigerbeetle/src/simulator.zig +179 -119
- package/src/tigerbeetle/src/state_machine.zig +381 -246
- package/src/tigerbeetle/src/static_allocator.zig +65 -0
- package/src/tigerbeetle/src/storage.zig +3 -7
- package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
- package/src/tigerbeetle/src/test/accounting/workload.zig +823 -0
- package/src/tigerbeetle/src/test/cluster.zig +33 -81
- package/src/tigerbeetle/src/test/conductor.zig +366 -0
- package/src/tigerbeetle/src/test/fuzz.zig +121 -0
- package/src/tigerbeetle/src/test/id.zig +89 -0
- package/src/tigerbeetle/src/test/network.zig +45 -19
- package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
- package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
- package/src/tigerbeetle/src/test/state_checker.zig +91 -69
- package/src/tigerbeetle/src/test/state_machine.zig +11 -35
- package/src/tigerbeetle/src/test/storage.zig +470 -106
- package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
- package/src/tigerbeetle/src/tigerbeetle.zig +15 -16
- package/src/tigerbeetle/src/unit_tests.zig +13 -1
- package/src/tigerbeetle/src/util.zig +97 -11
- package/src/tigerbeetle/src/vopr.zig +495 -0
- package/src/tigerbeetle/src/vsr/client.zig +21 -3
- package/src/tigerbeetle/src/vsr/journal.zig +293 -212
- package/src/tigerbeetle/src/vsr/replica.zig +1086 -515
- package/src/tigerbeetle/src/vsr/superblock.zig +382 -637
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +14 -16
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +416 -153
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +62 -12
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
- package/src/tigerbeetle/src/vsr.zig +94 -60
- package/src/tigerbeetle/scripts/vopr.bat +0 -48
- package/src/tigerbeetle/scripts/vopr.sh +0 -33
- package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
- package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
|
@@ -1,8 +1,41 @@
|
|
|
1
|
+
//! Compaction moves or merges a table's values into the next level.
|
|
2
|
+
//!
|
|
3
|
+
//! Each Compaction is paced to run in one half-bar.
|
|
4
|
+
//!
|
|
5
|
+
//!
|
|
6
|
+
//! Compaction overview:
|
|
7
|
+
//!
|
|
8
|
+
//! 1. Given:
|
|
9
|
+
//!
|
|
10
|
+
//! - levels A and B, where A+1=B
|
|
11
|
+
//! - a single table in level A ("table A")
|
|
12
|
+
//! - all tables from level B which intersect table A's key range ("tables B")
|
|
13
|
+
//! (This can include anything between 0 tables and all of level B's tables.)
|
|
14
|
+
//!
|
|
15
|
+
//! 2. If table A's key range is disjoint from the keys in level B, move table A into level B.
|
|
16
|
+
//! All done! (But if the key ranges intersect, jump to step 3).
|
|
17
|
+
//!
|
|
18
|
+
//! 3. Create an iterator from the sort-merge of table A and the concatenation of tables B.
|
|
19
|
+
//! If the same key exists in level A and B, take A's and discard B's. †
|
|
20
|
+
//!
|
|
21
|
+
//! 4. Write the sort-merge iterator into a sequence of new tables on disk.
|
|
22
|
+
//!
|
|
23
|
+
//! 5. Update the input tables in the Manifest with their new `snapshot_max` so that they become
|
|
24
|
+
//! invisible to subsequent read transactions.
|
|
25
|
+
//!
|
|
26
|
+
//! 6. Insert the new level-B tables into the Manifest.
|
|
27
|
+
//!
|
|
28
|
+
//! † When A's value is a tombstone, there is a special case for garbage collection. When either:
|
|
29
|
+
//! * level B is the final level, or
|
|
30
|
+
//! * A's key does not exist in B or any deeper level,
|
|
31
|
+
//! then the tombstone is omitted from the compacted output (see: `compaction_must_drop_tombstones`).
|
|
32
|
+
//!
|
|
1
33
|
const std = @import("std");
|
|
2
34
|
const mem = std.mem;
|
|
3
35
|
const math = std.math;
|
|
4
36
|
const assert = std.debug.assert;
|
|
5
37
|
|
|
38
|
+
const log = std.log.scoped(.compaction);
|
|
6
39
|
const config = @import("../config.zig");
|
|
7
40
|
|
|
8
41
|
const GridType = @import("grid.zig").GridType;
|
|
@@ -14,84 +47,113 @@ const LevelIteratorType = @import("level_iterator.zig").LevelIteratorType;
|
|
|
14
47
|
pub fn CompactionType(
|
|
15
48
|
comptime Table: type,
|
|
16
49
|
comptime Storage: type,
|
|
17
|
-
comptime IteratorAType: anytype,
|
|
50
|
+
comptime IteratorAType: anytype,
|
|
18
51
|
) type {
|
|
19
52
|
const Key = Table.Key;
|
|
20
53
|
const Value = Table.Value;
|
|
21
54
|
const tombstone = Table.tombstone;
|
|
22
|
-
const compare_keys = Table.compare_keys;
|
|
23
55
|
|
|
24
56
|
return struct {
|
|
25
57
|
const Compaction = @This();
|
|
26
58
|
|
|
27
59
|
const Grid = GridType(Storage);
|
|
28
60
|
const BlockPtr = Grid.BlockPtr;
|
|
61
|
+
const BlockPtrConst = Grid.BlockPtrConst;
|
|
62
|
+
const BlockWrite = struct {
|
|
63
|
+
write: Grid.Write = undefined,
|
|
64
|
+
block: BlockPtr = undefined,
|
|
65
|
+
writable: bool = false,
|
|
66
|
+
};
|
|
67
|
+
|
|
29
68
|
const Manifest = ManifestType(Table, Storage);
|
|
30
69
|
const TableInfo = Manifest.TableInfo;
|
|
31
70
|
|
|
32
71
|
const IteratorA = IteratorAType(Table, Storage);
|
|
33
72
|
const IteratorB = LevelIteratorType(Table, Storage);
|
|
34
73
|
|
|
35
|
-
pub const Callback = fn (it: *Compaction) void;
|
|
36
|
-
|
|
37
74
|
const k = 2;
|
|
38
75
|
const MergeIterator = KWayMergeIterator(
|
|
39
76
|
Compaction,
|
|
40
|
-
Key,
|
|
41
|
-
Value,
|
|
77
|
+
Table.Key,
|
|
78
|
+
Table.Value,
|
|
42
79
|
Table.key_from_value,
|
|
43
80
|
Table.compare_keys,
|
|
44
81
|
k,
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
82
|
+
MergeStreamSelector.peek,
|
|
83
|
+
MergeStreamSelector.pop,
|
|
84
|
+
MergeStreamSelector.precedence,
|
|
48
85
|
);
|
|
49
86
|
|
|
50
|
-
const
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
87
|
+
const MergeStreamSelector = struct {
|
|
88
|
+
fn peek(compaction: *const Compaction, stream_id: u32) error{ Empty, Drained }!Key {
|
|
89
|
+
return switch (stream_id) {
|
|
90
|
+
0 => compaction.iterator_a.peek(),
|
|
91
|
+
1 => compaction.iterator_b.peek(),
|
|
92
|
+
else => unreachable,
|
|
93
|
+
};
|
|
94
|
+
}
|
|
55
95
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
96
|
+
fn pop(compaction: *Compaction, stream_id: u32) Value {
|
|
97
|
+
return switch (stream_id) {
|
|
98
|
+
0 => compaction.iterator_a.pop(),
|
|
99
|
+
1 => compaction.iterator_b.pop(),
|
|
100
|
+
else => unreachable,
|
|
101
|
+
};
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/// Returns true if stream A has higher precedence than stream B.
|
|
105
|
+
/// This is used to deduplicate values across streams.
|
|
106
|
+
fn precedence(compaction: *const Compaction, stream_a: u32, stream_b: u32) bool {
|
|
107
|
+
_ = compaction;
|
|
108
|
+
assert(stream_a + stream_b == 1);
|
|
109
|
+
|
|
110
|
+
// All tables in iterator_a (stream=0) have a higher precedence.
|
|
111
|
+
return stream_a == 0;
|
|
112
|
+
}
|
|
61
113
|
};
|
|
62
114
|
|
|
63
|
-
const
|
|
115
|
+
pub const Callback = fn (it: *Compaction) void;
|
|
64
116
|
|
|
65
|
-
|
|
117
|
+
const Status = enum {
|
|
118
|
+
idle,
|
|
119
|
+
processing,
|
|
120
|
+
done,
|
|
121
|
+
};
|
|
66
122
|
|
|
67
123
|
grid: *Grid,
|
|
68
|
-
|
|
69
|
-
level_b: u8,
|
|
124
|
+
grid_reservation: Grid.Reservation,
|
|
70
125
|
range: Manifest.CompactionRange,
|
|
71
|
-
|
|
126
|
+
|
|
127
|
+
/// `op_min` is the first op/beat of this compaction's half-bar.
|
|
128
|
+
/// `op_min` is used as a snapshot — the compaction's input tables must be visible
|
|
129
|
+
/// to `op_min`.
|
|
130
|
+
///
|
|
131
|
+
/// After this compaction finishes:
|
|
132
|
+
/// - `op_min + half_bar_beat_count - 1` will be the input tables' snapshot_max.
|
|
133
|
+
/// - `op_min + half_bar_beat_count` will be the output tables' snapshot_min.
|
|
134
|
+
op_min: u64,
|
|
72
135
|
drop_tombstones: bool,
|
|
73
136
|
|
|
137
|
+
status: Status,
|
|
74
138
|
callback: ?Callback = null,
|
|
75
|
-
ticks: u32 = 0,
|
|
76
139
|
io_pending: u32 = 0,
|
|
77
140
|
|
|
78
141
|
iterator_a: IteratorA,
|
|
79
142
|
iterator_b: IteratorB,
|
|
80
143
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
/// because a write I/O may yet follow even after the merge is done.
|
|
84
|
-
merge_done: bool = false,
|
|
85
|
-
merge_iterator: MergeIterator,
|
|
86
|
-
table_builder: Table.Builder,
|
|
144
|
+
merge_done: bool,
|
|
145
|
+
merge_iterator: ?MergeIterator,
|
|
87
146
|
|
|
147
|
+
table_builder: Table.Builder,
|
|
88
148
|
index: BlockWrite,
|
|
89
149
|
filter: BlockWrite,
|
|
90
150
|
data: BlockWrite,
|
|
91
151
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
152
|
+
manifest: *Manifest,
|
|
153
|
+
level_b: u8,
|
|
154
|
+
level_a_input: ?TableInfo,
|
|
155
|
+
|
|
156
|
+
tables_output_count: usize = 0,
|
|
95
157
|
|
|
96
158
|
pub fn init(allocator: mem.Allocator) !Compaction {
|
|
97
159
|
var iterator_a = try IteratorA.init(allocator);
|
|
@@ -103,162 +165,137 @@ pub fn CompactionType(
|
|
|
103
165
|
var table_builder = try Table.Builder.init(allocator);
|
|
104
166
|
errdefer table_builder.deinit(allocator);
|
|
105
167
|
|
|
106
|
-
const index = BlockWrite{ .block = try allocate_block(allocator) };
|
|
107
|
-
errdefer allocator.free(index.block);
|
|
108
|
-
|
|
109
|
-
const filter = BlockWrite{ .block = try allocate_block(allocator) };
|
|
110
|
-
errdefer allocator.free(filter.block);
|
|
111
|
-
|
|
112
|
-
const data = BlockWrite{ .block = try allocate_block(allocator) };
|
|
113
|
-
errdefer allocator.free(data.block);
|
|
114
|
-
|
|
115
|
-
// The average number of tables involved in a compaction is the 1 table from level A,
|
|
116
|
-
// plus the growth_factor number of tables from level B, plus 1 on either side,
|
|
117
|
-
// since the overlap may not be perfectly aligned to table boundaries.
|
|
118
|
-
// However, the worst case number of tables may approach all tables in level B,
|
|
119
|
-
// since key ranges may be skewed and not evenly distributed across a level.
|
|
120
|
-
const table_buffer_count_max = 1 + config.lsm_growth_factor + 2;
|
|
121
|
-
|
|
122
|
-
var update_level_b = try TableInfoBuffer.init(allocator, table_buffer_count_max);
|
|
123
|
-
errdefer update_level_b.deinit(allocator);
|
|
124
|
-
|
|
125
|
-
var insert_level_b = try TableInfoBuffer.init(allocator, table_buffer_count_max);
|
|
126
|
-
errdefer insert_level_b.deinit(allocator);
|
|
127
|
-
|
|
128
168
|
return Compaction{
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
// Assigned by start():
|
|
169
|
+
// Assigned by start()
|
|
132
170
|
.grid = undefined,
|
|
133
|
-
.
|
|
134
|
-
.level_b = undefined,
|
|
171
|
+
.grid_reservation = undefined,
|
|
135
172
|
.range = undefined,
|
|
136
|
-
.
|
|
173
|
+
.op_min = undefined,
|
|
137
174
|
.drop_tombstones = undefined,
|
|
138
175
|
|
|
176
|
+
.status = .idle,
|
|
139
177
|
.iterator_a = iterator_a,
|
|
140
178
|
.iterator_b = iterator_b,
|
|
141
179
|
|
|
142
|
-
.
|
|
143
|
-
.
|
|
180
|
+
.merge_done = false,
|
|
181
|
+
.merge_iterator = null,
|
|
144
182
|
|
|
145
|
-
.
|
|
146
|
-
.
|
|
147
|
-
.
|
|
183
|
+
.table_builder = table_builder,
|
|
184
|
+
.index = .{},
|
|
185
|
+
.filter = .{},
|
|
186
|
+
.data = .{},
|
|
148
187
|
|
|
149
|
-
|
|
150
|
-
.
|
|
188
|
+
// Assigned by start()
|
|
189
|
+
.manifest = undefined,
|
|
190
|
+
.level_b = undefined,
|
|
191
|
+
.level_a_input = null,
|
|
151
192
|
};
|
|
152
193
|
}
|
|
153
194
|
|
|
154
|
-
fn allocate_block(allocator: mem.Allocator) !BlockPtr {
|
|
155
|
-
const block = try allocator.alignedAlloc(u8, config.sector_size, config.block_size);
|
|
156
|
-
return block[0..config.block_size];
|
|
157
|
-
}
|
|
158
|
-
|
|
159
195
|
pub fn deinit(compaction: *Compaction, allocator: mem.Allocator) void {
|
|
160
|
-
compaction.iterator_a.deinit(allocator);
|
|
161
|
-
compaction.iterator_b.deinit(allocator);
|
|
162
196
|
compaction.table_builder.deinit(allocator);
|
|
163
|
-
compaction.update_level_b.deinit(allocator);
|
|
164
|
-
compaction.insert_level_b.deinit(allocator);
|
|
165
197
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
allocator.free(compaction.data.block);
|
|
198
|
+
compaction.iterator_b.deinit(allocator);
|
|
199
|
+
compaction.iterator_a.deinit(allocator);
|
|
169
200
|
}
|
|
170
201
|
|
|
202
|
+
/// The compaction's input tables are:
|
|
203
|
+
/// * table_a (which is null when level B is 0), and
|
|
204
|
+
/// * any level-B tables visible to `op_min` within `range`.
|
|
171
205
|
pub fn start(
|
|
172
206
|
compaction: *Compaction,
|
|
173
207
|
grid: *Grid,
|
|
174
208
|
manifest: *Manifest,
|
|
175
|
-
|
|
176
|
-
level_b: u8,
|
|
209
|
+
op_min: u64,
|
|
177
210
|
range: Manifest.CompactionRange,
|
|
178
|
-
|
|
211
|
+
table_a: ?*const TableInfo,
|
|
212
|
+
level_b: u8,
|
|
179
213
|
iterator_a_context: IteratorA.Context,
|
|
180
214
|
) void {
|
|
181
215
|
assert(compaction.status == .idle);
|
|
182
216
|
assert(compaction.callback == null);
|
|
183
217
|
assert(compaction.io_pending == 0);
|
|
184
|
-
assert(
|
|
218
|
+
assert(!compaction.merge_done and compaction.merge_iterator == null);
|
|
219
|
+
|
|
220
|
+
assert(op_min % @divExact(config.lsm_batch_multiple, 2) == 0);
|
|
185
221
|
assert(range.table_count > 0);
|
|
222
|
+
if (table_a) |t| assert(t.visible(op_min));
|
|
223
|
+
|
|
224
|
+
assert(level_b < config.lsm_levels);
|
|
225
|
+
assert((level_b == 0) == (table_a == null));
|
|
186
226
|
|
|
227
|
+
// Levels may choose to drop tombstones if keys aren't included in the lower levels.
|
|
228
|
+
// This invariant is always true for the last level as it doesn't have any lower ones.
|
|
187
229
|
const drop_tombstones = manifest.compaction_must_drop_tombstones(level_b, range);
|
|
188
230
|
assert(drop_tombstones or level_b < config.lsm_levels - 1);
|
|
189
231
|
|
|
190
232
|
compaction.* = .{
|
|
191
|
-
.status = .compacting,
|
|
192
|
-
|
|
193
233
|
.grid = grid,
|
|
194
|
-
|
|
195
|
-
|
|
234
|
+
// Reserve enough blocks to write our output tables in the worst case, where:
|
|
235
|
+
// - no tombstones are dropped,
|
|
236
|
+
// - no values are overwritten,
|
|
237
|
+
// - and all tables are full.
|
|
238
|
+
//
|
|
239
|
+
// We must reserve before doing any async work so that the block acquisition order
|
|
240
|
+
// is deterministic (relative to other concurrent compactions).
|
|
241
|
+
// TODO The replica must stop accepting requests if it runs out of blocks/capacity,
|
|
242
|
+
// rather than panicking here.
|
|
243
|
+
// TODO(Compaction Pacing): Reserve smaller increments, at the start of each beat.
|
|
244
|
+
// (And likewise release the reservation at the end of each beat, instead of at the
|
|
245
|
+
// end of each half-bar).
|
|
246
|
+
// TODO(Move Table) Don't reserve these when we just move the table to the next level.
|
|
247
|
+
.grid_reservation = grid.reserve(range.table_count * Table.block_count_max).?,
|
|
196
248
|
.range = range,
|
|
197
|
-
.
|
|
249
|
+
.op_min = op_min,
|
|
198
250
|
.drop_tombstones = drop_tombstones,
|
|
199
251
|
|
|
252
|
+
.status = .processing,
|
|
200
253
|
.iterator_a = compaction.iterator_a,
|
|
201
254
|
.iterator_b = compaction.iterator_b,
|
|
202
255
|
|
|
203
|
-
.
|
|
204
|
-
.
|
|
256
|
+
.merge_done = false,
|
|
257
|
+
.merge_iterator = null,
|
|
205
258
|
|
|
259
|
+
.table_builder = compaction.table_builder,
|
|
206
260
|
.index = compaction.index,
|
|
207
261
|
.filter = compaction.filter,
|
|
208
262
|
.data = compaction.data,
|
|
209
263
|
|
|
210
|
-
.
|
|
211
|
-
.
|
|
264
|
+
.manifest = manifest,
|
|
265
|
+
.level_b = level_b,
|
|
266
|
+
.level_a_input = if (table_a) |table| table.* else null,
|
|
212
267
|
};
|
|
213
268
|
|
|
214
|
-
assert(!compaction.
|
|
215
|
-
assert(!compaction.filter.
|
|
216
|
-
assert(!compaction.
|
|
217
|
-
|
|
218
|
-
// TODO
|
|
219
|
-
|
|
220
|
-
// TODO: Enable when move_table() can fetch TableInfo from address/checksum.
|
|
221
|
-
//
|
|
222
|
-
// Perform a "compaction move" to the next level inline if certain factors allow:
|
|
223
|
-
// - Can only do the specialization if there's a single table to compact.
|
|
224
|
-
// - Must be compacting from a table iterator which has an address and checksum.
|
|
225
|
-
// - Cannot drop tombstones as then we have to go through the normal compaction path.
|
|
226
|
-
// - Cannot be performing the immutable table -> level 0 compaction
|
|
227
|
-
// as it requires the table being moved to reside on disk (tracked by manifest).
|
|
228
|
-
if (false and IteratorA.Context == TableIteratorType(Table, Storage)) {
|
|
229
|
-
if (!drop_tombstones and range.table_count == 1) {
|
|
230
|
-
assert(compaction.level_b != 0);
|
|
231
|
-
assert(compaction.status == .compacting);
|
|
232
|
-
|
|
233
|
-
const level_a = level_b - 1;
|
|
234
|
-
assert(level_a < config.lsm_levels - 1);
|
|
235
|
-
|
|
236
|
-
compaction.manifest.move_table(
|
|
237
|
-
level_a,
|
|
238
|
-
level_b,
|
|
239
|
-
snapshot,
|
|
240
|
-
iterator_a_context.address,
|
|
241
|
-
iterator_a_context.checksum,
|
|
242
|
-
);
|
|
243
|
-
|
|
244
|
-
compaction.status = .done;
|
|
245
|
-
return;
|
|
246
|
-
}
|
|
247
|
-
}
|
|
269
|
+
assert(!compaction.index.writable);
|
|
270
|
+
assert(!compaction.filter.writable);
|
|
271
|
+
assert(!compaction.data.writable);
|
|
272
|
+
|
|
273
|
+
// TODO Implement the manifest.move_table() optimization for when range.table_count == 1.
|
|
274
|
+
// This would do update_tables + insert_tables inline without going through the iterators.
|
|
248
275
|
|
|
249
276
|
const iterator_b_context = .{
|
|
250
277
|
.grid = grid,
|
|
251
278
|
.manifest = manifest,
|
|
252
279
|
.level = level_b,
|
|
253
|
-
.snapshot =
|
|
280
|
+
.snapshot = op_min,
|
|
254
281
|
.key_min = range.key_min,
|
|
255
282
|
.key_max = range.key_max,
|
|
256
283
|
.direction = .ascending,
|
|
257
|
-
.table_info_callback = iterator_b_table_info_callback,
|
|
284
|
+
.table_info_callback = iterator_b_table_info_callback,
|
|
258
285
|
};
|
|
259
286
|
|
|
260
|
-
compaction.iterator_a.start(iterator_a_context,
|
|
261
|
-
compaction.iterator_b.start(iterator_b_context,
|
|
287
|
+
compaction.iterator_a.start(iterator_a_context, iterator_a_io_callback);
|
|
288
|
+
compaction.iterator_b.start(iterator_b_context, iterator_b_io_callback);
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
fn iterator_a_io_callback(iterator_a: *IteratorA) void {
|
|
292
|
+
const compaction = @fieldParentPtr(Compaction, "iterator_a", iterator_a);
|
|
293
|
+
compaction.io_finish();
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
fn iterator_b_io_callback(iterator_b: *IteratorB) void {
|
|
297
|
+
const compaction = @fieldParentPtr(Compaction, "iterator_b", iterator_b);
|
|
298
|
+
compaction.io_finish();
|
|
262
299
|
}
|
|
263
300
|
|
|
264
301
|
fn iterator_b_table_info_callback(
|
|
@@ -267,337 +304,279 @@ pub fn CompactionType(
|
|
|
267
304
|
index_block: BlockPtrConst,
|
|
268
305
|
) void {
|
|
269
306
|
const compaction = @fieldParentPtr(Compaction, "iterator_b", iterator_b);
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
buffer: *TableInfoBuffer,
|
|
289
|
-
table: *const TableInfo,
|
|
290
|
-
) void {
|
|
291
|
-
assert(buffer == &compaction.update_level_b or buffer == &compaction.insert_level_b);
|
|
292
|
-
if (buffer.full()) compaction.update_manifest(buffer);
|
|
293
|
-
buffer.push(table);
|
|
294
|
-
}
|
|
295
|
-
|
|
296
|
-
fn update_manifest(compaction: *Compaction, buffer: *TableInfoBuffer) void {
|
|
297
|
-
assert(buffer == &compaction.update_level_b or buffer == &compaction.insert_level_b);
|
|
298
|
-
|
|
299
|
-
const tables: []const TableInfo = buffer.drain();
|
|
300
|
-
if (tables.len == 0) return;
|
|
301
|
-
|
|
302
|
-
for (tables) |table| {
|
|
303
|
-
assert(compare_keys(table.key_min, compaction.range.key_min) != .lt);
|
|
304
|
-
assert(compare_keys(table.key_max, compaction.range.key_max) != .gt);
|
|
307
|
+
assert(compaction.status == .processing);
|
|
308
|
+
assert(compaction.callback != null);
|
|
309
|
+
assert(!compaction.merge_done);
|
|
310
|
+
assert(table.visible(compaction.op_min));
|
|
311
|
+
|
|
312
|
+
// Tables discovered by iterator_b that are visible at the start of compaction.
|
|
313
|
+
var table_copy = table.*;
|
|
314
|
+
compaction.manifest.update_table(
|
|
315
|
+
compaction.level_b,
|
|
316
|
+
snapshot_max_for_table_input(compaction.op_min),
|
|
317
|
+
&table_copy,
|
|
318
|
+
);
|
|
319
|
+
|
|
320
|
+
// Release the table's block addresses in the Grid as it will be made invisible.
|
|
321
|
+
// This is safe; iterator_b makes a copy of the block before calling us.
|
|
322
|
+
const grid = compaction.grid;
|
|
323
|
+
for (Table.index_data_addresses_used(index_block)) |address| {
|
|
324
|
+
grid.release(address);
|
|
305
325
|
}
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
compaction.manifest.update_tables(compaction.level_b, compaction.snapshot, tables);
|
|
309
|
-
} else {
|
|
310
|
-
compaction.manifest.insert_tables(compaction.level_b, tables);
|
|
326
|
+
for (Table.index_filter_addresses_used(index_block)) |address| {
|
|
327
|
+
grid.release(address);
|
|
311
328
|
}
|
|
329
|
+
grid.release(Table.index_block_address(index_block));
|
|
312
330
|
}
|
|
313
331
|
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
const pipeline_tick_merge = 1;
|
|
317
|
-
const pipeline_tick_write = 2;
|
|
318
|
-
|
|
319
|
-
/// Submits all read/write I/O before starting the CPU-intensive k-way merge.
|
|
320
|
-
/// This allows the I/O to happen in parallel with the merge.
|
|
321
|
-
///
|
|
322
|
-
/// The caller must call:
|
|
323
|
-
///
|
|
324
|
-
/// 1. tick_io() across all trees,
|
|
325
|
-
/// 2. IO.tick() to submit these I/O operations to the kernel,
|
|
326
|
-
/// 3. tick_cpu() across all trees.
|
|
327
|
-
pub fn tick_io(compaction: *Compaction, callback: Callback) void {
|
|
328
|
-
assert(compaction.status == .compacting);
|
|
332
|
+
pub fn compact_tick(compaction: *Compaction, callback: Callback) void {
|
|
333
|
+
assert(compaction.status == .processing);
|
|
329
334
|
assert(compaction.callback == null);
|
|
330
335
|
assert(compaction.io_pending == 0);
|
|
331
336
|
assert(!compaction.merge_done);
|
|
332
337
|
|
|
333
338
|
compaction.callback = callback;
|
|
334
339
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
//
|
|
340
|
+
// Generate fake IO to make sure io_pending doesn't reach zero multiple times from
|
|
341
|
+
// IO being completed inline down below.
|
|
342
|
+
// The fake IO is immediately resolved and triggers the cpu_merge_start if all
|
|
343
|
+
// IO completes inline or if no IO was started.
|
|
344
|
+
compaction.io_start();
|
|
345
|
+
defer compaction.io_finish();
|
|
346
|
+
|
|
347
|
+
// Start reading blocks from the iterators to merge them.
|
|
348
|
+
if (compaction.iterator_a.tick()) compaction.io_start();
|
|
349
|
+
if (compaction.iterator_b.tick()) compaction.io_start();
|
|
350
|
+
|
|
351
|
+
// Start writing blocks prepared by the merge iterator from a previous compact_tick().
|
|
352
|
+
compaction.io_write_start(.data);
|
|
353
|
+
compaction.io_write_start(.filter);
|
|
354
|
+
compaction.io_write_start(.index);
|
|
339
355
|
}
|
|
340
356
|
|
|
341
|
-
|
|
342
|
-
assert(compaction.status == .compacting);
|
|
343
|
-
assert(compaction.callback != null);
|
|
344
|
-
assert(compaction.io_pending >= 0);
|
|
345
|
-
assert(!compaction.merge_done);
|
|
357
|
+
const BlockWriteField = enum { data, filter, index };
|
|
346
358
|
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
if (compaction.ticks >= pipeline_tick_merge) {
|
|
354
|
-
if (compaction.merge_iterator.empty()) {
|
|
355
|
-
assert(!compaction.merge_done);
|
|
359
|
+
fn io_write_start(compaction: *Compaction, comptime field: BlockWriteField) void {
|
|
360
|
+
const write_callback = struct {
|
|
361
|
+
fn callback(write: *Grid.Write) void {
|
|
362
|
+
const block_write = @fieldParentPtr(BlockWrite, "write", write);
|
|
363
|
+
block_write.block = undefined;
|
|
356
364
|
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
compaction.merge_done = true;
|
|
360
|
-
} else {
|
|
361
|
-
compaction.tick_cpu_merge();
|
|
365
|
+
const _compaction = @fieldParentPtr(Compaction, @tagName(field), block_write);
|
|
366
|
+
_compaction.io_finish();
|
|
362
367
|
}
|
|
363
|
-
}
|
|
368
|
+
}.callback;
|
|
364
369
|
|
|
365
|
-
compaction
|
|
370
|
+
const block_write: *BlockWrite = &@field(compaction, @tagName(field));
|
|
371
|
+
if (block_write.writable) {
|
|
372
|
+
block_write.writable = false;
|
|
366
373
|
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
374
|
+
compaction.io_start();
|
|
375
|
+
compaction.grid.write_block(
|
|
376
|
+
write_callback,
|
|
377
|
+
&block_write.write,
|
|
378
|
+
block_write.block,
|
|
379
|
+
Table.block_address(block_write.block),
|
|
380
|
+
);
|
|
381
|
+
}
|
|
370
382
|
}
|
|
371
383
|
|
|
372
|
-
fn
|
|
373
|
-
assert(compaction.status == .
|
|
384
|
+
fn io_start(compaction: *Compaction) void {
|
|
385
|
+
assert(compaction.status == .processing);
|
|
374
386
|
assert(compaction.callback != null);
|
|
375
|
-
assert(compaction.
|
|
376
|
-
|
|
377
|
-
// Consume the callback and invoke it once it has finished updating state below.
|
|
378
|
-
const callback = compaction.callback.?;
|
|
379
|
-
compaction.callback = null;
|
|
380
|
-
defer callback(compaction);
|
|
381
|
-
|
|
382
|
-
// Once merge completes, the compaction is now officially over.
|
|
383
|
-
if (compaction.merge_done) {
|
|
384
|
-
compaction.status = .done;
|
|
387
|
+
assert(!compaction.merge_done);
|
|
385
388
|
|
|
386
|
-
|
|
387
|
-
// TODO Handle compaction.remove_level_a
|
|
388
|
-
compaction.update_manifest(&compaction.update_level_b);
|
|
389
|
-
compaction.update_manifest(&compaction.insert_level_b);
|
|
390
|
-
}
|
|
389
|
+
compaction.io_pending += 1;
|
|
391
390
|
}
|
|
392
391
|
|
|
393
|
-
|
|
394
|
-
assert(compaction.
|
|
395
|
-
assert(compaction.
|
|
396
|
-
|
|
397
|
-
assert(compaction.
|
|
398
|
-
compaction.status = .idle;
|
|
392
|
+
fn io_finish(compaction: *Compaction) void {
|
|
393
|
+
assert(compaction.status == .processing);
|
|
394
|
+
assert(compaction.callback != null);
|
|
395
|
+
assert(compaction.io_pending > 0);
|
|
396
|
+
assert(!compaction.merge_done);
|
|
399
397
|
|
|
400
|
-
|
|
401
|
-
|
|
398
|
+
compaction.io_pending -= 1;
|
|
399
|
+
if (compaction.io_pending == 0) compaction.cpu_merge_start();
|
|
402
400
|
}
|
|
403
401
|
|
|
404
|
-
fn
|
|
402
|
+
/// Entry point of the CPU phase, reached once `io_pending` has dropped to zero.
/// Creates the merge iterator on first use, runs either one `cpu_merge()` step or
/// `cpu_merge_finish()`, then clears the stored callback and invokes it.
fn cpu_merge_start(compaction: *Compaction) void {
    assert(compaction.status == .processing);
    assert(compaction.callback != null);
    assert(compaction.io_pending == 0);
    assert(!compaction.merge_done);

    // The merge iterator can only be created once the read iterators can be peek()ed,
    // which is the case after the IO for the first reads has completed.
    if (compaction.merge_iterator == null) {
        compaction.merge_iterator = MergeIterator.init(compaction, k, .ascending);
        assert(!compaction.merge_iterator.?.empty());
    }

    // None of the output blocks may still be marked writable at this point.
    assert(!compaction.data.writable);
    assert(!compaction.filter.writable);
    assert(!compaction.index.writable);

    // An exhausted merge iterator means the compaction can be completed;
    // otherwise there is another merge step to run.
    if (compaction.merge_iterator.?.empty()) {
        compaction.cpu_merge_finish();
    } else {
        compaction.cpu_merge();
    }

    // TODO Implement pacing here by deciding if we should do another compact_tick()
    // instead of invoking the callback, using compaction.range.table_count as the heuristic.

    // The stored callback is cleared before it is invoked.
    const on_done = compaction.callback.?;
    compaction.callback = null;
    on_done(compaction);
}
|
|
427
432
|
|
|
428
|
-
fn
|
|
433
|
+
/// Runs one CPU merge step.
/// Pops merged values into the table builder's data block, then finalizes the data,
/// filter and index blocks (in that order) when each is full, or when the merge iterator
/// is exhausted and the block is non-empty. Every finalized block is marked `writable`.
/// Finalizing an index block also inserts the resulting table into `level_b` of the
/// manifest and bumps `tables_output_count`.
fn cpu_merge(compaction: *Compaction) void {
    // This must be the result of a compact_tick() call that finished processing IO.
    assert(compaction.status == .processing);
    assert(compaction.callback != null);
    assert(compaction.io_pending == 0);
    assert(!compaction.merge_done);

    // There must be values left to merge, and it must be safe to merge them:
    // none of the output blocks may still be marked writable.
    const merge_iterator = &compaction.merge_iterator.?;
    assert(!merge_iterator.empty());
    assert(!compaction.data.writable);
    assert(!compaction.filter.writable);
    assert(!compaction.index.writable);

    // Fill the data block with values merged from the read iterators, stopping when the
    // block is full or the merge iterator has nothing more to pop.
    while (!compaction.table_builder.data_block_full()) {
        const merged = merge_iterator.pop() orelse break;
        // Tombstones are skipped when the compaction was started with drop_tombstones set.
        const dropped = compaction.drop_tombstones and tombstone(&merged);
        if (!dropped) compaction.table_builder.data_block_append(&merged);
    }

    // The data block is finalized when it is full, or when it holds pending values and
    // nothing is left to merge.
    const data_block_done = compaction.table_builder.data_block_full() or
        (merge_iterator.empty() and !compaction.table_builder.data_block_empty());
    if (data_block_done) {
        compaction.table_builder.data_block_finish(.{
            .cluster = compaction.grid.superblock.working.cluster,
            .address = compaction.grid.acquire(compaction.grid_reservation),
        });

        // Mark the finished data block as writable for the next compact_tick() call.
        compaction.data.block = compaction.table_builder.data_block;
        assert(!compaction.data.writable);
        compaction.data.writable = true;
    }

    // The filter block is finalized when it is full, or when it covers pending data blocks
    // and there are no more merged values to fill them.
    // This is evaluated only after the data block above has been handled.
    const filter_block_done = compaction.table_builder.filter_block_full() or
        (merge_iterator.empty() and !compaction.table_builder.filter_block_empty());
    if (filter_block_done) {
        compaction.table_builder.filter_block_finish(.{
            .cluster = compaction.grid.superblock.working.cluster,
            .address = compaction.grid.acquire(compaction.grid_reservation),
        });

        // Mark the finished filter block as writable for the next compact_tick() call.
        compaction.filter.block = compaction.table_builder.filter_block;
        assert(!compaction.filter.writable);
        compaction.filter.writable = true;
    }

    // The index block is finalized when it is full, or when it covers pending data blocks
    // and there are no more merged values to fill them.
    const index_block_done = compaction.table_builder.index_block_full() or
        (merge_iterator.empty() and !compaction.table_builder.index_block_empty());
    if (index_block_done) {
        const table = compaction.table_builder.index_block_finish(.{
            .cluster = compaction.grid.superblock.working.cluster,
            .address = compaction.grid.acquire(compaction.grid_reservation),
            .snapshot_min = snapshot_min_for_table_output(compaction.op_min),
            // TODO(Persistent Snapshots) set snapshot_max to the minimum snapshot_max of
            // all the (original) input tables.
        });
        compaction.manifest.insert_table(compaction.level_b, &table);

        // Mark the finished index block as writable for the next compact_tick() call.
        compaction.index.block = compaction.table_builder.index_block;
        assert(!compaction.index.writable);
        compaction.index.writable = true;

        // One more output table; this may never exceed the table count of the range.
        compaction.tables_output_count += 1;
        assert(compaction.tables_output_count <= compaction.range.table_count);
    }
}
|
|
539
510
|
|
|
540
|
-
fn
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
511
|
+
/// Completes the compaction once the merge iterator is exhausted.
/// Verifies that both read iterators are empty, updates the level A input table (if any)
/// in the manifest with its new `snapshot_max`, and moves the compaction to `.done`.
fn cpu_merge_finish(compaction: *Compaction) void {
    // This must be the result of a compact_tick() call that finished processing IO.
    assert(compaction.status == .processing);
    assert(compaction.callback != null);
    assert(compaction.io_pending == 0);
    assert(!compaction.merge_done);

    // Merging must truly be finished, with no output block still marked writable.
    assert(compaction.merge_iterator.?.empty());
    assert(!compaction.data.writable);
    assert(!compaction.filter.writable);
    assert(!compaction.index.writable);

    // Double check that the read iterators are finished as well.
    const stream_empty = struct {
        /// True only when peek() fails with error.Empty, in which case the iterator must
        /// also report buffered_all_values(). A successful peek() and error.Drained
        /// both yield false.
        fn empty(it: anytype) bool {
            if (it.peek()) |_| {
                return false;
            } else |err| switch (err) {
                error.Drained => return false,
                error.Empty => {
                    assert(it.buffered_all_values());
                    return true;
                },
            }
        }
    }.empty;
    assert(stream_empty(&compaction.iterator_a));
    assert(stream_empty(&compaction.iterator_b));

    // If a level_a table was provided, give it its final snapshot_max in the manifest:
    // it has been merged into level_b.
    // TODO: Release the grid blocks associated with level_a as well
    if (compaction.level_a_input) |*input_table| {
        const snapshot_max = snapshot_max_for_table_input(compaction.op_min);
        const level_a = compaction.level_b - 1;
        compaction.manifest.update_table(level_a, snapshot_max, input_table);
        assert(input_table.snapshot_max == snapshot_max);
    } else {
        // Without a level_a input table, the target level must be level 0.
        assert(compaction.level_b == 0);
    }

    // Finally, mark the Compaction as officially complete and ready to be reset().
    compaction.merge_iterator = null;
    compaction.merge_done = true;
    compaction.status = .done;
}
|
|
575
557
|
|
|
576
|
-
fn
|
|
577
|
-
assert(
|
|
558
|
+
/// Returns a completed compaction to the idle state.
/// Requires `status == .done` with no callback and no pending IO. Forfeits the grid
/// reservation, clears `merge_done` and sets `status` to `.idle`.
pub fn reset(compaction: *Compaction) void {
    assert(compaction.status == .done);
    assert(compaction.callback == null);
    assert(compaction.io_pending == 0);
    assert(compaction.merge_done);

    // TODO(Beat Pacing) This should really be where the compaction callback is invoked,
    // but currently that can occur multiple times per beat.
    compaction.grid.forfeit(compaction.grid_reservation);

    compaction.merge_done = false;
    compaction.status = .idle;
}
|
|
602
571
|
};
|
|
603
572
|
}
|
|
573
|
+
|
|
574
|
+
/// Returns the `snapshot_max` for a compaction input table:
/// `op_min` plus half of `config.lsm_batch_multiple`, minus one.
/// `op_min` must be a multiple of half of `config.lsm_batch_multiple`.
fn snapshot_max_for_table_input(op_min: u64) u64 {
    const half_batch = @divExact(config.lsm_batch_multiple, 2);
    assert(op_min % half_batch == 0);
    return op_min + half_batch - 1;
}
|
|
578
|
+
|
|
579
|
+
/// Returns the `snapshot_min` for a compaction output table:
/// `op_min` plus half of `config.lsm_batch_multiple`.
/// `op_min` must be a multiple of half of `config.lsm_batch_multiple`.
fn snapshot_min_for_table_output(op_min: u64) u64 {
    const half_batch = @divExact(config.lsm_batch_multiple, 2);
    assert(op_min % half_batch == 0);
    return op_min + half_batch;
}
|