@nxtedition/rocksdb 12.1.4 → 12.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +2 -2
- package/deps/rocksdb/rocksdb/Makefile +10 -5
- package/deps/rocksdb/rocksdb/TARGETS +9 -7
- package/deps/rocksdb/rocksdb/cache/cache.cc +15 -11
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +26 -0
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +16 -0
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +6 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +38 -8
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +4 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +11 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +6 -0
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +2 -1
- package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +56 -0
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +12 -9
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +10 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +9 -0
- package/deps/rocksdb/rocksdb/db/c.cc +9 -0
- package/deps/rocksdb/rocksdb/db/c_test.c +12 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +6 -23
- package/deps/rocksdb/rocksdb/db/column_family.h +1 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +4 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +4 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +14 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +19 -16
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +34 -30
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +16 -31
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +7 -50
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +95 -84
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +616 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +8 -2
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +93 -69
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +353 -89
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +4 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +116 -14
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +67 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +42 -14
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +50 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +79 -32
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +36 -59
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +72 -39
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +14 -12
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +75 -0
- package/deps/rocksdb/rocksdb/db/db_iter.cc +7 -3
- package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +24 -0
- package/deps/rocksdb/rocksdb/db/db_test2.cc +36 -22
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +23 -0
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/error_handler.cc +28 -3
- package/deps/rocksdb/rocksdb/db/error_handler.h +2 -1
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +165 -33
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +13 -5
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +37 -28
- package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -6
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -6
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +14 -6
- package/deps/rocksdb/rocksdb/db/job_context.h +4 -0
- package/deps/rocksdb/rocksdb/db/memtable.cc +24 -14
- package/deps/rocksdb/rocksdb/db/memtable.h +2 -1
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +61 -33
- package/deps/rocksdb/rocksdb/db/memtable_list.h +8 -0
- package/deps/rocksdb/rocksdb/db/repair.cc +4 -2
- package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
- package/deps/rocksdb/rocksdb/db/version_builder.cc +14 -11
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +20 -4
- package/deps/rocksdb/rocksdb/db/version_set.cc +40 -30
- package/deps/rocksdb/rocksdb/db/version_set.h +13 -3
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -76
- package/deps/rocksdb/rocksdb/db/write_batch.cc +6 -2
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +5 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +2 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +25 -2
- package/deps/rocksdb/rocksdb/env/fs_remap.cc +11 -0
- package/deps/rocksdb/rocksdb/env/fs_remap.h +5 -0
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +11 -1
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +20 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +10 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +4 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +30 -28
- package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +10 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +287 -83
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +68 -36
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +4 -4
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +31 -0
- package/deps/rocksdb/rocksdb/options/db_options.cc +14 -0
- package/deps/rocksdb/rocksdb/options/db_options.h +2 -0
- package/deps/rocksdb/rocksdb/options/options_helper.cc +15 -4
- package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
- package/deps/rocksdb/rocksdb/options/options_parser.cc +5 -4
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -1
- package/deps/rocksdb/rocksdb/options/options_test.cc +38 -45
- package/deps/rocksdb/rocksdb/port/port.h +16 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +8 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +10 -20
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +15 -9
- package/deps/rocksdb/rocksdb/table/format.cc +32 -4
- package/deps/rocksdb/rocksdb/table/format.h +12 -1
- package/deps/rocksdb/rocksdb/table/iterator.cc +4 -0
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +214 -161
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +4 -2
- package/deps/rocksdb/rocksdb/table/table_properties.cc +4 -0
- package/deps/rocksdb/rocksdb/table/table_reader.h +2 -2
- package/deps/rocksdb/rocksdb/table/table_test.cc +5 -4
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -0
- package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +11 -2
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +213 -22
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +3 -0
- package/deps/rocksdb/rocksdb/util/async_file_reader.h +1 -1
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +3 -0
- package/deps/rocksdb/rocksdb/util/coro_utils.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +3 -3
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
|
@@ -19,80 +19,82 @@ struct CompactionJobStats {
|
|
|
19
19
|
void Add(const CompactionJobStats& stats);
|
|
20
20
|
|
|
21
21
|
// the elapsed time of this compaction in microseconds.
|
|
22
|
-
uint64_t elapsed_micros;
|
|
22
|
+
uint64_t elapsed_micros = 0;
|
|
23
23
|
|
|
24
24
|
// the elapsed CPU time of this compaction in microseconds.
|
|
25
|
-
uint64_t cpu_micros;
|
|
25
|
+
uint64_t cpu_micros = 0;
|
|
26
26
|
|
|
27
27
|
// Used internally indicating whether a subcompaction's
|
|
28
28
|
// `num_input_records` is accurate.
|
|
29
|
-
bool has_num_input_records;
|
|
29
|
+
bool has_num_input_records = false;
|
|
30
30
|
// the number of compaction input records.
|
|
31
|
-
uint64_t num_input_records;
|
|
31
|
+
uint64_t num_input_records = 0;
|
|
32
32
|
// the number of blobs read from blob files
|
|
33
|
-
uint64_t num_blobs_read;
|
|
33
|
+
uint64_t num_blobs_read = 0;
|
|
34
34
|
// the number of compaction input files (table files)
|
|
35
|
-
size_t num_input_files;
|
|
35
|
+
size_t num_input_files = 0;
|
|
36
36
|
// the number of compaction input files at the output level (table files)
|
|
37
|
-
size_t num_input_files_at_output_level;
|
|
37
|
+
size_t num_input_files_at_output_level = 0;
|
|
38
38
|
|
|
39
39
|
// the number of compaction output records.
|
|
40
|
-
uint64_t num_output_records;
|
|
40
|
+
uint64_t num_output_records = 0;
|
|
41
41
|
// the number of compaction output files (table files)
|
|
42
|
-
size_t num_output_files;
|
|
42
|
+
size_t num_output_files = 0;
|
|
43
43
|
// the number of compaction output files (blob files)
|
|
44
|
-
size_t num_output_files_blob;
|
|
44
|
+
size_t num_output_files_blob = 0;
|
|
45
45
|
|
|
46
46
|
// true if the compaction is a full compaction (all live SST files input)
|
|
47
|
-
bool is_full_compaction;
|
|
47
|
+
bool is_full_compaction = false;
|
|
48
48
|
// true if the compaction is a manual compaction
|
|
49
|
-
bool is_manual_compaction;
|
|
49
|
+
bool is_manual_compaction = false;
|
|
50
|
+
// true if the compaction ran in a remote worker
|
|
51
|
+
bool is_remote_compaction = false;
|
|
50
52
|
|
|
51
53
|
// the total size of table files in the compaction input
|
|
52
|
-
uint64_t total_input_bytes;
|
|
54
|
+
uint64_t total_input_bytes = 0;
|
|
53
55
|
// the total size of blobs read from blob files
|
|
54
|
-
uint64_t total_blob_bytes_read;
|
|
56
|
+
uint64_t total_blob_bytes_read = 0;
|
|
55
57
|
// the total size of table files in the compaction output
|
|
56
|
-
uint64_t total_output_bytes;
|
|
58
|
+
uint64_t total_output_bytes = 0;
|
|
57
59
|
// the total size of blob files in the compaction output
|
|
58
|
-
uint64_t total_output_bytes_blob;
|
|
60
|
+
uint64_t total_output_bytes_blob = 0;
|
|
59
61
|
|
|
60
62
|
// number of records being replaced by newer record associated with same key.
|
|
61
63
|
// this could be a new value or a deletion entry for that key so this field
|
|
62
64
|
// sums up all updated and deleted keys
|
|
63
|
-
uint64_t num_records_replaced;
|
|
65
|
+
uint64_t num_records_replaced = 0;
|
|
64
66
|
|
|
65
67
|
// the sum of the uncompressed input keys in bytes.
|
|
66
|
-
uint64_t total_input_raw_key_bytes;
|
|
68
|
+
uint64_t total_input_raw_key_bytes = 0;
|
|
67
69
|
// the sum of the uncompressed input values in bytes.
|
|
68
|
-
uint64_t total_input_raw_value_bytes;
|
|
70
|
+
uint64_t total_input_raw_value_bytes = 0;
|
|
69
71
|
|
|
70
72
|
// the number of deletion entries before compaction. Deletion entries
|
|
71
73
|
// can disappear after compaction because they expired
|
|
72
|
-
uint64_t num_input_deletion_records;
|
|
74
|
+
uint64_t num_input_deletion_records = 0;
|
|
73
75
|
// number of deletion records that were found obsolete and discarded
|
|
74
76
|
// because it is not possible to delete any more keys with this entry
|
|
75
77
|
// (i.e. all possible deletions resulting from it have been completed)
|
|
76
|
-
uint64_t num_expired_deletion_records;
|
|
78
|
+
uint64_t num_expired_deletion_records = 0;
|
|
77
79
|
|
|
78
80
|
// number of corrupt keys (ParseInternalKey returned false when applied to
|
|
79
81
|
// the key) encountered and written out.
|
|
80
|
-
uint64_t num_corrupt_keys;
|
|
82
|
+
uint64_t num_corrupt_keys = 0;
|
|
81
83
|
|
|
82
84
|
// Following counters are only populated if
|
|
83
85
|
// options.report_bg_io_stats = true;
|
|
84
86
|
|
|
85
87
|
// Time spent on file's Append() call.
|
|
86
|
-
uint64_t file_write_nanos;
|
|
88
|
+
uint64_t file_write_nanos = 0;
|
|
87
89
|
|
|
88
90
|
// Time spent on sync file range.
|
|
89
|
-
uint64_t file_range_sync_nanos;
|
|
91
|
+
uint64_t file_range_sync_nanos = 0;
|
|
90
92
|
|
|
91
93
|
// Time spent on file fsync.
|
|
92
|
-
uint64_t file_fsync_nanos;
|
|
94
|
+
uint64_t file_fsync_nanos = 0;
|
|
93
95
|
|
|
94
96
|
// Time spent on preparing file write (fallocate, etc)
|
|
95
|
-
uint64_t file_prepare_write_nanos;
|
|
97
|
+
uint64_t file_prepare_write_nanos = 0;
|
|
96
98
|
|
|
97
99
|
// 0-terminated strings storing the first 8 bytes of the smallest and
|
|
98
100
|
// largest key in the output.
|
|
@@ -102,10 +104,10 @@ struct CompactionJobStats {
|
|
|
102
104
|
std::string largest_output_key_prefix;
|
|
103
105
|
|
|
104
106
|
// number of single-deletes which do not meet a put
|
|
105
|
-
uint64_t num_single_del_fallthru;
|
|
107
|
+
uint64_t num_single_del_fallthru = 0;
|
|
106
108
|
|
|
107
109
|
// number of single-deletes which meet something other than a put
|
|
108
|
-
uint64_t num_single_del_mismatch;
|
|
110
|
+
uint64_t num_single_del_mismatch = 0;
|
|
109
111
|
|
|
110
112
|
// TODO: Add output_to_penultimate_level output information
|
|
111
113
|
};
|
|
@@ -179,13 +179,18 @@ class Comparator : public Customizable, public CompareInterface {
|
|
|
179
179
|
size_t timestamp_size_;
|
|
180
180
|
};
|
|
181
181
|
|
|
182
|
-
// Return a builtin comparator that uses lexicographic
|
|
183
|
-
//
|
|
184
|
-
//
|
|
182
|
+
// Return a builtin comparator that uses lexicographic ordering
|
|
183
|
+
// on unsigned bytes, so the empty string is ordered before everything
|
|
184
|
+
// else and a sufficiently long string of \xFF orders after anything.
|
|
185
|
+
// CanKeysWithDifferentByteContentsBeEqual() == false
|
|
186
|
+
// Returns an immortal pointer that must not be deleted by the caller.
|
|
185
187
|
const Comparator* BytewiseComparator();
|
|
186
188
|
|
|
187
|
-
// Return a builtin comparator that
|
|
188
|
-
//
|
|
189
|
+
// Return a builtin comparator that is the reverse ordering of
|
|
190
|
+
// BytewiseComparator(), so the empty string is ordered after everything
|
|
191
|
+
// else and a sufficiently long string of \xFF orders before anything.
|
|
192
|
+
// CanKeysWithDifferentByteContentsBeEqual() == false
|
|
193
|
+
// Returns an immortal pointer that must not be deleted by the caller.
|
|
189
194
|
const Comparator* ReverseBytewiseComparator();
|
|
190
195
|
|
|
191
196
|
// Returns a builtin comparator that enables user-defined timestamps (formatted
|
|
@@ -56,7 +56,9 @@ struct ConfigOptions {
|
|
|
56
56
|
// setting
|
|
57
57
|
};
|
|
58
58
|
|
|
59
|
-
// When true, any unused options will be ignored and OK will be returned
|
|
59
|
+
// When true, any unused options will be ignored and OK will be returned.
|
|
60
|
+
// For options files that appear to be from the current version or earlier,
|
|
61
|
+
// unknown options are considered corruption regardless of this setting.
|
|
60
62
|
bool ignore_unknown_options = false;
|
|
61
63
|
|
|
62
64
|
// When true, any unsupported options will be ignored and OK will be returned
|
|
@@ -61,83 +61,89 @@ Status UpdateManifestForFilesState(
|
|
|
61
61
|
// EXPERIMENTAL new filtering features
|
|
62
62
|
// ****************************************************************************
|
|
63
63
|
|
|
64
|
-
// A class for splitting a key into meaningful pieces, or
|
|
65
|
-
// filtering purposes.
|
|
66
|
-
//
|
|
67
|
-
// requirements, the segments must encompass a complete key prefix (or
|
|
68
|
-
// key)
|
|
69
|
-
//
|
|
70
|
-
//
|
|
71
|
-
//
|
|
64
|
+
// KeySegmentsExtractor - A class for splitting a key into meaningful pieces, or
|
|
65
|
+
// "segments" for filtering purposes. We say the first key segment has segment
|
|
66
|
+
// ordinal 0, the second has segment ordinal 1, etc. To simplify satisfying some
|
|
67
|
+
// filtering requirements, the segments must encompass a complete key prefix (or
|
|
68
|
+
// the whole key). There cannot be gaps between segments (though segments are
|
|
69
|
+
// allowed to be essentially unused), and segments cannot overlap.
|
|
70
|
+
//
|
|
71
|
+
// Keys can also be put in "categories" to simplify some configuration and
|
|
72
|
+
// handling. A "legal" key or bound is one that does not return an error (as a
|
|
73
|
+
// special, unused category) from the extractor. It is also allowed for all
|
|
74
|
+
// keys in a category to return an empty sequence of segments.
|
|
75
|
+
//
|
|
76
|
+
// To eliminate a confusing distinction between a segment that is empty vs.
|
|
77
|
+
// "not present" for a particular key, each key is logically associated with
|
|
78
|
+
// an infinite sequence of segments, including some infinite tail of 0-length
|
|
79
|
+
// segments. In practice, we only represent a finite sequence that (at least)
|
|
80
|
+
// covers the non-trivial segments.
|
|
81
|
+
//
|
|
82
|
+
// Once in production, the behavior associated with a particular GetId()
|
|
83
|
+
// cannot change. Introduce a new GetId() when introducing new behaviors.
|
|
72
84
|
// See also SstQueryFilterConfigsManager below.
|
|
73
85
|
//
|
|
74
|
-
//
|
|
75
|
-
//
|
|
76
|
-
//
|
|
77
|
-
//
|
|
78
|
-
//
|
|
79
|
-
//
|
|
80
|
-
// *
|
|
81
|
-
//
|
|
82
|
-
//
|
|
83
|
-
//
|
|
84
|
-
//
|
|
85
|
-
// SEGMENT
|
|
86
|
-
//
|
|
87
|
-
//
|
|
88
|
-
//
|
|
89
|
-
//
|
|
90
|
-
//
|
|
91
|
-
//
|
|
92
|
-
//
|
|
93
|
-
//
|
|
94
|
-
//
|
|
95
|
-
// the
|
|
96
|
-
//
|
|
97
|
-
//
|
|
98
|
-
//
|
|
99
|
-
//
|
|
100
|
-
//
|
|
101
|
-
//
|
|
102
|
-
//
|
|
103
|
-
//
|
|
104
|
-
//
|
|
105
|
-
//
|
|
106
|
-
//
|
|
107
|
-
//
|
|
108
|
-
//
|
|
109
|
-
//
|
|
110
|
-
//
|
|
111
|
-
//
|
|
112
|
-
// (
|
|
113
|
-
//
|
|
114
|
-
//
|
|
115
|
-
//
|
|
116
|
-
//
|
|
117
|
-
//
|
|
118
|
-
//
|
|
119
|
-
// processing each byte, the extractor decides whether to cut a segment that
|
|
120
|
-
// ends with that byte, but not one that ends before that byte. The only
|
|
121
|
-
// exception is that upon reaching the end of the key, the extractor can choose
|
|
122
|
-
// whether to make a segment that ends at the end of the key.
|
|
86
|
+
// This feature hasn't yet been validated with user timestamp.
|
|
87
|
+
//
|
|
88
|
+
// = A SIMPLIFIED MODEL =
|
|
89
|
+
// Let us start with the easiest set of constraints to satisfy with a key
|
|
90
|
+
// segments extractor that generally allows for correct point and range
|
|
91
|
+
// filtering, and add complexity from there. Here we first assume
|
|
92
|
+
// * The column family is using the byte-wise comparator, or reverse byte-wise
|
|
93
|
+
// * A single category is assigned to all keys (by the extractor)
|
|
94
|
+
// * Using simplified criteria for legal segment extraction, the "segment
|
|
95
|
+
// maximal prefix property"
|
|
96
|
+
//
|
|
97
|
+
// SEGMENT MAXIMAL PREFIX PROPERTY: The segment that a byte is assigned to can
|
|
98
|
+
// only depend on the bytes that come before it, not on the byte itself nor
|
|
99
|
+
// anything later including the full length of the key or bound.
|
|
100
|
+
//
|
|
101
|
+
// Equivalently, two keys or bounds must agree on the segment assignment of
|
|
102
|
+
// position i if the two keys share a common byte-wise prefix up to at least
|
|
103
|
+
// position i - 1 (and i is within bounds of both keys).
|
|
104
|
+
//
|
|
105
|
+
// This specifically excludes "all or nothing" segments where it is only
|
|
106
|
+
// included if it reaches a particular width or delimiter. A segment resembling
|
|
107
|
+
// the FixedPrefixTransform would be illegal (without other assumptions); it
|
|
108
|
+
// must be like CappedPrefixTransform.
|
|
109
|
+
//
|
|
110
|
+
// This basically matches the notion of parsing prefix codes (see
|
|
111
|
+
// https://en.wikipedia.org/wiki/Prefix_code) except we have to include any
|
|
112
|
+
// partial segment (code word) at the end whenever an extension to that key
|
|
113
|
+
// might produce a full segment. An example would be parsing UTF-8 into
|
|
114
|
+
// segments corresponding to encoded code points, where any incomplete code
|
|
115
|
+
// at the end must be part of a trailing segment. Note a three-way
|
|
116
|
+
// correspondence between
|
|
117
|
+
// (a) byte-wise ordering of encoded code points, e.g.
|
|
118
|
+
// { D0 98 E2 82 AC }
|
|
119
|
+
// { E2 82 AC D0 98 }
|
|
120
|
+
// (b) lexicographic-then-byte-wise ordering of segments that are each an
|
|
121
|
+
// encoded code point, e.g.
|
|
122
|
+
// {{ D0 98 } { E2 82 AC }}
|
|
123
|
+
// {{ E2 82 AC } { D0 98 }}
|
|
124
|
+
// and (c) lexicographic ordering of the decoded code points, e.g.
|
|
125
|
+
// { U+0418 U+20AC }
|
|
126
|
+
// { U+20AC U+0418 }
|
|
127
|
+
// The correspondence between (a) and (b) is a result of the segment maximal
|
|
128
|
+
// prefix property and is critical for correct application of filters to
|
|
129
|
+
// range queries. The correspondence with (c) is a handy attribute of UTF-8
|
|
130
|
+
// (with no over-long encodings) and might be useful to the application.
|
|
123
131
|
//
|
|
124
132
|
// Example types of key segments that can be freely mixed in any order:
|
|
125
|
-
// *
|
|
126
|
-
//
|
|
127
|
-
//
|
|
128
|
-
// prefix property.)
|
|
129
|
-
// * Length-encoded sequence of bytes or codewords. The length could even
|
|
130
|
-
// come from a preceding segment.
|
|
133
|
+
// * Capped number of bytes or codewords. The number cap for the segment
|
|
134
|
+
// could be the same for all keys or encoded earlier in the key.
|
|
135
|
+
// * Up to *and including* a delimiter byte or codeword.
|
|
131
136
|
// * Any/all remaining bytes to the end of the key, though this implies all
|
|
132
137
|
// subsequent segments will be empty.
|
|
133
|
-
//
|
|
134
|
-
//
|
|
135
|
-
//
|
|
138
|
+
// As part of the segment maximal prefix property, if the segments do not
|
|
139
|
+
// extend to the end of the key, that must be implied by the bytes that are
|
|
140
|
+
// in segments, NOT because the potential contents of a segment were considered
|
|
141
|
+
// incomplete.
|
|
136
142
|
//
|
|
137
143
|
// For example, keys might consist of
|
|
138
144
|
// * Segment 0: Any sequence of bytes up to and including the first ':'
|
|
139
145
|
// character, or the whole key if no ':' is present.
|
|
140
|
-
// * Segment 1: The next four bytes,
|
|
146
|
+
// * Segment 1: The next four bytes, or less if we reach end of key.
|
|
141
147
|
// * Segment 2: An unsigned byte indicating the number of additional bytes in
|
|
142
148
|
// the segment, and then that many bytes (or less up to the end of the key).
|
|
143
149
|
// * Segment 3: Any/all remaining bytes in the key
|
|
@@ -145,22 +151,208 @@ Status UpdateManifestForFilesState(
|
|
|
145
151
|
// For an example of what can go wrong, consider using '4' as a delimiter
|
|
146
152
|
// but not including it with the segment leading up to it. Suppose we have
|
|
147
153
|
// these keys and corresponding first segments:
|
|
148
|
-
// "123456" -> "123"
|
|
149
|
-
// "124536" -> "12"
|
|
150
|
-
// "125436" -> "125"
|
|
154
|
+
// "123456" -> "123" (in file 1)
|
|
155
|
+
// "124536" -> "12" (in file 2)
|
|
156
|
+
// "125436" -> "125" (in file 1)
|
|
151
157
|
// Notice how byte-wise comparator ordering of the segments does not follow
|
|
152
158
|
// the ordering of the keys. This means we cannot safely use a filter with
|
|
153
|
-
// a range of segment values for filtering key range queries.
|
|
159
|
+
// a range of segment values for filtering key range queries. For example,
|
|
160
|
+
// we might get a range query for ["123", "125Z") and miss that key "124536"
|
|
161
|
+
// in file 2 is in range because its first segment "12" is out of the range
|
|
162
|
+
// of the first segments on the bounds, "123" and "125". We cannot even safely
|
|
163
|
+
// use this for prefix-like range querying with a Bloom filter on the segments.
|
|
164
|
+
// For a query ["12", "124Z"), segment "12" would likely not match the Bloom
|
|
165
|
+
// filter in file 1 and miss "123456".
|
|
154
166
|
//
|
|
155
|
-
//
|
|
156
|
-
//
|
|
167
|
+
// CATEGORIES: The KeySegmentsExtractor is allowed to place keys in categories
|
|
168
|
+
// so that different parts of the key space can use different filtering
|
|
169
|
+
// strategies. The following property is generally recommended for safe filter
|
|
170
|
+
// applicability
|
|
171
|
+
// * CATEGORY CONTIGUOUSNESS PROPERTY: each category is contiguous in
|
|
172
|
+
// comparator order. In other words, any key between two keys of category c
|
|
173
|
+
// must also be in category c.
|
|
174
|
+
// An alternative to categories when distinct kinds of keys are interspersed
|
|
175
|
+
// is to leave some segments empty when they do not apply to that key.
|
|
176
|
+
// Filters are generally set up to handle an empty segment specially so that
|
|
177
|
+
// it doesn't interfere with tracking accurate ranges on non-empty occurrences
|
|
178
|
+
// of the segment.
|
|
157
179
|
//
|
|
158
|
-
//
|
|
159
|
-
//
|
|
160
|
-
//
|
|
161
|
-
//
|
|
162
|
-
//
|
|
180
|
+
// = BEYOND THE SIMPLIFIED MODEL =
|
|
181
|
+
//
|
|
182
|
+
// DETAILED GENERAL REQUIREMENTS (incl OTHER COMPARATORS): The exact
|
|
183
|
+
// requirements on a key segments extractor depend on whether and how we use
|
|
184
|
+
// filters to answer queries that they cannot answer directly. To understand
|
|
185
|
+
// this, we describe
|
|
186
|
+
// (A) the types of filters in terms of data they represent and can directly
|
|
187
|
+
// answer queries about,
|
|
188
|
+
// (B) the types of read queries that we want to use filters for, and
|
|
189
|
+
// (C) the assumptions that need to be satisfied to connect those two.
|
|
190
|
+
//
|
|
191
|
+
// TYPES OF FILTERS: Although not exhaustive, here are some useful categories
|
|
192
|
+
// of filter data:
|
|
193
|
+
// * Equivalence class filtering - Represents or over-approximates a set of
|
|
194
|
+
// equivalence classes on keys. The size of the representation is roughly
|
|
195
|
+
// proportional to the number of equivalence classes added. Bloom and ribbon
|
|
196
|
+
// filters are examples.
|
|
197
|
+
// * Order-based filtering - Represents one or more subranges of a key space or
|
|
198
|
+
// key segment space. A filter query only requires application of the CF
|
|
199
|
+
// comparator. The size of the representation is roughly proportional to the
|
|
200
|
+
// number of subranges and to the key or segment size. For example, we call a
|
|
201
|
+
// simple filter representing a minimum and a maximum value for a segment a
|
|
202
|
+
// min-max filter.
|
|
163
203
|
//
|
|
204
|
+
// TYPES OF READ QUERIES and their DIRECT FILTERS:
|
|
205
|
+
// * Point query - Whether there {definitely isn't, might be} an entry for a
|
|
206
|
+
// particular key in an SST file (or partition, etc.).
|
|
207
|
+
// The DIRECT FILTER for a point query is an equivalence class filter on the
|
|
208
|
+
// whole key.
|
|
209
|
+
// * Range query - Whether there {definitely isn't, might be} any entries
|
|
210
|
+
// within a lower and upper key bound, in an SST file (or partition, etc.).
|
|
211
|
+
// NOTE: For this discussion, we ignore the detail of inclusive vs.
|
|
212
|
+
// exclusive bounds by assuming a generalized notion of "bound" (vs. key)
|
|
213
|
+
// that conveniently represents spaces between keys. For details, see
|
|
214
|
+
// https://github.com/facebook/rocksdb/pull/11434
|
|
215
|
+
// The DIRECT FILTER for a range query is an order-based filter on the whole
|
|
216
|
+
// key (non-empty intersection of bounds/keys). Simple minimum and maximum
|
|
217
|
+
// keys for each SST file are automatically provided by metadata and used in
|
|
218
|
+
// the read path for filtering (as well as binary search indexing).
|
|
219
|
+
// PARTITIONING NOTE: SST metadata partitions do not have recorded minimum
|
|
220
|
+
// and maximum keys, so require some special handling for range query
|
|
221
|
+
// filtering. See https://github.com/facebook/rocksdb/pull/12872 etc.
|
|
222
|
+
// * Where clauses - Additional constraints that can be put on range queries.
|
|
223
|
+
// Specifically, a where clause is a tuple <i,j,c,b1,b2> representing that the
|
|
224
|
+
// concatenated sequence of segments from i to j (inclusive) compares between
|
|
225
|
+
// b1 and b2 according to comparator c.
|
|
226
|
+
// EXAMPLE: To represent that segment of ordinal i is equal to s, that would
|
|
227
|
+
// be <i,i,bytewise_comparator,before(s),after(s)>.
|
|
228
|
+
// NOTE: To represent something like segment has a particular prefix, you
|
|
229
|
+
// would need to split the key into more segments appropriately. There is
|
|
230
|
+
// little loss of generality because we can combine adjacent segments for
|
|
231
|
+
// specifying where clauses and implementing filters.
|
|
232
|
+
// The DIRECT FILTER for a where clause is an order-based filter on the same
|
|
233
|
+
// sequence of segments and comparator (non-empty intersection of bounds/keys),
|
|
234
|
+
// or in the special case of an equality clause (see example), an equivalence
|
|
235
|
+
// class filter on the sequence of segments.
|
|
236
|
+
//
|
|
237
|
+
// GENERALIZING FILTERS (INDIRECT):
|
|
238
|
+
// * Point queries can utilize essentially any kind of filter by extracting
|
|
239
|
+
// applicable segments of the query key (if not using whole key) and querying
|
|
240
|
+
// the corresponding equivalence class or trivial range.
|
|
241
|
+
// NOTE: There is NO requirement e.g. that the comparator used by the filter
|
|
242
|
+
// match the CF key comparator or similar. The extractor simply needs to be
|
|
243
|
+
// a pure function that does not return "out of bounds" segments.
|
|
244
|
+
// FOR EXAMPLE, a min-max filter on the 4th segment of keys can also be
|
|
245
|
+
// used for filtering point queries (Get/MultiGet) and could be as
|
|
246
|
+
// effective and much more space efficient than a Bloom filter, depending
|
|
247
|
+
// on the workload.
|
|
248
|
+
//
|
|
249
|
+
// Beyond point queries, we generally expect the key comparator to be a
|
|
250
|
+
// lexicographic / big endian ordering at a high level (or the reverse of that
|
|
251
|
+
// ordering), while each segment can use an arbitrary comparator.
|
|
252
|
+
// FOR EXAMPLE, with a custom key comparator and segments extractor,
|
|
253
|
+
// segment 0 could be a 4-byte unsigned little-endian integer,
|
|
254
|
+
// segment 1 could be an 8-byte signed big-endian integer. This framework
|
|
255
|
+
// requires segment 0 to come before segment 1 in the key and to take
|
|
256
|
+
// precedence in key ordering (i.e. segment 1 order is only consulted when
|
|
257
|
+
// keys are equal in segment 0).
|
|
258
|
+
//
|
|
259
|
+
// * Equivalence class filters can apply to range queries under conditions
|
|
260
|
+
// resembling legacy prefix filtering (prefix_extractor). An equivalence class
|
|
261
|
+
// filter on segments i through j and category set s is applicable to a range
|
|
262
|
+
// query from lb to ub if
|
|
263
|
+
// * All segments through j extracted from lb and ub are equal.
|
|
264
|
+
// NOTE: being in the same filtering equivalence class is insufficient, as
|
|
265
|
+
// that could be unrelated inputs with a hash collision. Here we are
|
|
266
|
+
// omitting details that would formally accommodate comparators in which
|
|
267
|
+
// different bytes can be considered equal.
|
|
268
|
+
// * The categories of lb and ub are in the category set s.
|
|
269
|
+
// * COMMON SEGMENT PREFIX PROPERTY (for all x, y, z; params j, s): if
|
|
270
|
+
// * Keys x and z have equal segments up through ordinal j, and
|
|
271
|
+
// * Keys x and z are in categories in category set s, and
|
|
272
|
+
// * Key y is ordered x < y < z according to the CF comparator,
|
|
273
|
+
// then both
|
|
274
|
+
// * Key y has equal segments up through ordinal j (compared to x and z)
|
|
275
|
+
// * Key y is in a category in category set s
|
|
276
|
+
// (This is implied by the SEGMENT MAXIMAL PREFIX PROPERTY in the simplified
|
|
277
|
+
// model.)
|
|
278
|
+
//
|
|
279
|
+
// * Order-based filters on segments (rather than whole key) can apply to range
|
|
280
|
+
// queries (with "whole key" bounds). Specifically, an order-based filter on
|
|
281
|
+
// segments i through j and category set s is applicable to a range query from
|
|
282
|
+
// lb to ub if
|
|
283
|
+
// * All segments through i-1 extracted from lb and ub are equal
|
|
284
|
+
// * The categories of lb and ub are in the category set s.
|
|
285
|
+
// * SEGMENT ORDERING PROPERTY for ordinal i through j, segments
|
|
286
|
+
// comparator c, category set s, for all x, y, and z: if
|
|
287
|
+
// * Keys x and z have equal segments up through ordinal i-1, and
|
|
288
|
+
// * Keys x and z are in categories in category set s, and
|
|
289
|
+
// * Key y is ordered x < y < z according to the CF comparator,
|
|
290
|
+
// then both
|
|
291
|
+
// * The common segment prefix property is satisfied through ordinal i-1
|
|
292
|
+
// and with category set s
|
|
293
|
+
// * x_i..j <= y_i..j <= z_i..j according to segment comparator c, where
|
|
294
|
+
// x_i..j is the concatenation of segments i through j of key x (etc.).
|
|
295
|
+
// (This is implied by the SEGMENT MAXIMAL PREFIX PROPERTY in the simplified
|
|
296
|
+
// model.)
|
|
297
|
+
//
|
|
298
|
+
// INTERESTING EXAMPLES:
|
|
299
|
+
// Consider a segment encoding called BadVarInt1 in which a byte with
|
|
300
|
+
// highest-order bit 1 means "start a new segment". Also consider BadVarInt0
|
|
301
|
+
// which starts a new segment on highest-order bit 0.
|
|
302
|
+
//
|
|
303
|
+
// Configuration: bytewise comp, BadVarInt1 format for segments 0-3 with
|
|
304
|
+
// segment 3 also continuing to the end of the key
|
|
305
|
+
// x = 0x 20 21|82 23|||
|
|
306
|
+
// y = 0x 20 21|82 23 24|85||
|
|
307
|
+
// z = 0x 20 21|82 23|84 25||
|
|
308
|
+
//
|
|
309
|
+
// For i=j=1, this set of keys violates the common segment prefix property and
|
|
310
|
+
// segment ordering property, so can lead to incorrect equivalence class
|
|
311
|
+
// filtering or order-based filtering.
|
|
312
|
+
//
|
|
313
|
+
// Suppose we modify the configuration so that "short" keys (empty in segment
|
|
314
|
+
// 2) are placed in an unfiltered category. In that case, x above doesn't meet
|
|
315
|
+
// the precondition for being limited by segment properties. Consider these
|
|
316
|
+
// keys instead:
|
|
317
|
+
// x = 0x 20 21|82 23 24|85||
|
|
318
|
+
// y = 0x 20 21|82 23 24|85 26|87|
|
|
319
|
+
// z = 0x 20 21|82 23 24|85|86|
|
|
320
|
+
// m = 0x 20 21|82 23 25|85|86|
|
|
321
|
+
// n = 0x 20 21|82 23|84 25||
|
|
322
|
+
//
|
|
323
|
+
// Although segment 1 values might be out of order with key order,
|
|
324
|
+
// re-categorizing the short keys has allowed satisfying the common segment
|
|
325
|
+
// prefix property with j=1 (and with j=0), so we can use equivalence class
|
|
326
|
+
// filters on segment 1, or 0, or 0 to 1. However, violation of the segment
|
|
327
|
+
// ordering property on i=j=1 (see z, m, n) means we can't use order-based.
|
|
328
|
+
//
|
|
329
|
+
// p = 0x 20 21|82 23|84 25 26||
|
|
330
|
+
// q = 0x 20 21|82 23|84 25|86|
|
|
331
|
+
//
|
|
332
|
+
// But keys can still be short from segment 2 to 3, and thus we are violating
|
|
333
|
+
// the common segment prefix property for segment 2 (see n, p, q).
|
|
334
|
+
//
|
|
335
|
+
// Configuration: bytewise comp, BadVarInt0 format for segments 0-3 with
|
|
336
|
+
// segment 3 also continuing to the end of the key. No short key category.
|
|
337
|
+
// x = 0x 80 81|22 83|||
|
|
338
|
+
// y = 0x 80 81|22 83|24 85||
|
|
339
|
+
// z = 0x 80 81|22 83 84|25||
|
|
340
|
+
// m = 0x 80 82|22 83|||
|
|
341
|
+
// n = 0x 80 83|22 84|24 85||
|
|
342
|
+
//
|
|
343
|
+
// Even though this violates the segment maximal prefix property of the
|
|
344
|
+
// simplified model, the common segment prefix property and segment ordering
|
|
345
|
+
// property are satisfied for the various segment ordinals. In broader terms,
|
|
346
|
+
// the usual rule of the delimiter going with the segment before it can be
|
|
347
|
+
// violated if every byte value below some threshold starts a segment. (This
|
|
348
|
+
// has not been formally verified and is not recommended.)
|
|
349
|
+
//
|
|
350
|
+
// Suppose that we are paranoid, however, and decide to place short keys
|
|
351
|
+
// (empty in segment 2) into an unfiltered category. This is potentially a
|
|
352
|
+
// dangerous decision because loss of continuity at least affects the
|
|
353
|
+
// ability to filter on segment 0 (common segment prefix property violated
|
|
354
|
+
// with i=j=0; see z, m, n; m not in category set). Thus, excluding short keys
|
|
355
|
+
// with categories is not a recommended solution either.
|
|
164
356
|
class KeySegmentsExtractor {
|
|
165
357
|
public:
|
|
166
358
|
// The extractor assigns keys to categories so that it is easier to
|
|
@@ -269,6 +461,14 @@ class KeySegmentsExtractor {
|
|
|
269
461
|
Result* result) const = 0;
|
|
270
462
|
};
|
|
271
463
|
|
|
464
|
+
// Constructs a KeySegmentsExtractor for fixed-width key segments that safely
|
|
465
|
+
// handles short keys by truncating segments at the end of the input key.
|
|
466
|
+
// See comments on KeySegmentsExtractor for why this is much safer for
|
|
467
|
+
// filtering than "all or nothing" fixed-size segments. This is essentially
|
|
468
|
+
// a generalization of (New)CappedPrefixTransform.
|
|
469
|
+
std::shared_ptr<const KeySegmentsExtractor>
|
|
470
|
+
MakeSharedCappedKeySegmentsExtractor(const std::vector<size_t>& byte_widths);
|
|
471
|
+
|
|
272
472
|
// Alternatives for filtering inputs
|
|
273
473
|
|
|
274
474
|
// An individual key segment.
|
|
@@ -305,13 +505,13 @@ struct SelectUserTimestamp {};
|
|
|
305
505
|
|
|
306
506
|
struct SelectColumnName {};
|
|
307
507
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
//
|
|
508
|
+
// NOTE: more variants might be added in the future.
|
|
509
|
+
// NOTE2: filtering on values is not supported because it could easily break
|
|
510
|
+
// overwrite semantics. (Filter out SST with newer, non-matching value but
|
|
511
|
+
// see obsolete value that does match.)
|
|
311
512
|
using FilterInput =
|
|
312
513
|
std::variant<SelectWholeKey, SelectKeySegment, SelectKeySegmentRange,
|
|
313
|
-
SelectLegacyKeyPrefix, SelectUserTimestamp, SelectColumnName
|
|
314
|
-
SelectValue>;
|
|
514
|
+
SelectLegacyKeyPrefix, SelectUserTimestamp, SelectColumnName>;
|
|
315
515
|
|
|
316
516
|
// Base class for individual filtering schemes in terms of chosen
|
|
317
517
|
// FilterInputs, but not tied to a particular KeySegmentsExtractor.
|
|
@@ -336,6 +536,10 @@ std::shared_ptr<SstQueryFilterConfig> MakeSharedBytewiseMinMaxSQFC(
|
|
|
336
536
|
FilterInput select, KeySegmentsExtractor::KeyCategorySet categories =
|
|
337
537
|
KeySegmentsExtractor::KeyCategorySet::All());
|
|
338
538
|
|
|
539
|
+
std::shared_ptr<SstQueryFilterConfig> MakeSharedReverseBytewiseMinMaxSQFC(
|
|
540
|
+
FilterInput select, KeySegmentsExtractor::KeyCategorySet categories =
|
|
541
|
+
KeySegmentsExtractor::KeyCategorySet::All());
|
|
542
|
+
|
|
339
543
|
// TODO: more kinds of filters, eventually including Bloom/ribbon filters
|
|
340
544
|
// and replacing the old filter configuration APIs
|
|
341
545
|
|