@nxtedition/rocksdb 12.1.4 → 12.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. package/binding.cc +2 -2
  2. package/deps/rocksdb/rocksdb/Makefile +10 -5
  3. package/deps/rocksdb/rocksdb/TARGETS +9 -7
  4. package/deps/rocksdb/rocksdb/cache/cache.cc +15 -11
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +26 -0
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +16 -0
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +6 -0
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +38 -8
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +4 -0
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +11 -0
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.h +6 -0
  13. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +2 -1
  14. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +56 -0
  15. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +12 -9
  16. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +10 -0
  17. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +9 -0
  18. package/deps/rocksdb/rocksdb/db/c.cc +9 -0
  19. package/deps/rocksdb/rocksdb/db/c_test.c +12 -1
  20. package/deps/rocksdb/rocksdb/db/column_family.cc +6 -23
  21. package/deps/rocksdb/rocksdb/db/column_family.h +1 -2
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +4 -5
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +4 -4
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +14 -6
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +19 -16
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +34 -30
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +2 -1
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +1 -1
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +16 -31
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +7 -50
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +95 -84
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +616 -5
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +1 -1
  36. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +1 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +1 -1
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +8 -2
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +93 -69
  40. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +353 -89
  41. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +4 -3
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +116 -14
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +67 -8
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +42 -14
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +50 -0
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +79 -32
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +36 -59
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +72 -39
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +14 -12
  51. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +75 -0
  52. package/deps/rocksdb/rocksdb/db/db_iter.cc +7 -3
  53. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +1 -1
  54. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +24 -0
  55. package/deps/rocksdb/rocksdb/db/db_test2.cc +36 -22
  56. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +23 -0
  57. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +2 -0
  58. package/deps/rocksdb/rocksdb/db/error_handler.cc +28 -3
  59. package/deps/rocksdb/rocksdb/db/error_handler.h +2 -1
  60. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  61. package/deps/rocksdb/rocksdb/db/experimental.cc +165 -33
  62. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +13 -5
  63. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +37 -28
  64. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -6
  65. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -6
  66. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +14 -6
  67. package/deps/rocksdb/rocksdb/db/job_context.h +4 -0
  68. package/deps/rocksdb/rocksdb/db/memtable.cc +24 -14
  69. package/deps/rocksdb/rocksdb/db/memtable.h +2 -1
  70. package/deps/rocksdb/rocksdb/db/memtable_list.cc +61 -33
  71. package/deps/rocksdb/rocksdb/db/memtable_list.h +8 -0
  72. package/deps/rocksdb/rocksdb/db/repair.cc +4 -2
  73. package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
  74. package/deps/rocksdb/rocksdb/db/version_builder.cc +14 -11
  75. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +20 -4
  76. package/deps/rocksdb/rocksdb/db/version_set.cc +40 -30
  77. package/deps/rocksdb/rocksdb/db/version_set.h +13 -3
  78. package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -76
  79. package/deps/rocksdb/rocksdb/db/write_batch.cc +6 -2
  80. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +1 -1
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +5 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +2 -1
  84. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +25 -2
  85. package/deps/rocksdb/rocksdb/env/fs_remap.cc +11 -0
  86. package/deps/rocksdb/rocksdb/env/fs_remap.h +5 -0
  87. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +11 -1
  88. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +3 -1
  89. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +20 -1
  90. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +10 -8
  91. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +4 -0
  92. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +30 -28
  93. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +10 -5
  94. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +3 -1
  95. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +287 -83
  96. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +68 -36
  97. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +8 -0
  98. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
  99. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  100. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +4 -4
  101. package/deps/rocksdb/rocksdb/options/customizable_test.cc +31 -0
  102. package/deps/rocksdb/rocksdb/options/db_options.cc +14 -0
  103. package/deps/rocksdb/rocksdb/options/db_options.h +2 -0
  104. package/deps/rocksdb/rocksdb/options/options_helper.cc +15 -4
  105. package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
  106. package/deps/rocksdb/rocksdb/options/options_parser.cc +5 -4
  107. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -1
  108. package/deps/rocksdb/rocksdb/options/options_test.cc +38 -45
  109. package/deps/rocksdb/rocksdb/port/port.h +16 -0
  110. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +8 -1
  111. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +10 -20
  112. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +15 -9
  113. package/deps/rocksdb/rocksdb/table/format.cc +32 -4
  114. package/deps/rocksdb/rocksdb/table/format.h +12 -1
  115. package/deps/rocksdb/rocksdb/table/iterator.cc +4 -0
  116. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +214 -161
  117. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +4 -2
  118. package/deps/rocksdb/rocksdb/table/table_properties.cc +4 -0
  119. package/deps/rocksdb/rocksdb/table/table_reader.h +2 -2
  120. package/deps/rocksdb/rocksdb/table/table_test.cc +5 -4
  121. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -0
  122. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -0
  123. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +11 -2
  124. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +213 -22
  125. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +3 -0
  126. package/deps/rocksdb/rocksdb/util/async_file_reader.h +1 -1
  127. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +3 -0
  128. package/deps/rocksdb/rocksdb/util/coro_utils.h +2 -2
  129. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +3 -3
  130. package/package.json +1 -1
  131. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  132. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -19,80 +19,82 @@ struct CompactionJobStats {
19
19
  void Add(const CompactionJobStats& stats);
20
20
 
21
21
  // the elapsed time of this compaction in microseconds.
22
- uint64_t elapsed_micros;
22
+ uint64_t elapsed_micros = 0;
23
23
 
24
24
  // the elapsed CPU time of this compaction in microseconds.
25
- uint64_t cpu_micros;
25
+ uint64_t cpu_micros = 0;
26
26
 
27
27
  // Used internally indicating whether a subcompaction's
28
28
  // `num_input_records` is accurate.
29
- bool has_num_input_records;
29
+ bool has_num_input_records = false;
30
30
  // the number of compaction input records.
31
- uint64_t num_input_records;
31
+ uint64_t num_input_records = 0;
32
32
  // the number of blobs read from blob files
33
- uint64_t num_blobs_read;
33
+ uint64_t num_blobs_read = 0;
34
34
  // the number of compaction input files (table files)
35
- size_t num_input_files;
35
+ size_t num_input_files = 0;
36
36
  // the number of compaction input files at the output level (table files)
37
- size_t num_input_files_at_output_level;
37
+ size_t num_input_files_at_output_level = 0;
38
38
 
39
39
  // the number of compaction output records.
40
- uint64_t num_output_records;
40
+ uint64_t num_output_records = 0;
41
41
  // the number of compaction output files (table files)
42
- size_t num_output_files;
42
+ size_t num_output_files = 0;
43
43
  // the number of compaction output files (blob files)
44
- size_t num_output_files_blob;
44
+ size_t num_output_files_blob = 0;
45
45
 
46
46
  // true if the compaction is a full compaction (all live SST files input)
47
- bool is_full_compaction;
47
+ bool is_full_compaction = false;
48
48
  // true if the compaction is a manual compaction
49
- bool is_manual_compaction;
49
+ bool is_manual_compaction = false;
50
+ // true if the compaction ran in a remote worker
51
+ bool is_remote_compaction = false;
50
52
 
51
53
  // the total size of table files in the compaction input
52
- uint64_t total_input_bytes;
54
+ uint64_t total_input_bytes = 0;
53
55
  // the total size of blobs read from blob files
54
- uint64_t total_blob_bytes_read;
56
+ uint64_t total_blob_bytes_read = 0;
55
57
  // the total size of table files in the compaction output
56
- uint64_t total_output_bytes;
58
+ uint64_t total_output_bytes = 0;
57
59
  // the total size of blob files in the compaction output
58
- uint64_t total_output_bytes_blob;
60
+ uint64_t total_output_bytes_blob = 0;
59
61
 
60
62
  // number of records being replaced by newer record associated with same key.
61
63
  // this could be a new value or a deletion entry for that key so this field
62
64
  // sums up all updated and deleted keys
63
- uint64_t num_records_replaced;
65
+ uint64_t num_records_replaced = 0;
64
66
 
65
67
  // the sum of the uncompressed input keys in bytes.
66
- uint64_t total_input_raw_key_bytes;
68
+ uint64_t total_input_raw_key_bytes = 0;
67
69
  // the sum of the uncompressed input values in bytes.
68
- uint64_t total_input_raw_value_bytes;
70
+ uint64_t total_input_raw_value_bytes = 0;
69
71
 
70
72
  // the number of deletion entries before compaction. Deletion entries
71
73
  // can disappear after compaction because they expired
72
- uint64_t num_input_deletion_records;
74
+ uint64_t num_input_deletion_records = 0;
73
75
  // number of deletion records that were found obsolete and discarded
74
76
  // because it is not possible to delete any more keys with this entry
75
77
  // (i.e. all possible deletions resulting from it have been completed)
76
- uint64_t num_expired_deletion_records;
78
+ uint64_t num_expired_deletion_records = 0;
77
79
 
78
80
  // number of corrupt keys (ParseInternalKey returned false when applied to
79
81
  // the key) encountered and written out.
80
- uint64_t num_corrupt_keys;
82
+ uint64_t num_corrupt_keys = 0;
81
83
 
82
84
  // Following counters are only populated if
83
85
  // options.report_bg_io_stats = true;
84
86
 
85
87
  // Time spent on file's Append() call.
86
- uint64_t file_write_nanos;
88
+ uint64_t file_write_nanos = 0;
87
89
 
88
90
  // Time spent on sync file range.
89
- uint64_t file_range_sync_nanos;
91
+ uint64_t file_range_sync_nanos = 0;
90
92
 
91
93
  // Time spent on file fsync.
92
- uint64_t file_fsync_nanos;
94
+ uint64_t file_fsync_nanos = 0;
93
95
 
94
96
  // Time spent on preparing file write (fallocate, etc)
95
- uint64_t file_prepare_write_nanos;
97
+ uint64_t file_prepare_write_nanos = 0;
96
98
 
97
99
  // 0-terminated strings storing the first 8 bytes of the smallest and
98
100
  // largest key in the output.
@@ -102,10 +104,10 @@ struct CompactionJobStats {
102
104
  std::string largest_output_key_prefix;
103
105
 
104
106
  // number of single-deletes which do not meet a put
105
- uint64_t num_single_del_fallthru;
107
+ uint64_t num_single_del_fallthru = 0;
106
108
 
107
109
  // number of single-deletes which meet something other than a put
108
- uint64_t num_single_del_mismatch;
110
+ uint64_t num_single_del_mismatch = 0;
109
111
 
110
112
  // TODO: Add output_to_penultimate_level output information
111
113
  };
@@ -179,13 +179,18 @@ class Comparator : public Customizable, public CompareInterface {
179
179
  size_t timestamp_size_;
180
180
  };
181
181
 
182
- // Return a builtin comparator that uses lexicographic byte-wise
183
- // ordering. The result remains the property of this module and
184
- // must not be deleted.
182
+ // Return a builtin comparator that uses lexicographic ordering
183
+ // on unsigned bytes, so the empty string is ordered before everything
184
+ // else and a sufficiently long string of \xFF orders after anything.
185
+ // CanKeysWithDifferentByteContentsBeEqual() == false
186
+ // Returns an immortal pointer that must not be deleted by the caller.
185
187
  const Comparator* BytewiseComparator();
186
188
 
187
- // Return a builtin comparator that uses reverse lexicographic byte-wise
188
- // ordering.
189
+ // Return a builtin comparator that is the reverse ordering of
190
+ // BytewiseComparator(), so the empty string is ordered after everything
191
+ // else and a sufficiently long string of \xFF orders before anything.
192
+ // CanKeysWithDifferentByteContentsBeEqual() == false
193
+ // Returns an immortal pointer that must not be deleted by the caller.
189
194
  const Comparator* ReverseBytewiseComparator();
190
195
 
191
196
  // Returns a builtin comparator that enables user-defined timestamps (formatted
@@ -56,7 +56,9 @@ struct ConfigOptions {
56
56
  // setting
57
57
  };
58
58
 
59
- // When true, any unused options will be ignored and OK will be returned
59
+ // When true, any unused options will be ignored and OK will be returned.
60
+ // For options files that appear to be from the current version or earlier,
61
+ // unknown options are considered corruption regardless of this setting.
60
62
  bool ignore_unknown_options = false;
61
63
 
62
64
  // When true, any unsupported options will be ignored and OK will be returned
@@ -61,83 +61,89 @@ Status UpdateManifestForFilesState(
61
61
  // EXPERIMENTAL new filtering features
62
62
  // ****************************************************************************
63
63
 
64
- // A class for splitting a key into meaningful pieces, or "segments" for
65
- // filtering purposes. Keys can also be put in "categories" to simplify
66
- // some configuration and handling. To simplify satisfying some filtering
67
- // requirements, the segments must encompass a complete key prefix (or the whole
68
- // key) and segments cannot overlap.
69
- //
70
- // Once in production, the behavior associated with a particular Name()
71
- // cannot change. Introduce a new Name() when introducing new behaviors.
64
+ // KeySegmentsExtractor - A class for splitting a key into meaningful pieces, or
65
+ // "segments" for filtering purposes. We say the first key segment has segment
66
+ // ordinal 0, the second has segment ordinal 1, etc. To simplify satisfying some
67
+ // filtering requirements, the segments must encompass a complete key prefix (or
68
+ // the whole key). There cannot be gaps between segments (though segments are
69
+ // allowed to be essentially unused), and segments cannot overlap.
70
+ //
71
+ // Keys can also be put in "categories" to simplify some configuration and
72
+ // handling. A "legal" key or bound is one that does not return an error (as a
73
+ // special, unused category) from the extractor. It is also allowed for all
74
+ // keys in a category to return an empty sequence of segments.
75
+ //
76
+ // To eliminate a confusing distinction between a segment that is empty vs.
77
+ // "not present" for a particular key, each key is logically assiciated with
78
+ // an infinite sequence of segments, including some infinite tail of 0-length
79
+ // segments. In practice, we only represent a finite sequence that (at least)
80
+ // covers the non-trivial segments.
81
+ //
82
+ // Once in production, the behavior associated with a particular GetId()
83
+ // cannot change. Introduce a new GetId() when introducing new behaviors.
72
84
  // See also SstQueryFilterConfigsManager below.
73
85
  //
74
- // OTHER CURRENT LIMITATIONS (maybe relaxed in the future for segments only
75
- // needing point query or WHERE filtering):
76
- // * Assumes the (default) byte-wise comparator is used.
77
- // * Assumes the category contiguousness property: that each category is
78
- // contiguous in comparator order. In other words, any key between two keys of
79
- // category c must also be in category c.
80
- // * Assumes the (weak) segment ordering property (described below) always
81
- // holds. (For byte-wise comparator, this is implied by the segment prefix
82
- // property, also described below.)
83
- // * Not yet compatible with user timestamp feature
84
- //
85
- // SEGMENT ORDERING PROPERTY: For maximum use in filters, especially for
86
- // filtering key range queries, we must have a correspondence between
87
- // the lexicographic ordering of key segments and the ordering of keys
88
- // they are extracted from. In other words, if we took the segmented keys
89
- // and ordered them primarily by (byte-wise) order on segment 0, then
90
- // on segment 1, etc., then key order of the original keys would not be
91
- // violated. This is the WEAK form of the property, where multiple keys
92
- // might generate the same segments, but such keys must be contiguous in
93
- // key order. (The STRONG form of the property is potentially more useful,
94
- // but for bytewise comparator, it can be inferred from segments satisfying
95
- // the weak property by assuming another segment that extends to the end of
96
- // the key, which would be empty if the segments already extend to the end
97
- // of the key.)
98
- //
99
- // The segment ordering property is hard to think about directly, but for
100
- // bytewise comparator, it is implied by a simpler property to reason about:
101
- // the segment prefix property (see below). (NOTE: an example way to satisfy
102
- // the segment ordering property while breaking the segment prefix property
103
- // is to have a segment delimited by any byte smaller than a certain value,
104
- // and not include the delimiter with the segment leading up to the delimiter.
105
- // For example, the space character is ordered before other printable
106
- // characters, so breaking "foo bar" into "foo", " ", and "bar" would be
107
- // legal, but not recommended.)
108
- //
109
- // SEGMENT PREFIX PROPERTY: If a key generates segments s0, ..., sn (possibly
110
- // more beyond sn) and sn does not extend to the end of the key, then all keys
111
- // starting with bytes s0+...+sn (concatenated) also generate the same segments
112
- // (possibly more). For example, if a key has segment s0 which is less than the
113
- // whole key and another key starts with the bytes of s0--or only has the bytes
114
- // of s0--then the other key must have the same segment s0. In other words, any
115
- // prefix of segments that might not extend to the end of the key must form an
116
- // unambiguous prefix code. See
117
- // https://en.wikipedia.org/wiki/Prefix_code In other other words, parsing
118
- // a key into segments cannot use even a single byte of look-ahead. Upon
119
- // processing each byte, the extractor decides whether to cut a segment that
120
- // ends with that byte, but not one that ends before that byte. The only
121
- // exception is that upon reaching the end of the key, the extractor can choose
122
- // whether to make a segment that ends at the end of the key.
86
+ // This feature hasn't yet been validated with user timestamp.
87
+ //
88
+ // = A SIMPLIFIED MODEL =
89
+ // Let us start with the easiest set of contraints to satisfy with a key
90
+ // segments extractor that generally allows for correct point and range
91
+ // filtering, and add complexity from there. Here we first assume
92
+ // * The column family is using the byte-wise comparator, or reverse byte-wise
93
+ // * A single category is assigned to all keys (by the extractor)
94
+ // * Using simplified criteria for legal segment extraction, the "segment
95
+ // maximal prefix property"
96
+ //
97
+ // SEGMENT MAXIMAL PREFIX PROPERTY: The segment that a byte is assigned to can
98
+ // only depend on the bytes that come before it, not on the byte itself nor
99
+ // anything later including the full length of the key or bound.
100
+ //
101
+ // Equivalently, two keys or bounds must agree on the segment assignment of
102
+ // position i if the two keys share a common byte-wise prefix up to at least
103
+ // position i - 1 (and i is within bounds of both keys).
104
+ //
105
+ // This specifically excludes "all or nothing" segments where it is only
106
+ // included if it reaches a particular width or delimiter. A segment resembling
107
+ // the FixedPrefixTransform would be illegal (without other assumptions); it
108
+ // must be like CappedPrefixTransform.
109
+ //
110
+ // This basically matches the notion of parsing prefix codes (see
111
+ // https://en.wikipedia.org/wiki/Prefix_code) except we have to include any
112
+ // partial segment (code word) at the end whenever an extension to that key
113
+ // might produce a full segment. An example would be parsing UTF-8 into
114
+ // segments corresponding to encoded code points, where any incomplete code
115
+ // at the end must be part of a trailing segment. Note a three-way
116
+ // correspondence between
117
+ // (a) byte-wise ordering of encoded code points, e.g.
118
+ // { D0 98 E2 82 AC }
119
+ // { E2 82 AC D0 98 }
120
+ // (b) lexicographic-then-byte-wise ordering of segments that are each an
121
+ // encoded code point, e.g.
122
+ // {{ D0 98 } { E2 82 AC }}
123
+ // {{ E2 82 AC } { D0 98 }}
124
+ // and (c) lexicographic ordering of the decoded code points, e.g.
125
+ // { U+0418 U+20AC }
126
+ // { U+20AC U+0418 }
127
+ // The correspondence between (a) and (b) is a result of the segment maximal
128
+ // prefix property and is critical for correct application of filters to
129
+ // range queries. The correspondence with (c) is a handy attribute of UTF-8
130
+ // (with no over-long encodings) and might be useful to the application.
123
131
  //
124
132
  // Example types of key segments that can be freely mixed in any order:
125
- // * Some fixed number of bytes or codewords.
126
- // * Ends in a delimiter byte or codeword. (Not including the delimiter as
127
- // part of the segment leading up to it would very likely violate the segment
128
- // prefix property.)
129
- // * Length-encoded sequence of bytes or codewords. The length could even
130
- // come from a preceding segment.
133
+ // * Capped number of bytes or codewords. The number cap for the segment
134
+ // could be the same for all keys or encoded earlier in the key.
135
+ // * Up to *and including* a delimiter byte or codeword.
131
136
  // * Any/all remaining bytes to the end of the key, though this implies all
132
137
  // subsequent segments will be empty.
133
- // For each kind of segment, it should be determined before parsing the segment
134
- // whether an incomplete/short parse will be treated as a segment extending to
135
- // the end of the key or as an empty segment.
138
+ // As part of the segment maximal prefix property, if the segments do not
139
+ // extend to the end of the key, that must be implied by the bytes that are
140
+ // in segments, NOT because the potential contents of a segment were considered
141
+ // incomplete.
136
142
  //
137
143
  // For example, keys might consist of
138
144
  // * Segment 0: Any sequence of bytes up to and including the first ':'
139
145
  // character, or the whole key if no ':' is present.
140
- // * Segment 1: The next four bytes, all or nothing (in case of short key).
146
+ // * Segment 1: The next four bytes, or less if we reach end of key.
141
147
  // * Segment 2: An unsigned byte indicating the number of additional bytes in
142
148
  // the segment, and then that many bytes (or less up to the end of the key).
143
149
  // * Segment 3: Any/all remaining bytes in the key
@@ -145,22 +151,208 @@ Status UpdateManifestForFilesState(
145
151
  // For an example of what can go wrong, consider using '4' as a delimiter
146
152
  // but not including it with the segment leading up to it. Suppose we have
147
153
  // these keys and corresponding first segments:
148
- // "123456" -> "123"
149
- // "124536" -> "12"
150
- // "125436" -> "125"
154
+ // "123456" -> "123" (in file 1)
155
+ // "124536" -> "12" (in file 2)
156
+ // "125436" -> "125" (in file 1)
151
157
  // Notice how byte-wise comparator ordering of the segments does not follow
152
158
  // the ordering of the keys. This means we cannot safely use a filter with
153
- // a range of segment values for filtering key range queries.
159
+ // a range of segment values for filtering key range queries. For example,
160
+ // we might get a range query for ["123", "125Z") and miss that key "124536"
161
+ // in file 2 is in range because its first segment "12" is out of the range
162
+ // of the first segments on the bounds, "123" and "125". We cannot even safely
163
+ // use this for prefix-like range querying with a Bloom filter on the segments.
164
+ // For a query ["12", "124Z"), segment "12" would likely not match the Bloom
165
+ // filter in file 1 and miss "123456".
154
166
  //
155
- // Also note that it is legal for all keys in a category (or many categories)
156
- // to return an empty sequence of segments.
167
+ // CATEGORIES: The KeySegmentsExtractor is allowed to place keys in categories
168
+ // so that different parts of the key space can use different filtering
169
+ // strategies. The following property is generally recommended for safe filter
170
+ // applicability
171
+ // * CATEGORY CONTIGUOUSNESS PROPERTY: each category is contiguous in
172
+ // comparator order. In other words, any key between two keys of category c
173
+ // must also be in category c.
174
+ // An alternative to categories when distinct kinds of keys are interspersed
175
+ // is to leave some segments empty when they do not apply to that key.
176
+ // Filters are generally set up to handle an empty segment specially so that
177
+ // it doesn't interfere with tracking accurate ranges on non-empty occurrences
178
+ // of the segment.
157
179
  //
158
- // To eliminate a confusing distinction between a segment that is empty vs.
159
- // "not present" for a particular key, each key is logically assiciated with
160
- // an infinite sequence of segments, including some infinite tail of 0-length
161
- // segments. In practice, we only represent a finite sequence that (at least)
162
- // covers the non-trivial segments.
180
+ // = BEYOND THE SIMPLIFIED MODEL =
181
+ //
182
+ // DETAILED GENERAL REQUIREMENTS (incl OTHER COMPARATORS): The exact
183
+ // requirements on a key segments extractor depend on whether and how we use
184
+ // filters to answer queries that they cannot answer directly. To understand
185
+ // this, we describe
186
+ // (A) the types of filters in terms of data they represent and can directly
187
+ // answer queries about,
188
+ // (B) the types of read queries that we want to use filters for, and
189
+ // (C) the assumptions that need to be satisfied to connect those two.
190
+ //
191
+ // TYPES OF FILTERS: Although not exhaustive, here are some useful categories
192
+ // of filter data:
193
+ // * Equivalence class filtering - Represents or over-approximates a set of
194
+ // equivalence classes on keys. The size of the representation is roughly
195
+ // proportional to the number of equivalence classes added. Bloom and ribbon
196
+ // filters are examples.
197
+ // * Order-based filtering - Represents one or more subranges of a key space or
198
+ // key segment space. A filter query only requires application of the CF
199
+ // comparator. The size of the representation is roughly proportional to the
200
+ // number of subranges and to the key or segment size. For example, we call a
201
+ // simple filter representing a minimum and a maximum value for a segment a
202
+ // min-max filter.
163
203
  //
204
+ // TYPES OF READ QUERIES and their DIRECT FILTERS:
205
+ // * Point query - Whether there {definitely isn't, might be} an entry for a
206
+ // particular key in an SST file (or partition, etc.).
207
+ // The DIRECT FILTER for a point query is an equivalence class filter on the
208
+ // whole key.
209
+ // * Range query - Whether there {definitely isn't, might be} any entries
210
+ // within a lower and upper key bound, in an SST file (or partition, etc.).
211
+ // NOTE: For this discussion, we ignore the detail of inclusive vs.
212
+ // exclusive bounds by assuming a generalized notion of "bound" (vs. key)
213
+ // that conveniently represents spaces between keys. For details, see
214
+ // https://github.com/facebook/rocksdb/pull/11434
215
+ // The DIRECT FILTER for a range query is an order-based filter on the whole
216
+ // key (non-empty intersection of bounds/keys). Simple minimum and maximum
217
+ // keys for each SST file are automatically provided by metadata and used in
218
+ // the read path for filtering (as well as binary search indexing).
219
+ // PARTITIONING NOTE: SST metadata partitions do not have recorded minimum
220
+ // and maximum keys, so require some special handling for range query
221
+ // filtering. See https://github.com/facebook/rocksdb/pull/12872 etc.
222
+ // * Where clauses - Additional constraints that can be put on range queries.
223
+ // Specifically, a where clause is a tuple <i,j,c,b1,b2> representing that the
224
+ // concatenated sequence of segments from i to j (inclusive) compares between
225
+ // b1 and b2 according to comparator c.
226
+ // EXAMPLE: To represent that segment of ordinal i is equal to s, that would
227
+ // be <i,i,bytewise_comparator,before(s),after(s)>.
228
+ // NOTE: To represent something like segment has a particular prefix, you
229
+ // would need to split the key into more segments appropriately. There is
230
+ // little loss of generality because we can combine adjacent segments for
231
+ // specifying where clauses and implementing filters.
232
+ // The DIRECT FILTER for a where clause is an order-based filter on the same
233
+ // sequence of segments and comparator (non-empty intersection of bounds/keys),
234
+ // or in the special case of an equality clause (see example), an equivalence
235
+ // class filter on the sequence of segments.
236
+ //
237
+ // GENERALIZING FILTERS (INDIRECT):
238
+ // * Point queries can utilize essentially any kind of filter by extracting
239
+ // applicable segments of the query key (if not using whole key) and querying
240
+ // the corresponding equivalence class or trivial range.
241
+ // NOTE: There is NO requirement e.g. that the comparator used by the filter
242
+ // match the CF key comparator or similar. The extractor simply needs to be
243
+ // a pure function that does not return "out of bounds" segments.
244
+ // FOR EXAMPLE, a min-max filter on the 4th segment of keys can also be
245
+ // used for filtering point queries (Get/MultiGet) and could be as
246
+ // effective and much more space efficient than a Bloom filter, depending
247
+ // on the workload.
248
+ //
249
+ // Beyond point queries, we generally expect the key comparator to be a
250
+ // lexicographic / big endian ordering at a high level (or the reverse of that
251
+ // ordering), while each segment can use an arbitrary comparator.
252
+ // FOR EXAMPLE, with a custom key comparator and segments extractor,
253
+ // segment 0 could be a 4-byte unsigned little-endian integer,
254
+ // segment 1 could be an 8-byte signed big-endian integer. This framework
255
+ // requires segment 0 to come before segment 1 in the key and to take
256
+ // precedence in key ordering (i.e. segment 1 order is only consulted when
257
+ // keys are equal in segment 0).
258
+ //
259
+ // * Equivalence class filters can apply to range queries under conditions
260
+ // resembling legacy prefix filtering (prefix_extractor). An equivalence class
261
+ // filter on segments i through j and category set s is applicable to a range
262
+ // query from lb to ub if
263
+ // * All segments through j extracted from lb and ub are equal.
264
+ // NOTE: being in the same filtering equivalence class is insufficient, as
265
+ // that could be unrelated inputs with a hash collision. Here we are
266
+ // omitting details that would formally accommodate comparators in which
267
+ // different bytes can be considered equal.
268
+ // * The categories of lb and ub are in the category set s.
269
+ // * COMMON SEGMENT PREFIX PROPERTY (for all x, y, z; params j, s): if
270
+ // * Keys x and z have equal segments up through ordinal j, and
271
+ // * Keys x and z are in categories in category set s, and
272
+ // * Key y is ordered x < y < z according to the CF comparator,
273
+ // then both
274
+ // * Key y has equal segments up through ordinal j (compared to x and z)
275
+ // * Key y is in a category in category set s
276
+ // (This is implied by the SEGMENT MAXIMAL PREFIX PROPERTY in the simplified
277
+ // model.)
278
+ //
279
+ // * Order-based filters on segments (rather than whole key) can apply to range
280
+ // queries (with "whole key" bounds). Specifically, an order-based filter on
281
+ // segments i through j and category set s is applicable to a range query from
282
+ // lb to ub if
283
+ // * All segments through i-1 extracted from lb and ub are equal
284
+ // * The categories of lb and ub are in the category set s.
285
+ // * SEGMENT ORDERING PROPERTY for ordinal i through j, segments
286
+ // comparator c, category set s, for all x, y, and z: if
287
+ // * Keys x and z have equal segments up through ordinal i-1, and
288
+ // * Keys x and z are in categories in category set s, and
289
+ // * Key y is ordered x < y < z according to the CF comparator,
290
+ // then both
291
+ // * The common segment prefix property is satisfied through ordinal i-1
292
+ // and with category set s
293
+ // * x_i..j <= y_i..j <= z_i..j according to segment comparator c, where
294
+ // x_i..j is the concatenation of segments i through j of key x (etc.).
295
+ // (This is implied by the SEGMENT MAXIMAL PREFIX PROPERTY in the simplified
296
+ // model.)
297
+ //
298
+ // INTERESTING EXAMPLES:
299
+ // Consider a segment encoding called BadVarInt1 in which a byte with
300
+ // highest-order bit 1 means "start a new segment". Also consider BadVarInt0
301
+ // which starts a new segment on highest-order bit 0.
302
+ //
303
+ // Configuration: bytewise comp, BadVarInt1 format for segments 0-3 with
304
+ // segment 3 also continuing to the end of the key
305
+ // x = 0x 20 21|82 23|||
306
+ // y = 0x 20 21|82 23 24|85||
307
+ // z = 0x 20 21|82 23|84 25||
308
+ //
309
+ // For i=j=1, this set of keys violates the common segment prefix property and
310
+ // segment ordering property, so can lead to incorrect equivalence class
311
+ // filtering or order-based filtering.
312
+ //
313
+ // Suppose we modify the configuration so that "short" keys (empty in segment
314
+ // 2) are placed in an unfiltered category. In that case, x above doesn't meet
315
+ // the precondition for being limited by segment properties. Consider these
316
+ // keys instead:
317
+ // x = 0x 20 21|82 23 24|85||
318
+ // y = 0x 20 21|82 23 24|85 26|87|
319
+ // z = 0x 20 21|82 23 24|85|86|
320
+ // m = 0x 20 21|82 23 25|85|86|
321
+ // n = 0x 20 21|82 23|84 25||
322
+ //
323
+ // Although segment 1 values might be out of order with key order,
324
+ // re-categorizing the short keys has allowed satisfying the common segment
325
+ // prefix property with j=1 (and with j=0), so we can use equivalence class
326
+ // filters on segment 1, or 0, or 0 to 1. However, violation of the segment
327
+ // ordering property on i=j=1 (see z, m, n) means we can't use order-based.
328
+ //
329
+ // p = 0x 20 21|82 23|84 25 26||
330
+ // q = 0x 20 21|82 23|84 25|86|
331
+ //
332
+ // But keys can still be short from segment 2 to 3, and thus we are violating
333
+ // the common segment prefix property for segment 2 (see n, p, q).
334
+ //
335
+ // Configuration: bytewise comp, BadVarInt0 format for segments 0-3 with
336
+ // segment 3 also continuing to the end of the key. No short key category.
337
+ // x = 0x 80 81|22 83|||
338
+ // y = 0x 80 81|22 83|24 85||
339
+ // z = 0x 80 81|22 83 84|25||
340
+ // m = 0x 80 82|22 83|||
341
+ // n = 0x 80 83|22 84|24 85||
342
+ //
343
+ // Even though this violates the segment maximal prefix property of the
344
+ // simplified model, the common segment prefix property and segment ordering
345
+ // property are satisfied for the various segment ordinals. In broader terms,
346
+ // the usual rule of the delimiter going with the segment before it can be
347
+ // violated if every byte value below some threshold starts a segment. (This
348
+ // has not been formally verified and is not recommended.)
349
+ //
350
+ // Suppose that we are paranoid, however, and decide to place short keys
351
+ // (empty in segment 2) into an unfiltered category. This is potentially a
352
+ // dangerous decision because loss of continuity at least affects the
353
+ // ability to filter on segment 0 (common segment prefix property violated
354
+ // with i=j=0; see z, m, n; m not in category set). Thus, excluding short keys
355
+ // with categories is not a recommended solution either.
164
356
  class KeySegmentsExtractor {
165
357
  public:
166
358
  // The extractor assigns keys to categories so that it is easier to
@@ -269,6 +461,14 @@ class KeySegmentsExtractor {
269
461
  Result* result) const = 0;
270
462
  };
271
463
 
464
+ // Constructs a KeySegmentsExtractor for fixed-width key segments that safely
465
+ // handles short keys by truncating segments at the end of the input key.
466
+ // See comments on KeySegmentsExtractor for why this is much safer for
467
+ // filtering than "all or nothing" fixed-size segments. This is essentially
468
+ // a generalization of (New)CappedPrefixTransform.
469
+ std::shared_ptr<const KeySegmentsExtractor>
470
+ MakeSharedCappedKeySegmentsExtractor(const std::vector<size_t>& byte_widths);
471
+
272
472
  // Alternatives for filtering inputs
273
473
 
274
474
  // An individual key segment.
@@ -305,13 +505,13 @@ struct SelectUserTimestamp {};
305
505
 
306
506
  struct SelectColumnName {};
307
507
 
308
- struct SelectValue {};
309
-
310
- // Note: more variants might be added in the future.
508
+ // NOTE: more variants might be added in the future.
509
+ // NOTE2: filtering on values is not supported because it could easily break
510
+ // overwrite semantics. (Filter out SST with newer, non-matching value but
511
+ // see obsolete value that does match.)
311
512
  using FilterInput =
312
513
  std::variant<SelectWholeKey, SelectKeySegment, SelectKeySegmentRange,
313
- SelectLegacyKeyPrefix, SelectUserTimestamp, SelectColumnName,
314
- SelectValue>;
514
+ SelectLegacyKeyPrefix, SelectUserTimestamp, SelectColumnName>;
315
515
 
316
516
  // Base class for individual filtering schemes in terms of chosen
317
517
  // FilterInputs, but not tied to a particular KeySegmentsExtractor.
@@ -336,6 +536,10 @@ std::shared_ptr<SstQueryFilterConfig> MakeSharedBytewiseMinMaxSQFC(
336
536
  FilterInput select, KeySegmentsExtractor::KeyCategorySet categories =
337
537
  KeySegmentsExtractor::KeyCategorySet::All());
338
538
 
539
+ std::shared_ptr<SstQueryFilterConfig> MakeSharedReverseBytewiseMinMaxSQFC(
540
+ FilterInput select, KeySegmentsExtractor::KeyCategorySet categories =
541
+ KeySegmentsExtractor::KeyCategorySet::All());
542
+
339
543
  // TODO: more kinds of filters, eventually including Bloom/ribbon filters
340
544
  // and replacing the old filter configuration APIs
341
545