@nxtedition/rocksdb 13.5.9 → 13.5.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/deps/rocksdb/rocksdb/BUCK +2 -1
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
  3. package/deps/rocksdb/rocksdb/Makefile +1 -1
  4. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +4 -5
  5. package/deps/rocksdb/rocksdb/db/c.cc +13 -0
  6. package/deps/rocksdb/rocksdb/db/c_test.c +0 -12
  7. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +8 -8
  8. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +2 -3
  9. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +5 -4
  10. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -1
  11. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +10 -10
  12. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +11 -6
  13. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +10 -16
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +2 -4
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -17
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +164 -0
  17. package/deps/rocksdb/rocksdb/db/corruption_test.cc +74 -3
  18. package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +39 -4
  19. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +2 -83
  20. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +0 -4
  21. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +11 -11
  22. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +0 -3
  23. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -9
  24. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +16 -54
  25. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +0 -6
  26. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +186 -0
  27. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +3 -40
  28. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +0 -54
  29. package/deps/rocksdb/rocksdb/db/db_test.cc +0 -292
  30. package/deps/rocksdb/rocksdb/db/db_test2.cc +0 -1235
  31. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -0
  32. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +11 -4
  33. package/deps/rocksdb/rocksdb/db/log_reader.cc +11 -11
  34. package/deps/rocksdb/rocksdb/db/merge_helper.h +1 -1
  35. package/deps/rocksdb/rocksdb/db/multi_scan.cc +70 -0
  36. package/deps/rocksdb/rocksdb/db/version_set.cc +15 -8
  37. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +4 -0
  38. package/deps/rocksdb/rocksdb/env/composite_env.cc +4 -0
  39. package/deps/rocksdb/rocksdb/env/env.cc +4 -0
  40. package/deps/rocksdb/rocksdb/env/env_encryption.cc +38 -3
  41. package/deps/rocksdb/rocksdb/env/env_test.cc +36 -1
  42. package/deps/rocksdb/rocksdb/env/fs_posix.cc +20 -4
  43. package/deps/rocksdb/rocksdb/env/io_posix.cc +16 -0
  44. package/deps/rocksdb/rocksdb/env/io_posix.h +3 -0
  45. package/deps/rocksdb/rocksdb/env/mock_env.cc +5 -0
  46. package/deps/rocksdb/rocksdb/file/readahead_raf.cc +4 -0
  47. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +33 -6
  48. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +5 -0
  49. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +25 -1
  50. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +10 -0
  51. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +9 -0
  52. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +12 -0
  53. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +12 -8
  54. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +29 -28
  55. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +26 -6
  56. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +9 -0
  57. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +3 -0
  58. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +142 -0
  59. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +2 -0
  60. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +2 -2
  61. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +2 -0
  62. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  63. package/deps/rocksdb/rocksdb/options/options_helper.h +3 -0
  64. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -0
  65. package/deps/rocksdb/rocksdb/port/win/io_win.cc +20 -0
  66. package/deps/rocksdb/rocksdb/port/win/io_win.h +4 -0
  67. package/deps/rocksdb/rocksdb/src.mk +2 -1
  68. package/deps/rocksdb/rocksdb/table/block_based/block.cc +31 -34
  69. package/deps/rocksdb/rocksdb/table/block_based/block.h +2 -4
  70. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +43 -7
  71. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +6 -0
  72. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +367 -2
  73. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +69 -23
  74. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +54 -6
  75. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +27 -5
  76. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +167 -3
  77. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +6 -2
  78. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +6 -0
  79. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +12 -0
  80. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +1 -0
  81. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +0 -3
  82. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +10 -7
  83. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +244 -0
  84. package/deps/rocksdb/rocksdb/table/external_table.cc +1 -1
  85. package/deps/rocksdb/rocksdb/table/format.cc +51 -33
  86. package/deps/rocksdb/rocksdb/table/format.h +1 -1
  87. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +13 -8
  88. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +1 -3
  89. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +5 -1
  90. package/deps/rocksdb/rocksdb/table/table_test.cc +629 -1
  91. package/deps/rocksdb/rocksdb/test_util/testutil.cc +0 -1
  92. package/deps/rocksdb/rocksdb/test_util/testutil.h +5 -0
  93. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +183 -94
  94. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +71 -0
  95. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +37 -22
  96. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +308 -0
  97. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +189 -0
  98. package/deps/rocksdb/rocksdb/util/cast_util.h +22 -11
  99. package/deps/rocksdb/rocksdb/util/coding.h +4 -3
  100. package/deps/rocksdb/rocksdb/util/compression.cc +2 -0
  101. package/deps/rocksdb/rocksdb/util/compression.h +16 -6
  102. package/deps/rocksdb/rocksdb/util/compression_test.cc +1679 -15
  103. package/deps/rocksdb/rocksdb/util/stop_watch.h +17 -7
  104. package/deps/rocksdb/rocksdb/util/timer_queue_test.cc +17 -3
  105. package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +10 -0
  106. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.cc +5 -0
  107. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.h +2 -0
  108. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +18 -2
  109. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +22 -3
  110. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +5 -0
  111. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +22 -2
  112. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +15 -4
  113. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +61 -0
  114. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +18 -0
  115. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +3 -0
  116. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +3 -0
  117. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +9 -3
  118. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +9 -0
  119. package/deps/rocksdb/rocksdb.gyp +15 -1
  120. package/package.json +1 -1
  121. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  122. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  123. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.cc +0 -131
  124. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.h +0 -90
@@ -57,6 +57,7 @@ struct WaitForCompactOptions;
57
57
  class Env;
58
58
  class EventListener;
59
59
  class FileSystem;
60
+ class MultiScan;
60
61
  class Replayer;
61
62
  class StatsHistoryIterator;
62
63
  class TraceReader;
@@ -1092,7 +1093,30 @@ class DB {
1092
1093
 
1093
1094
  // Get an iterator that scans multiple key ranges. The scan ranges should
1094
1095
  // be in increasing order of start key. See multi_scan.h for more
1095
- // details.
1096
+ // details. For optimal performance, ensure that either all entries in
1097
+ // scan_opts specify the range limit, or none of them do.
1098
+ //
1099
+ // NOTE: iterate_upper_bound in ReadOptions will be ignored. Instead, the
1100
+ // range.limit in ScanOptions is consulted to determine the upper bound key,
1101
+ // if specified.
1102
+ //
1103
+ // Example usage -
1104
+ // std::vector<ScanOptions> scans{{.start = Slice("bar")},
1105
+ // {.start = Slice("foo")}};
1106
+ // std::unique_ptr<MultiScan> iter =
1107
+ // db->NewMultiScan(read_options, column_family, scans);
1108
+ // try {
1109
+ // for (auto scan : *iter) {
1110
+ // for (auto it : scan) {
1111
+ // // Do something with key - it.first
1112
+ // // Do something with value - it.second
1113
+ // }
1114
+ // }
1115
+ // } catch (MultiScanException& ex) {
1116
+ // // Check ex.status()
1117
+ // } catch (std::logic_error& ex) {
1118
+ // // Check ex.what()
1119
+ // }
1096
1120
  virtual std::unique_ptr<MultiScan> NewMultiScan(
1097
1121
  const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
1098
1122
  const std::vector<ScanOptions>& /*scan_opts*/) {
@@ -866,6 +866,13 @@ class RandomAccessFile {
866
866
  "RandomAccessFile::InvalidateCache not supported.");
867
867
  }
868
868
 
869
+ // The default implementation returns "not supported" so that user
870
+ // implementations of RandomAccessFile do not need to immediately implement
871
+ // this function.
872
+ virtual Status GetFileSize(uint64_t* /*result*/) {
873
+ return Status::NotSupported("RandomAccessFile::GetFileSize not supported.");
874
+ }
875
+
869
876
  // If you're adding methods here, remember to add them to
870
877
  // RandomAccessFileWrapper too.
871
878
  };
@@ -1750,6 +1757,9 @@ class RandomAccessFileWrapper : public RandomAccessFile {
1750
1757
  Status InvalidateCache(size_t offset, size_t length) override {
1751
1758
  return target_->InvalidateCache(offset, length);
1752
1759
  }
1760
+ Status GetFileSize(uint64_t* file_size) override {
1761
+ return target_->GetFileSize(file_size);
1762
+ }
1753
1763
 
1754
1764
  private:
1755
1765
  RandomAccessFile* target_;
@@ -240,6 +240,15 @@ class EncryptedRandomAccessFile : public FSRandomAccessFile {
240
240
  size_t GetRequiredBufferAlignment() const override;
241
241
 
242
242
  IOStatus InvalidateCache(size_t offset, size_t length) override;
243
+
244
+ // Intentionally leave GetFileSize not overridden here, so that it inherits
245
+ // the default implementation from its parent class, which is Not Supported.
246
+ //
247
+ // As GetFileSize API is not required to be implemented yet, we use encrypted
248
+ // file system in unit test to validate the rest of the system could continue
249
+ // working with the Not Supported behavior.
250
+ //
251
+ // IOStatus GetFileSize(uint64_t* /*result*/) override;
243
252
  };
244
253
 
245
254
  class EncryptedWritableFile : public FSWritableFile {
@@ -1051,6 +1051,14 @@ class FSRandomAccessFile {
1051
1051
  // open.
1052
1052
  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
1053
1053
 
1054
+ // Get the file size on an open-for-reading file without re-seeking the file's
1055
+ // path in the filesystem. The default implementation returns "not supported"
1056
+ // so that user implementations of FSRandomAccessFile do not need to
1057
+ // immediately implement this function.
1058
+ virtual IOStatus GetFileSize(uint64_t* /*result*/) {
1059
+ return IOStatus::NotSupported("GetFileSize Not Supported");
1060
+ }
1061
+
1054
1062
  // If you're adding methods here, remember to add them to
1055
1063
  // FSRandomAccessFileWrapper too.
1056
1064
  };
@@ -1772,6 +1780,10 @@ class FSRandomAccessFileWrapper : public FSRandomAccessFile {
1772
1780
  return target_->GetTemperature();
1773
1781
  }
1774
1782
 
1783
+ virtual IOStatus GetFileSize(uint64_t* result) override {
1784
+ return target_->GetFileSize(result);
1785
+ }
1786
+
1775
1787
  private:
1776
1788
  std::unique_ptr<FSRandomAccessFile> guard_;
1777
1789
  FSRandomAccessFile* target_;
@@ -95,14 +95,18 @@ class Iterator : public IteratorBase {
95
95
  return Slice();
96
96
  }
97
97
 
98
- // RocksDB Internal - DO NOT USE
99
- // Prepare the iterator to scan the ranges specified in scan_opts. The
100
- // upper bound and other table specific limits may be specified. This will
101
- // typically be followed by Seeks to the start keys in the order they're
102
- // specified in scan_opts. If the user does a Seek to some other target key,
103
- // the iterator should disregard the scan_opts from that point onwards and
104
- // behave like a normal iterator. Its the user's responsibility to again
105
- // call Prepare().
98
+ // Prepare the iterator to scan the ranges specified in scan_opts. This
99
+ // includes prefetching relevant blocks from disk. The upper bound and
100
+ // other table specific limits should be specified for each
101
+ // scan for best results. If an upper bound is not specified, Prepare may
102
+ // skip prefetching as it cannot accurately determine how much to prefetch.
103
+ //
104
+ // Prepare should typically be followed by Seeks to the start keys in the
105
+ // order they're specified in scan_opts. If the user does a Seek to some
106
+ // other target key, the iterator should disregard the scan_opts from that
107
+ // point onwards and behave like a normal iterator. It's the user's
108
+ // responsibility to again call Prepare().
109
+ //
106
110
  // If Prepare() is called, it overrides the iterate_upper_bound in
107
111
  // ReadOptions
108
112
  virtual void Prepare(const std::vector<ScanOptions>& /*scan_opts*/) {}
@@ -5,6 +5,7 @@
5
5
 
6
6
  #pragma once
7
7
 
8
+ #include "rocksdb/db.h"
8
9
  #include "rocksdb/iterator.h"
9
10
  #include "rocksdb/options.h"
10
11
 
@@ -72,6 +73,8 @@ class Scan {
72
73
 
73
74
  explicit Scan(Iterator* db_iter) : db_iter_(db_iter) {}
74
75
 
76
+ void Reset(Iterator* db_iter) { db_iter_ = db_iter; }
77
+
75
78
  ScanIterator begin() { return ScanIterator(db_iter_); }
76
79
 
77
80
  std::nullptr_t end() { return nullptr; }
@@ -149,9 +152,9 @@ class Scan {
149
152
  // A Status exception is thrown if there is an error.
150
153
  class MultiScan {
151
154
  public:
152
- MultiScan(const std::vector<ScanOptions>& scan_opts,
153
- std::unique_ptr<Iterator>&& db_iter)
154
- : scan_opts_(scan_opts), db_iter_(std::move(db_iter)) {}
155
+ MultiScan(const ReadOptions& read_options,
156
+ const std::vector<ScanOptions>& scan_opts, DB* db,
157
+ ColumnFamilyHandle* cfh);
155
158
 
156
159
  explicit MultiScan(std::unique_ptr<Iterator>&& db_iter)
157
160
  : db_iter_(std::move(db_iter)) {}
@@ -168,9 +171,17 @@ class MultiScan {
168
171
  using difference_type = int;
169
172
  using iterator_category = std::input_iterator_tag;
170
173
 
171
- MultiScanIterator(const std::vector<ScanOptions>& scan_opts,
172
- Iterator* db_iter)
173
- : scan_opts_(scan_opts), idx_(0), db_iter_(db_iter), scan_(db_iter_) {
174
+ MultiScanIterator(const std::vector<ScanOptions>& scan_opts, DB* db,
175
+ ColumnFamilyHandle* cfh, ReadOptions& read_options,
176
+ Slice* upper_bound, std::unique_ptr<Iterator>& db_iter)
177
+ : scan_opts_(scan_opts),
178
+ db_(db),
179
+ cfh_(cfh),
180
+ read_options_(read_options),
181
+ upper_bound_(upper_bound),
182
+ idx_(0),
183
+ db_iter_(db_iter),
184
+ scan_(db_iter_.get()) {
174
185
  if (scan_opts_.empty()) {
175
186
  throw std::logic_error("Zero scans in multi-scan");
176
187
  }
@@ -181,28 +192,9 @@ class MultiScan {
181
192
  }
182
193
  }
183
194
 
184
- explicit MultiScanIterator(const std::vector<ScanOptions>& scan_opts)
185
- : scan_opts_(scan_opts),
186
- idx_(scan_opts_.size()),
187
- db_iter_(nullptr),
188
- scan_(nullptr) {}
189
-
190
195
  ~MultiScanIterator() { assert(status_.ok()); }
191
196
 
192
- MultiScanIterator& operator++() {
193
- if (idx_ >= scan_opts_.size()) {
194
- throw std::logic_error("Index out of range");
195
- }
196
- idx_++;
197
- if (idx_ < scan_opts_.size()) {
198
- db_iter_->Seek(*scan_opts_[idx_].range.start);
199
- status_ = db_iter_->status();
200
- if (!status_.ok()) {
201
- throw MultiScanException(status_);
202
- }
203
- }
204
- return *this;
205
- }
197
+ MultiScanIterator& operator++();
206
198
 
207
199
  bool operator==(std::nullptr_t /*other*/) const {
208
200
  return idx_ >= scan_opts_.size();
@@ -217,20 +209,29 @@ class MultiScan {
217
209
 
218
210
  private:
219
211
  const std::vector<ScanOptions>& scan_opts_;
212
+ DB* db_;
213
+ ColumnFamilyHandle* cfh_;
214
+ ReadOptions& read_options_;
215
+ Slice* upper_bound_;
220
216
  size_t idx_;
221
- Iterator* db_iter_;
217
+ std::unique_ptr<Iterator>& db_iter_;
222
218
  Status status_;
223
219
  Scan scan_;
224
220
  };
225
221
 
226
222
  MultiScanIterator begin() {
227
- return MultiScanIterator(scan_opts_, db_iter_.get());
223
+ return MultiScanIterator(scan_opts_, db_, cfh_, read_options_,
224
+ &upper_bound_, db_iter_);
228
225
  }
229
226
 
230
227
  std::nullptr_t end() { return nullptr; }
231
228
 
232
229
  private:
230
+ ReadOptions read_options_;
233
231
  const std::vector<ScanOptions> scan_opts_;
232
+ DB* db_;
233
+ ColumnFamilyHandle* cfh_;
234
+ Slice upper_bound_;
234
235
  std::unique_ptr<Iterator> db_iter_;
235
236
  };
236
237
 
@@ -57,6 +57,7 @@ class Statistics;
57
57
  class InternalKeyComparator;
58
58
  class WalFilter;
59
59
  class FileSystem;
60
+ class UserDefinedIndexFactory;
60
61
 
61
62
  struct Options;
62
63
  struct DbPath;
@@ -608,6 +609,13 @@ struct DBOptions {
608
609
  // checksums. True also enters a read-only mode when a DB write fails;
609
610
  // see DB::Resume().
610
611
  //
612
+ // When set to true, the DB will fail to open if any SST files fail to open
613
+ // e.g. due to incorrect file size or corrupted footer.
614
+ //
615
+ // When set to false, the DB will still be opened even if some files are
616
+ // corrupted; the healthy files remain accessible, while the corrupted
617
+ // ones do not.
618
+ //
611
619
  // As most workloads value data correctness over availability, this option
612
620
  // is on by default. Note that the name of this old option is potentially
613
621
  // misleading, and other options and operations go further in proactive
@@ -1297,12 +1305,13 @@ struct DBOptions {
1297
1305
  // Default: false
1298
1306
  bool skip_stats_update_on_db_open = false;
1299
1307
 
1300
- // If true, then DB::Open() will not fetch and check sizes of all sst files.
1301
- // This may significantly speed up startup if there are many sst files,
1302
- // especially when using non-default Env with expensive GetFileSize().
1303
- // We'll still check that all required sst files exist.
1304
- // If paranoid_checks is false, this option is ignored, and sst files are
1305
- // not checked at all.
1308
+ // This option is deprecated and marked as no-op. Kept for backward
1309
+ // compatibility until usage is fully removed.
1310
+ // File size check will be performed through a thread
1311
+ // pool during DB Open, when max_open_files is set to -1.
1312
+ // Therefore, the concern of DB Open slowness is eliminated.
1313
+ // Note that when max_open_files is not set to -1, only a subset of files will
1314
+ // be opened and checked during DB Open.
1306
1315
  //
1307
1316
  // Default: false
1308
1317
  bool skip_checking_sst_file_sizes_on_db_open = false;
@@ -2061,6 +2070,17 @@ struct ReadOptions {
2061
2070
  // Default: false
2062
2071
  bool auto_refresh_iterator_with_snapshot = false;
2063
2072
 
2073
+ // EXPERIMENTAL
2074
+ //
2075
+ // Specify an alternate index to use in the SST files instead of the native
2076
+ // block based table index. The table_factory used for the column family
2077
+ // must support building/reading this index.
2078
+ //
2079
+ // Currently, only forward scans are supported. For forward scans, only Seek()
2080
+ // is supported. SeekToFirst() is not supported. If the caller wishes to scan
2081
+ // from start to end, the native index must be used.
2082
+ const UserDefinedIndexFactory* table_index_factory = nullptr;
2083
+
2064
2084
  // *** END options only relevant to iterators or scans ***
2065
2085
 
2066
2086
  // *** BEGIN options for RocksDB internal use only ***
@@ -44,6 +44,7 @@ class TableReader;
44
44
  class WritableFileWriter;
45
45
  struct ConfigOptions;
46
46
  struct EnvOptions;
47
+ class UserDefinedIndexFactory;
47
48
 
48
49
  // Types of checksums to use for checking integrity of logical blocks within
49
50
  // files. All checksums currently use 32 bits of checking power (1 in 4B
@@ -492,8 +493,16 @@ struct BlockBasedTableOptions {
492
493
  // Because filters only impact performance and are not data-critical, an
493
494
  // SST file can be opened and used without filters if (a) the filter
494
495
  // policy name or schema is unrecognized, or (b) filter_policy is nullptr.
496
+ // See filter_policy regarding filters.
495
497
  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
496
498
 
499
+ // EXPERIMENTAL
500
+ //
501
+ // If non-nullptr, use the specified factory to build user-defined index.
502
+ // This allows users to define their own index format and build the index
503
+ // during table building.
504
+ std::shared_ptr<UserDefinedIndexFactory> user_defined_index_factory = nullptr;
505
+
497
506
  // If true, place whole keys in the filter (not just prefixes).
498
507
  // This must generally be true for gets to be efficient.
499
508
  bool whole_key_filtering = true;
@@ -69,6 +69,7 @@ class ToolHooks {
69
69
  virtual Status Open(const Options& options,
70
70
  const blob_db::BlobDBOptions& bdb_options,
71
71
  const std::string& dbname, blob_db::BlobDB** blob_db) = 0;
72
+ virtual void Exit(int status) = 0;
72
73
  };
73
74
 
74
75
  class DefaultHooks : public ToolHooks {
@@ -117,6 +118,8 @@ class DefaultHooks : public ToolHooks {
117
118
  const blob_db::BlobDBOptions& bdb_options,
118
119
  const std::string& dbname,
119
120
  blob_db::BlobDB** blob_db) override;
121
+
122
+ virtual void Exit(int status) override { exit(status); }
120
123
  };
121
124
 
122
125
  extern DefaultHooks defaultHooks;
@@ -0,0 +1,142 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // This source code is licensed under both the GPLv2 (found in the
3
+ // COPYING file in the root directory) and Apache 2.0 License
4
+ // (found in the LICENSE.Apache file in the root directory).
5
+ //
6
+ // *****************************************************************
7
+ // EXPERIMENTAL - subject to change while under development
8
+ // *****************************************************************
9
+
10
+ #pragma once
11
+
12
+ #include <string>
13
+
14
+ #include "rocksdb/advanced_iterator.h"
15
+ #include "rocksdb/customizable.h"
16
+ #include "rocksdb/options.h"
17
+ #include "rocksdb/slice.h"
18
+ #include "rocksdb/status.h"
19
+
20
+ namespace ROCKSDB_NAMESPACE {
21
+
22
+ // Prefix for user-defined index block names
23
+ inline const std::string kUserDefinedIndexPrefix =
24
+ "rocksdb.user_defined_index.";
25
+
26
+ // This is a public API for user-defined index builders.
27
+ // It allows users to define their own index format and build custom
28
+ // indexes during table building. Currently, only a monolithic index
29
+ // block is supported (no partitioned index).
30
+
31
+ // The interface for building user-defined index.
32
+ class UserDefinedIndexBuilder {
33
+ public:
34
+ // Right now, we only support Puts. In the future, we may support merges,
35
+ // deletions etc.
36
+ enum ValueType {
37
+ kValue,
38
+ kTypeMax,
39
+ };
40
+
41
+ // File offset and size of the data block
42
+ struct BlockHandle {
43
+ uint64_t offset;
44
+ uint64_t size;
45
+ };
46
+
47
+ virtual ~UserDefinedIndexBuilder() = default;
48
+
49
+ // Add a new index entry to index block. The key for the new index entry
50
+ // should be >= last_key_in_current_block and < first_key_in_next_block.
51
+ // The previous index entry key and the new index entry key cover
52
+ // all the keys in the data block associated with the new index entry.
53
+ //
54
+ // Called before the OnKeyAdded() call for first_key_in_next_block.
55
+ // @last_key_in_current_block: The last key in the current data block
56
+ // @first_key_in_next_block: it will be nullptr if the entry being added is
57
+ // the last one in the table
58
+ // @block_handle: offset/size of the data block referenced by this index
59
+ // entry. This should be stored along with the index entry
60
+ // key
61
+ // @separator_scratch: a scratch buffer to back a computed separator between
62
+ // those, as needed. May be modified on each call.
63
+ // @return: the key or separator stored in the index, which could be
64
+ // last_key_in_current_block or a computed separator backed by
65
+ // separator_scratch.
66
+ virtual Slice AddIndexEntry(const Slice& last_key_in_current_block,
67
+ const Slice* first_key_in_next_block,
68
+ const BlockHandle& block_handle,
69
+ std::string* separator_scratch) = 0;
70
+
71
+ // This method will be called whenever a key is added. The subclasses may
72
+ // override OnKeyAdded() if they need to collect additional information.
73
+ // The type argument indicates whether the value is a full value or partial.
74
+ // At the moment, only full values are supported.
75
+ virtual void OnKeyAdded(const Slice& /*key*/, ValueType /*type*/,
76
+ const Slice& /*value*/) {}
77
+
78
+ // Finish building the index.
79
+ // Returns a Status and the serialized index contents.
80
+ // The memory backing the contents should not be freed until this builder
81
+ // object is destructed.
82
+ virtual Status Finish(Slice* index_contents) = 0;
83
+ };
84
+
85
+ // The interface for iterating the user defined index. This will be
86
+ // instantiated and used by a scan to iterate through the index entries
87
+ // covered by the scan.
88
+ class UserDefinedIndexIterator {
89
+ public:
90
+ virtual ~UserDefinedIndexIterator() = default;
91
+
92
+ // Prepare the iterator for a series of scans. The iterator should use
93
+ // this as an opportunity to do any prefetching and buffering of results.
94
+ virtual void Prepare(const ScanOptions scan_opts[], size_t num_opts) = 0;
95
+
96
+ // Given the target key, position the index iterator at the index entry
97
+ // with the smallest key >= target. The result must be updated with the
98
+ // index key, and the bound_check_result. The bound_check_result should
99
+ // be set to kOutOfBound if no block satisfies the target key and
100
+ // termination criteria, kInbound if the data block is definitely fully
101
+ // within bounds, or kUnknown if the data block could be partially
102
+ // within bounds.
103
+ virtual Status SeekAndGetResult(const Slice& target,
104
+ IterateResult* result) = 0;
105
+
106
+ // Advance to the next index entry. The result must be populated similar
107
+ // to SeekAndGetResult.
108
+ virtual Status NextAndGetResult(IterateResult* result) = 0;
109
+
110
+ // Return the BlockHandle in the current index entry
111
+ virtual UserDefinedIndexBuilder::BlockHandle value() = 0;
112
+ };
113
+
114
+ // A reader interface for the user defined index
115
+ class UserDefinedIndexReader {
116
+ public:
117
+ virtual ~UserDefinedIndexReader() = default;
118
+
119
+ // Allocate an iterator that will be used by RocksDB to perform scans
120
+ virtual std::unique_ptr<UserDefinedIndexIterator> NewIterator(
121
+ const ReadOptions& read_options) = 0;
122
+
123
+ // The memory usage of the index, including the size of the raw contents and
124
+ // any other heap data structures allocated by the reader
125
+ virtual size_t ApproximateMemoryUsage() const = 0;
126
+ };
127
+
128
+ // Factory for creating user-defined index builders.
129
+ class UserDefinedIndexFactory : public Customizable {
130
+ public:
131
+ virtual ~UserDefinedIndexFactory() = default;
132
+
133
+ // Create a new builder for user-defined index.
134
+ virtual UserDefinedIndexBuilder* NewBuilder() const = 0;
135
+
136
+ // Create a new user defined index reader given the contents of the index
137
+ // block
138
+ virtual std::unique_ptr<UserDefinedIndexReader> NewReader(
139
+ Slice& index_block) const = 0;
140
+ };
141
+
142
+ } // namespace ROCKSDB_NAMESPACE
@@ -63,6 +63,8 @@ class DBWithTTL : public StackableDB {
63
63
 
64
64
  virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0;
65
65
 
66
+ virtual Status GetTtl(ColumnFamilyHandle* h, int32_t* ttl) = 0;
67
+
66
68
  protected:
67
69
  explicit DBWithTTL(DB* db) : StackableDB(db) {}
68
70
  };
@@ -396,9 +396,9 @@ struct TransactionOptions {
396
396
  // due to too many memtables.
397
397
  // Note that the ingestion relies on the transaction's underlying index,
398
398
  // (WriteBatchWithIndex), so updates that are added to the transaction
399
- // without indexing (e.g. added directly to the transaction underlying
399
+ // without indexing (i.e. added directly to the transaction underlying
400
400
  // write batch through Transaction::GetWriteBatch()->GetWriteBatch())
401
- // are not supported. They will not be applied to the DB.
401
+ // are not supported, and the optimization will not apply in that case.
402
402
  //
403
403
  // NOTE: since WBWI keeps track of the most recent update per key, a Put
404
404
  // followed by a SingleDelete will be written to DB as a SingleDelete. This
@@ -379,6 +379,8 @@ class WriteBatchWithIndex : public WriteBatchBase {
379
379
  };
380
380
  const std::unordered_map<uint32_t, CFStat>& GetCFStats() const;
381
381
 
382
+ // The total number of operations issued into this WBWI.
383
+ size_t GetWBWIOpCount() const;
382
384
  bool GetOverwriteKey() const;
383
385
 
384
386
  private:
@@ -12,8 +12,8 @@
12
12
  // NOTE: in 'main' development branch, this should be the *next*
13
13
  // minor or major version number planned for release.
14
14
  #define ROCKSDB_MAJOR 10
15
- #define ROCKSDB_MINOR 4
16
- #define ROCKSDB_PATCH 2
15
+ #define ROCKSDB_MINOR 5
16
+ #define ROCKSDB_PATCH 1
17
17
 
18
18
  // Do not use these. We made the mistake of declaring macros starting with
19
19
  // double underscore. Now we have to live with our choice. We'll deprecate these
@@ -72,6 +72,9 @@ std::unique_ptr<Configurable> CFOptionsAsConfigurable(
72
72
  Status StringToMap(const std::string& opts_str,
73
73
  std::unordered_map<std::string, std::string>* opts_map);
74
74
 
75
+ Status GetStringFromCompressionType(std::string* compression_str,
76
+ CompressionType compression_type);
77
+
75
78
  struct OptionsHelper {
76
79
  static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/;
77
80
  static const std::string kDBOptionsName /*= "DBOptions" */;
@@ -129,6 +129,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
129
129
  sizeof(CacheUsageOptions)},
130
130
  {offsetof(struct BlockBasedTableOptions, filter_policy),
131
131
  sizeof(std::shared_ptr<const FilterPolicy>)},
132
+ {offsetof(struct BlockBasedTableOptions, user_defined_index_factory),
133
+ sizeof(std::shared_ptr<UserDefinedIndexFactory>)},
132
134
  };
133
135
 
134
136
  // In this test, we catch a new option of BlockBasedTableOptions that is not
@@ -242,6 +242,16 @@ size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
242
242
  return GetUniqueIdFromFile(hFile_, id, max_size);
243
243
  }
244
244
 
245
+ IOStatus WinMmapReadableFile::GetFileSize(uint64_t* size) {
246
+ LARGE_INTEGER fileSize;
247
+ if (GetFileSizeEx(hFile_, &fileSize)) {
248
+ *size = fileSize.QuadPart;
249
+ return IOStatus::OK();
250
+ } else {
251
+ return IOStatus::IOError("Failed to get file size", filename_);
252
+ }
253
+ }
254
+
245
255
  ///////////////////////////////////////////////////////////////////////////////
246
256
  /// WinMmapFile
247
257
 
@@ -735,6 +745,16 @@ size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
735
745
  return GetAlignment();
736
746
  }
737
747
 
748
+ IOStatus WinRandomAccessFile::GetFileSize(uint64_t* size) {
749
+ LARGE_INTEGER fileSize;
750
+ if (GetFileSizeEx(hFile_, &fileSize)) {
751
+ *size = fileSize.QuadPart;
752
+ return IOStatus::OK();
753
+ } else {
754
+ return IOStatus::IOError("Failed to get file size", filename_);
755
+ }
756
+ }
757
+
738
758
  /////////////////////////////////////////////////////////////////////////////
739
759
  // WinWritableImpl
740
760
  //
@@ -152,6 +152,8 @@ class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile {
152
152
  IOStatus InvalidateCache(size_t offset, size_t length) override;
153
153
 
154
154
  size_t GetUniqueId(char* id, size_t max_size) const override;
155
+
156
+ IOStatus GetFileSize(uint64_t* file_size) override;
155
157
  };
156
158
 
157
159
  // We preallocate and use memcpy to append new
@@ -292,6 +294,8 @@ class WinRandomAccessFile
292
294
  IOStatus InvalidateCache(size_t offset, size_t length) override;
293
295
 
294
296
  size_t GetRequiredBufferAlignment() const override;
297
+
298
+ IOStatus GetFileSize(uint64_t* file_size) override;
295
299
  };
296
300
 
297
301
  // This is a sequential write class. It has been mimicked (as others) after
@@ -80,6 +80,7 @@ LIB_SOURCES = \
80
80
  db/memtable_list.cc \
81
81
  db/merge_helper.cc \
82
82
  db/merge_operator.cc \
83
+ db/multi_scan.cc \
83
84
  db/output_validator.cc \
84
85
  db/periodic_task_scheduler.cc \
85
86
  db/range_del_aggregator.cc \
@@ -237,13 +238,13 @@ LIB_SOURCES = \
237
238
  trace_replay/block_cache_tracer.cc \
238
239
  trace_replay/io_tracer.cc \
239
240
  util/async_file_reader.cc \
241
+ util/auto_tune_compressor.cc \
240
242
  util/build_version.cc \
241
243
  util/cleanable.cc \
242
244
  util/coding.cc \
243
245
  util/compaction_job_stats_impl.cc \
244
246
  util/comparator.cc \
245
247
  util/compression.cc \
246
- util/auto_skip_compressor.cc \
247
248
  util/compression_context_cache.cc \
248
249
  util/concurrent_task_limiter_impl.cc \
249
250
  util/crc32c.cc \