@nxtedition/rocksdb 15.1.2 → 15.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/binding.cc +79 -38
  2. package/build.sh +1 -2
  3. package/deps/rocksdb/rocksdb/BUCK +10 -8
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +27 -2
  5. package/deps/rocksdb/rocksdb/Makefile +27 -116
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +1 -1
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +101 -124
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.h +47 -30
  9. package/deps/rocksdb/rocksdb/db/c.cc +793 -131
  10. package/deps/rocksdb/rocksdb/db/c_test.c +571 -0
  11. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +226 -0
  12. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +4 -0
  13. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +95 -59
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +2 -2
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +45 -35
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +8 -4
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +1 -1
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -6
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +8 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +47 -0
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -2
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +82 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  24. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +1 -1
  25. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +69 -24
  26. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +9 -1
  27. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +65 -0
  28. package/deps/rocksdb/rocksdb/db/db_etc3_test.cc +161 -0
  29. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -0
  30. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +20 -7
  31. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +13 -0
  32. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +114 -39
  33. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -0
  34. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +3 -3
  35. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
  36. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +39 -25
  37. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +361 -0
  38. package/deps/rocksdb/rocksdb/db/db_options_test.cc +35 -0
  39. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +83 -0
  40. package/deps/rocksdb/rocksdb/db/db_test.cc +249 -4
  41. package/deps/rocksdb/rocksdb/db/db_test2.cc +3 -0
  42. package/deps/rocksdb/rocksdb/db/db_test_util.cc +2 -1
  43. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +3 -2
  44. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -7
  45. package/deps/rocksdb/rocksdb/db/listener_test.cc +7 -17
  46. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +4 -2
  47. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +41 -0
  48. package/deps/rocksdb/rocksdb/db/repair.cc +2 -2
  49. package/deps/rocksdb/rocksdb/db/version_edit.h +7 -4
  50. package/deps/rocksdb/rocksdb/db/version_set.cc +299 -90
  51. package/deps/rocksdb/rocksdb/db/version_set.h +56 -9
  52. package/deps/rocksdb/rocksdb/db/version_set_test.cc +41 -39
  53. package/deps/rocksdb/rocksdb/db/version_util.h +3 -2
  54. package/deps/rocksdb/rocksdb/db/wal_manager.cc +7 -1
  55. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +48 -10
  56. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
  57. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +5 -1
  58. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +16 -5
  59. package/deps/rocksdb/rocksdb/env/env_test.cc +126 -41
  60. package/deps/rocksdb/rocksdb/env/fs_posix.cc +14 -7
  61. package/deps/rocksdb/rocksdb/env/io_posix.cc +304 -112
  62. package/deps/rocksdb/rocksdb/env/io_posix.h +16 -4
  63. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  64. package/deps/rocksdb/rocksdb/folly.mk +148 -0
  65. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +29 -3
  66. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +73 -0
  67. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +246 -0
  68. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +0 -2
  69. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +15 -9
  70. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +19 -9
  71. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -1
  72. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -4
  73. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +14 -0
  74. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +67 -6
  75. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +1 -7
  76. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +3 -0
  77. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +6 -14
  78. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +8 -1
  79. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_mirror.h +2 -2
  80. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h +0 -4
  81. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/option_change_migration.h +33 -5
  82. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +6 -0
  83. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  84. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +2 -0
  85. package/deps/rocksdb/rocksdb/monitoring/thread_status_impl.cc +5 -2
  86. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +2 -2
  87. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.h +6 -6
  88. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc +2 -2
  89. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +10 -5
  90. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.h +2 -2
  91. package/deps/rocksdb/rocksdb/options/cf_options.cc +15 -3
  92. package/deps/rocksdb/rocksdb/options/cf_options.h +7 -0
  93. package/deps/rocksdb/rocksdb/options/db_options.cc +27 -36
  94. package/deps/rocksdb/rocksdb/options/db_options.h +3 -2
  95. package/deps/rocksdb/rocksdb/options/options.cc +4 -0
  96. package/deps/rocksdb/rocksdb/options/options_helper.cc +8 -2
  97. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +4 -1
  98. package/deps/rocksdb/rocksdb/options/options_test.cc +19 -3
  99. package/deps/rocksdb/rocksdb/src.mk +1 -1
  100. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +155 -32
  101. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +7 -3
  102. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +169 -125
  103. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +22 -7
  104. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +43 -24
  105. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +9 -5
  106. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +9 -8
  107. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +17 -0
  108. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +15 -5
  109. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +13 -18
  110. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +29 -0
  111. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +6 -0
  112. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +15 -0
  113. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +79 -19
  114. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +48 -20
  115. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +51 -0
  116. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +19 -0
  117. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +1 -1
  118. package/deps/rocksdb/rocksdb/table/external_table.cc +2 -2
  119. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +3 -2
  120. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +3 -1
  121. package/deps/rocksdb/rocksdb/table/table_builder.h +5 -0
  122. package/deps/rocksdb/rocksdb/table/table_reader.h +4 -2
  123. package/deps/rocksdb/rocksdb/table/table_test.cc +48 -39
  124. package/deps/rocksdb/rocksdb/test_util/sync_point.cc +4 -0
  125. package/deps/rocksdb/rocksdb/test_util/sync_point.h +32 -0
  126. package/deps/rocksdb/rocksdb/test_util/testutil.h +6 -2
  127. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +14 -4
  128. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +8 -5
  129. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +3 -2
  130. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +63 -12
  131. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +16 -1
  132. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +5 -1
  133. package/deps/rocksdb/rocksdb/util/bit_fields.h +133 -23
  134. package/deps/rocksdb/rocksdb/util/bloom_test.cc +2 -5
  135. package/deps/rocksdb/rocksdb/util/compression.cc +51 -23
  136. package/deps/rocksdb/rocksdb/util/compression_test.cc +525 -270
  137. package/deps/rocksdb/rocksdb/util/filter_bench.cc +3 -4
  138. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +11 -2
  139. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -1
  140. package/deps/rocksdb/rocksdb/util/slice_test.cc +92 -0
  141. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +2 -2
  142. package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -2
  143. package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +2 -2
  144. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +19 -2
  145. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +75 -0
  146. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +1 -0
  147. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +303 -111
  148. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +379 -0
  149. package/deps/rocksdb/rocksdb.gyp +6 -4
  150. package/iterator.js +66 -70
  151. package/package.json +6 -6
  152. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  153. package/deps/rocksdb/rocksdb/table/block_based/index_builder_test.cc +0 -183
@@ -143,6 +143,7 @@ void PartitionedFilterBlockBuilder::CutAFilterBlock(const Slice* next_key,
143
143
  ikey = p_index_builder_->GetPartitionKey();
144
144
  }
145
145
  filters_.push_back({std::move(ikey), std::move(filter_data), filter});
146
+ completed_partitions_size_.FetchAddRelaxed(filter.size());
146
147
  partitioned_filters_construction_status_.UpdateIfOk(
147
148
  filter_construction_status);
148
149
 
@@ -209,6 +210,56 @@ size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() {
209
210
  return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded();
210
211
  }
211
212
 
213
+ size_t PartitionedFilterBlockBuilder::CurrentFilterSizeEstimate() {
214
+ size_t active_partition_size =
215
+ filter_bits_builder_->EstimateEntriesAdded() * 2; // 2 bytes per key
216
+
217
+ return estimated_filter_size_.LoadRelaxed() + active_partition_size;
218
+ }
219
+
220
+ void PartitionedFilterBlockBuilder::OnDataBlockFinalized(
221
+ uint64_t num_data_blocks) {
222
+ UpdateFilterSizeEstimate(num_data_blocks);
223
+ }
224
+
225
+ void PartitionedFilterBlockBuilder::UpdateFilterSizeEstimate(
226
+ uint64_t num_data_blocks) {
227
+ size_t partitions_size = completed_partitions_size_.LoadRelaxed();
228
+
229
+ // Reserve space if no partitions have been cut
230
+ size_t active_filter_estimate = 0;
231
+ if (partitions_size == 0) {
232
+ size_t avg_bytes_per_entry =
233
+ 2; // 2 bytes per entry, approx 15 bits per key
234
+
235
+ // Estimate using keys_per_partition_ since we expect to cut the first
236
+ // partition once it reaches approx. this many entries.
237
+ active_filter_estimate = keys_per_partition_ * avg_bytes_per_entry;
238
+
239
+ // Add a 2x buffer (for top-level index, etc.)
240
+ active_filter_estimate = active_filter_estimate * 2;
241
+ }
242
+ size_t filter_estimate = std::max(partitions_size, active_filter_estimate);
243
+
244
+ // Estimate top-level partition index size
245
+ if (p_index_builder_->separator_is_key_plus_seq()) {
246
+ filter_estimate += index_on_filter_block_builder_.CurrentSizeEstimate();
247
+ } else {
248
+ filter_estimate +=
249
+ index_on_filter_block_builder_without_seq_.CurrentSizeEstimate();
250
+ }
251
+
252
+ // Reserve filter space for the next data block
253
+ size_t reserved = 0;
254
+ if (num_data_blocks > 0) {
255
+ reserved = (filter_estimate / num_data_blocks) *
256
+ 2; // 2x average size per data block
257
+ estimated_filter_size_.StoreRelaxed(filter_estimate + reserved);
258
+ } else {
259
+ estimated_filter_size_.StoreRelaxed(filter_estimate);
260
+ }
261
+ }
262
+
212
263
  void PartitionedFilterBlockBuilder::PrevKeyBeforeFinish(
213
264
  const Slice& prev_key_without_ts) {
214
265
  assert(prev_key_without_ts.compare(DEBUG_add_with_prev_key_called_
@@ -18,6 +18,7 @@
18
18
  #include "table/block_based/filter_block_reader_common.h"
19
19
  #include "table/block_based/full_filter_block.h"
20
20
  #include "table/block_based/index_builder.h"
21
+ #include "util/atomic.h"
21
22
  #include "util/autovector.h"
22
23
  #include "util/hash_containers.h"
23
24
 
@@ -46,6 +47,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
46
47
  }
47
48
 
48
49
  size_t EstimateEntriesAdded() override;
50
+ size_t CurrentFilterSizeEstimate() override;
51
+ void OnDataBlockFinalized(uint64_t num_data_blocks) override;
49
52
 
50
53
  void PrevKeyBeforeFinish(const Slice& prev_key_without_ts) override;
51
54
  Status Finish(const BlockHandle& last_partition_block_handle, Slice* filter,
@@ -67,6 +70,11 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
67
70
  return Status::OK();
68
71
  }
69
72
 
73
+ protected:
74
+ // Needs to be thread-safe to be invoked from background worker
75
+ // thread when parallel compression is enabled.
76
+ void UpdateFilterSizeEstimate(uint64_t num_data_blocks) override;
77
+
70
78
  private: // fns
71
79
  // Whether to cut a filter block before the next key
72
80
  bool DecideCutAFilterBlock();
@@ -92,6 +100,11 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
92
100
  };
93
101
  std::deque<FilterEntry> filters_; // list of partitioned filters and keys
94
102
  // used in building the index
103
+ // Running total of completed filter partition sizes to avoid
104
+ // iterating over filters_ deque, which can be concurrently modified by
105
+ // the main thread when parallel compression is enabled.
106
+ RelaxedAtomic<size_t> completed_partitions_size_{0};
107
+
95
108
  // The desired number of keys per partition
96
109
  uint32_t keys_per_partition_;
97
110
  // According to the bits builders, how many keys/prefixes added
@@ -107,6 +120,12 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
107
120
  // For Add without prev key
108
121
  std::string prev_key_without_ts_;
109
122
 
123
+ // Cached filter size estimate for hot path performance - updated only when
124
+ // data blocks are written for meaningful estimate updates.
125
+ // Must be atomic since UpdateFilterSizeEstimate() can be called from
126
+ // background worker threads when parallel compression is enabled.
127
+ RelaxedAtomic<size_t> estimated_filter_size_{0};
128
+
110
129
  #ifndef NDEBUG
111
130
  // For verifying accurate previous keys are provided by the caller, so that
112
131
  // release code can be fast
@@ -158,7 +158,7 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
158
158
 
159
159
  size_t IndexSize() const override { return index_size_; }
160
160
 
161
- uint64_t EstimateCurrentIndexSize() const override { return 0; }
161
+ uint64_t CurrentIndexSizeEstimate() const override { return 0; }
162
162
 
163
163
  bool separator_is_key_plus_seq() override {
164
164
  return internal_index_builder_->separator_is_key_plus_seq();
@@ -239,8 +239,8 @@ class ExternalTableReaderAdapter : public TableReader {
239
239
  "Get() not supported on external file iterator");
240
240
  }
241
241
 
242
- virtual Status VerifyChecksum(const ReadOptions& /*ro*/,
243
- TableReaderCaller /*caller*/) override {
242
+ Status VerifyChecksum(const ReadOptions& /*ro*/, TableReaderCaller /*caller*/,
243
+ bool /*meta_blocks_only*/ = false) override {
244
244
  return Status::OK();
245
245
  }
246
246
 
@@ -47,12 +47,13 @@ SstFileDumper::SstFileDumper(const Options& options,
47
47
  Temperature file_temp, size_t readahead_size,
48
48
  bool verify_checksum, bool output_hex,
49
49
  bool decode_blob_index, const EnvOptions& soptions,
50
- bool silent)
50
+ bool silent, bool show_sequence_number_type)
51
51
  : file_name_(file_path),
52
52
  read_num_(0),
53
53
  file_temp_(file_temp),
54
54
  output_hex_(output_hex),
55
55
  decode_blob_index_(decode_blob_index),
56
+ show_sequence_number_type_(show_sequence_number_type),
56
57
  soptions_(soptions),
57
58
  silent_(silent),
58
59
  options_(options),
@@ -220,7 +221,7 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) {
220
221
  Env* env = options_.env;
221
222
  Status s = env->NewWritableFile(out_filename, &out_file, soptions_);
222
223
  if (s.ok()) {
223
- s = table_reader_->DumpTable(out_file.get());
224
+ s = table_reader_->DumpTable(out_file.get(), show_sequence_number_type_);
224
225
  }
225
226
  if (!s.ok()) {
226
227
  // close the file before return error, ignore the close error if there's any
@@ -21,7 +21,8 @@ class SstFileDumper {
21
21
  bool verify_checksum, bool output_hex,
22
22
  bool decode_blob_index,
23
23
  const EnvOptions& soptions = EnvOptions(),
24
- bool silent = false);
24
+ bool silent = false,
25
+ bool show_sequence_number_type = false);
25
26
 
26
27
  // read_num_limit limits the total number of keys read. If read_num_limit = 0,
27
28
  // then there is no limit. If read_num_limit = 0 or
@@ -79,6 +80,7 @@ class SstFileDumper {
79
80
  Temperature file_temp_;
80
81
  bool output_hex_;
81
82
  bool decode_blob_index_;
83
+ bool show_sequence_number_type_;
82
84
  EnvOptions soptions_;
83
85
  // less verbose in stdout/stderr
84
86
  bool silent_;
@@ -225,6 +225,11 @@ class TableBuilder {
225
225
  // is enabled.
226
226
  virtual uint64_t EstimatedFileSize() const { return FileSize(); }
227
227
 
228
+ // Estimated tail size of the SST file generated so far. The "tail" refers to
229
+ // all blocks written after data blocks (index + filter). This value helps
230
+ // estimate the total file size when deciding when to cut files.
231
+ virtual uint64_t EstimatedTailSize() const { return 0; }
232
+
228
233
  virtual uint64_t GetTailSize() const { return 0; }
229
234
 
230
235
  // If the user defined table properties collector suggest the file to
@@ -179,13 +179,15 @@ class TableReader {
179
179
  }
180
180
 
181
181
  // convert db file to a human readable form
182
- virtual Status DumpTable(WritableFile* /*out_file*/) {
182
+ virtual Status DumpTable(WritableFile* /*out_file*/,
183
+ bool /*show_sequence_number_type*/ = false) {
183
184
  return Status::NotSupported("DumpTable() not supported");
184
185
  }
185
186
 
186
187
  // check whether there is corruption in this db file
187
188
  virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
188
- TableReaderCaller /*caller*/) {
189
+ TableReaderCaller /*caller*/,
190
+ bool /*meta_blocks_only*/ = false) {
189
191
  return Status::NotSupported("VerifyChecksum() not supported");
190
192
  }
191
193
 
@@ -9140,6 +9140,7 @@ class UserDefinedIndexStressTest
9140
9140
  const std::vector<DataRange>& ranges,
9141
9141
  bool& data_added) {
9142
9142
  std::unique_ptr<SstFileWriter> writer;
9143
+
9143
9144
  data_added = false;
9144
9145
 
9145
9146
  std::vector<DataRange> ranges_in_file;
@@ -9151,9 +9152,12 @@ class UserDefinedIndexStressTest
9151
9152
  }
9152
9153
 
9153
9154
  if (writer == nullptr) {
9155
+ // delay creating the writer until there is data to be written, to avoid
9156
+ // unchecked status error
9154
9157
  writer = std::make_unique<SstFileWriter>(EnvOptions(), options_);
9155
9158
  ASSERT_OK(writer->Open(ingest_file));
9156
9159
  }
9160
+
9157
9161
  ranges_in_file.push_back(range);
9158
9162
 
9159
9163
  data_added = true;
@@ -9256,6 +9260,7 @@ class UserDefinedIndexStressTest
9256
9260
  if (kVerbose) {
9257
9261
  std::cout << "iteration " << i << std::endl;
9258
9262
  }
9263
+ SCOPED_TRACE("Iteration " + std::to_string(i));
9259
9264
  // randomly generate 1 to 3 ranges
9260
9265
  auto ranges = GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "");
9261
9266
 
@@ -9299,13 +9304,14 @@ class UserDefinedIndexStressTest
9299
9304
  size_t& ingest_file_count,
9300
9305
  const IngestExternalFileOptions& ifo,
9301
9306
  bool combine_ranges = false) {
9302
- std::vector<std::string> ingest_files;
9303
9307
  // Generate SST file and bulk load them one level at a time
9308
+ std::vector<std::string> ingest_files;
9304
9309
  if (combine_ranges) {
9305
9310
  size_t i = 0;
9306
9311
  while (i < ranges_in_level.size()) {
9307
9312
  // if combine ranges, generate 1 SST file that combines multiple ranges
9308
9313
  // together
9314
+ // Randomly combine ranges to SST file.
9309
9315
  size_t batch_end_idx =
9310
9316
  std::min(i + rnd.Uniform(3) + 2, ranges_in_level.size());
9311
9317
  bool data_added = false;
@@ -9361,22 +9367,7 @@ class UserDefinedIndexStressTest
9361
9367
  }
9362
9368
  };
9363
9369
 
9364
- // TODO(xingbo)
9365
- // This test is disabled due to following test case condition:
9366
- // level n: delete range 4-6
9367
- // level n+1: data range 0-------10
9368
- // query: 3-9, count=2.
9369
- // Becuase query count == 2, level n+1 would only prepare 3-5. but since 4-6
9370
- // got deleted in the upper level, they are not returned, so only 3 is
9371
- // returned. Meantime the query should have return [3, 6]
9372
- // One way to fix this is by preparing more data blocks once prepared blocks
9373
- // are exhausted, but upper bound is not reached yet. This requires following
9374
- // changes:
9375
- // 1. Fix out of bound flag in block table iterator. Only set it if the key is
9376
- // larger than the upper bound.
9377
- // 2. Refactor the prepared block single dimension vector into 2 dimension of
9378
- // vectors, so that more blocks could be prepared if needed.
9379
- TEST_P(UserDefinedIndexStressTest, DISABLED_PartialDeleteRange) {
9370
+ TEST_P(UserDefinedIndexStressTest, PartialDeleteRange) {
9380
9371
  // Create 2 column families. One use normal put/del, the other uses sst
9381
9372
  // ingest Randomly generate multiple non overlapping range for multiple
9382
9373
  // levels Range scan same range between the 2 CF and validate the result is
@@ -9387,6 +9378,22 @@ TEST_P(UserDefinedIndexStressTest, DISABLED_PartialDeleteRange) {
9387
9378
  SCOPED_TRACE("dbname: " + dbname_);
9388
9379
  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
9389
9380
 
9381
+ if (enable_udi_) {
9382
+ // Skip UDI for now.
9383
+ // The issue is that with UDI enabled, prepare might not prepare enough keys
9384
+ // at lower level due to range delete from upper level.
9385
+ // E.g. consider a LSM tree:
9386
+ // L1: Data [0-1]
9387
+ // L2: Delete Range [0-6]
9388
+ // L3: Data [0-9]
9389
+ // When multiscan queries range [0-9) with UDI count as 3, the L3 file
9390
+ // will only prepare range [0-3). However, this range is masked out by upper
9391
+ // layer delete range from [0-6] from L2. This causes query to only return
9392
+ // [0,1], while [0,1,7] is the right result. Until prepare is able to
9393
+ // prepare additional blocks, UDI is skipped.
9394
+ return;
9395
+ }
9396
+
9390
9397
  for (int i = 0; i < 5; i++) {
9391
9398
  ranges_in_levels_.push_back(
9392
9399
  GenerateKeyRanges(rnd.Uniform(3) + 4, 2,
@@ -9408,9 +9415,9 @@ TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
9408
9415
  // Create 2 column families. One use normal put/del, the other uses sst
9409
9416
  // ingest.
9410
9417
  // Test the case where there are 3 levels, the middle level is a delete
9411
- // range file that span across the entire key space. The top level file have
9412
- // multiple files and each one has both data and delete range Scan same
9413
- // range between the 2 CF and validate the result is same
9418
+ // range file that span across the entire key space. The top and bottom level
9419
+ // file have multiple files and each one has both data and delete range. Scan
9420
+ // same range between the 2 CF and validate the result is same
9414
9421
  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
9415
9422
  dbname_ = test::PerThreadDBPath(
9416
9423
  "UserDefinedIndexStressTest_DeleteRangeMixedWithDataFile");
@@ -9418,9 +9425,9 @@ TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
9418
9425
  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
9419
9426
 
9420
9427
  // Test 3 levels.
9421
- // bottom level is normal data files.
9422
- ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
9423
- // middle level delete range between each level
9428
+ // Bottom level is mixed data with delete range.
9429
+ ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 6, 2, "L6"));
9430
+ // Middle level delete range across entire key space.
9424
9431
  if (is_reverse_comparator_) {
9425
9432
  ranges_in_levels_.push_back({{.start = 100,
9426
9433
  .end = 0,
@@ -9437,8 +9444,8 @@ TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
9437
9444
  .end_key = "keyz"}});
9438
9445
  }
9439
9446
 
9440
- // Top level is normal data files
9441
- ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
9447
+ // Top level is mixed data with delete range.
9448
+ ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 6, 2, "L4"));
9442
9449
 
9443
9450
  IngestExternalFileOptions ifo;
9444
9451
  ifo.snapshot_consistency = false;
@@ -9448,7 +9455,7 @@ TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
9448
9455
  for (auto const& ranges_in_level : ranges_in_levels_) {
9449
9456
  ASSERT_NO_FATAL_FAILURE(
9450
9457
  IngestFilesInOneLevel(ranges_in_level, ingest_file_name_prefix,
9451
- ingest_file_count, ifo, true));
9458
+ ingest_file_count, ifo, /*combine_ranges=*/true));
9452
9459
  if (first_level) {
9453
9460
  first_level = false;
9454
9461
  if (enable_compaction_with_sst_partitioner_) {
@@ -9475,9 +9482,10 @@ TEST_P(UserDefinedIndexStressTest, DeleteRange) {
9475
9482
  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
9476
9483
 
9477
9484
  // Test 3 levels.
9478
- // bottom level is normal data files.
9485
+ // bottom level contains multiple files, each could have data or delete
9486
+ // ranges or both.
9479
9487
  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
9480
- // middle level delete range between each level
9488
+ // middle level delete range across entire key space
9481
9489
  if (is_reverse_comparator_) {
9482
9490
  ranges_in_levels_.push_back({{.start = 100,
9483
9491
  .end = 0,
@@ -9493,7 +9501,8 @@ TEST_P(UserDefinedIndexStressTest, DeleteRange) {
9493
9501
  .start_key = "key",
9494
9502
  .end_key = "keyz"}});
9495
9503
  }
9496
- // Top level is normal data files
9504
+ // Top level contains multiple files, each could have data or delete
9505
+ // ranges or both.
9497
9506
  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
9498
9507
 
9499
9508
  IngestExternalFileOptions ifo;
@@ -9519,20 +9528,19 @@ TEST_P(UserDefinedIndexStressTest, DeleteRange) {
9519
9528
  }
9520
9529
 
9521
9530
  TEST_P(UserDefinedIndexStressTest, AtomicReplaceBulkLoad) {
9522
- // Create 2 column families. One use normal put/del, the other uses sst
9523
- // ingest.
9524
- // Test the case where there are 3 levels, the middle level is a delete
9525
- // range file that span across the entire key space. Range scan same range
9526
- // between the 2 CF and validate the result is same
9531
+ // Create 2 column families. One use normal put/del, the other uses SST
9532
+ // ingest. The SST ingest uses atomic range replace.
9527
9533
  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
9528
- dbname_ = test::PerThreadDBPath("UserDefinedIndexStressTest_DeleteRange");
9534
+ dbname_ =
9535
+ test::PerThreadDBPath("UserDefinedIndexStressTest_AtomicReplaceBulkLoad");
9529
9536
  SCOPED_TRACE("dbname: " + dbname_);
9530
9537
  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
9531
9538
 
9532
9539
  // Test 3 levels.
9533
- // bottom level is normal data files.
9540
+ // bottom level contains multiple files, each could have data or delete
9541
+ // ranges or both.
9534
9542
  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
9535
- // middle level delete range between each level
9543
+ // middle level delete range across entire key space
9536
9544
  if (is_reverse_comparator_) {
9537
9545
  ranges_in_levels_.push_back({{.start = 100,
9538
9546
  .end = 0,
@@ -9548,7 +9556,8 @@ TEST_P(UserDefinedIndexStressTest, AtomicReplaceBulkLoad) {
9548
9556
  .start_key = "key",
9549
9557
  .end_key = "keyz"}});
9550
9558
  }
9551
- // Top level is normal data files
9559
+ // Top level contains multiple files, each could have data or delete
9560
+ // ranges or both.
9552
9561
  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
9553
9562
 
9554
9563
  IngestExternalFileOptions ifo;
@@ -9569,7 +9578,7 @@ TEST_P(UserDefinedIndexStressTest, AtomicReplaceBulkLoad) {
9569
9578
  }
9570
9579
 
9571
9580
  // Ingest a new file with atomic replace with full key space, this layer
9572
- // is exactly same as the one at Level 4
9581
+ // is exactly same as the one at the top level
9573
9582
  bool data_added;
9574
9583
  ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges(
9575
9584
  ingest_file_name_prefix + std::to_string(++ingest_file_count),
@@ -79,4 +79,8 @@ void SetupSyncPointsToMockDirectIO() {
79
79
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
80
80
  #endif
81
81
  }
82
+
83
+ #ifndef NDEBUG
84
+ std::atomic<int> g_throw_on_testable_assertion_failure{0};
85
+ #endif // NDEBUG
82
86
  } // namespace ROCKSDB_NAMESPACE
@@ -180,3 +180,35 @@ void SetupSyncPointsToMockDirectIO();
180
180
  } \
181
181
  }
182
182
  #endif // NDEBUG
183
+
184
+ // An alternative to assert() that is more test-friendly than using
185
+ // ASSERT_DEATH. Relies on exception propagation.
186
+ #ifdef NDEBUG
187
+ #define testable_assert(cond)
188
+ #else
189
+ namespace ROCKSDB_NAMESPACE {
190
+ // Intentionally not based on std::exception to reduce places where this
191
+ // would be caught
192
+ struct TestableAssertionFailure {};
193
+ extern std::atomic<int> g_throw_on_testable_assertion_failure;
194
+ } // namespace ROCKSDB_NAMESPACE
195
+ #define testable_assert(cond) \
196
+ do { \
197
+ if (ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.load( \
198
+ std::memory_order_relaxed) > 0) { \
199
+ if (cond) { \
200
+ } else \
201
+ throw ROCKSDB_NAMESPACE::TestableAssertionFailure(); \
202
+ } else { \
203
+ assert(cond); \
204
+ } \
205
+ } while (0)
206
+ #define ASSERT_TESTABLE_FAILURE(expr) \
207
+ do { \
208
+ ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_add( \
209
+ 1, std::memory_order_relaxed); \
210
+ ASSERT_THROW(expr, ROCKSDB_NAMESPACE::TestableAssertionFailure); \
211
+ ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_sub( \
212
+ 1, std::memory_order_relaxed); \
213
+ } while (0)
214
+ #endif
@@ -766,6 +766,10 @@ struct CompressorCustomAlg : public CompressorWrapper {
766
766
  return kCompression;
767
767
  }
768
768
 
769
+ std::unique_ptr<Compressor> Clone() const override {
770
+ return std::make_unique<CompressorCustomAlg>(wrapped_->Clone());
771
+ }
772
+
769
773
  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
770
774
  size_t* compressed_output_size,
771
775
  CompressionType* out_compression_type,
@@ -792,9 +796,9 @@ struct CompressorCustomAlg : public CompressorWrapper {
792
796
  }
793
797
 
794
798
  std::unique_ptr<Compressor> MaybeCloneSpecialized(
795
- CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
799
+ CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override {
796
800
  auto clone =
797
- wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
801
+ wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_samples));
798
802
  return std::make_unique<CompressorCustomAlg>(std::move(clone));
799
803
  }
800
804
 
@@ -446,6 +446,14 @@ DEFINE_int64(db_write_buffer_size,
446
446
  ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
447
447
  "Number of bytes to buffer in all memtables before compacting");
448
448
 
449
+ DEFINE_int64(max_manifest_file_size,
450
+ ROCKSDB_NAMESPACE::Options().max_manifest_file_size,
451
+ "Max manifest file size (or minimum max with auto-tuning)");
452
+
453
+ DEFINE_int32(max_manifest_space_amp_pct,
454
+ ROCKSDB_NAMESPACE::Options().max_manifest_space_amp_pct,
455
+ "Max manifest space amp percentage for auto-tuning");
456
+
449
457
  DEFINE_bool(cost_write_buffer_to_cache, false,
450
458
  "The usage of memtable is costed to the block cache");
451
459
 
@@ -579,7 +587,7 @@ DEFINE_double(cache_high_pri_pool_ratio, 0.0,
579
587
  DEFINE_double(cache_low_pri_pool_ratio, 0.0,
580
588
  "Ratio of block cache reserve for low pri blocks.");
581
589
 
582
- DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
590
+ DEFINE_string(cache_type, "hyper_clock_cache", "Type of block cache.");
583
591
 
584
592
  DEFINE_bool(use_compressed_secondary_cache, false,
585
593
  "Use the CompressedSecondaryCache as the secondary cache.");
@@ -3252,10 +3260,10 @@ class Benchmark {
3252
3260
  db_bench_exit(1);
3253
3261
  } else if (EndsWith(FLAGS_cache_type, "hyper_clock_cache")) {
3254
3262
  size_t estimated_entry_charge;
3255
- if (FLAGS_cache_type == "fixed_hyper_clock_cache" ||
3256
- FLAGS_cache_type == "hyper_clock_cache") {
3263
+ if (FLAGS_cache_type == "fixed_hyper_clock_cache") {
3257
3264
  estimated_entry_charge = FLAGS_block_size;
3258
- } else if (FLAGS_cache_type == "auto_hyper_clock_cache") {
3265
+ } else if (FLAGS_cache_type == "auto_hyper_clock_cache" ||
3266
+ FLAGS_cache_type == "hyper_clock_cache") {
3259
3267
  estimated_entry_charge = 0;
3260
3268
  } else {
3261
3269
  fprintf(stderr, "Cache type not supported.");
@@ -4368,6 +4376,8 @@ class Benchmark {
4368
4376
  options.write_buffer_manager.reset(
4369
4377
  new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
4370
4378
  }
4379
+ options.max_manifest_file_size = FLAGS_max_manifest_file_size;
4380
+ options.max_manifest_space_amp_pct = FLAGS_max_manifest_space_amp_pct;
4371
4381
  options.arena_block_size = FLAGS_arena_block_size;
4372
4382
  options.write_buffer_size = FLAGS_write_buffer_size;
4373
4383
  options.max_write_buffer_number = FLAGS_max_write_buffer_number;
@@ -1610,11 +1610,12 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex,
1610
1610
  WriteController wc(options.delayed_write_rate);
1611
1611
  WriteBufferManager wb(options.db_write_buffer_size);
1612
1612
  ImmutableDBOptions immutable_db_options(options);
1613
- VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
1613
+ VersionSet versions(dbname, &immutable_db_options, MutableDBOptions{}, sopt,
1614
+ tc.get(), &wb, &wc,
1614
1615
  /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
1615
1616
  /*db_id=*/"", /*db_session_id=*/"",
1616
1617
  options.daily_offpeak_time_utc,
1617
- /*error_handler=*/nullptr, /*read_only=*/true);
1618
+ /*error_handler=*/nullptr, /*unchanging=*/true);
1618
1619
  Status s = versions.DumpManifest(options, file, verbose, hex, json, cf_descs);
1619
1620
  if (!s.ok()) {
1620
1621
  fprintf(stderr, "Error in processing file %s %s\n", file.c_str(),
@@ -1805,11 +1806,12 @@ Status GetLiveFilesChecksumInfoFromVersionSet(Options options,
1805
1806
  WriteController wc(options.delayed_write_rate);
1806
1807
  WriteBufferManager wb(options.db_write_buffer_size);
1807
1808
  ImmutableDBOptions immutable_db_options(options);
1808
- VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
1809
+ VersionSet versions(dbname, &immutable_db_options, MutableDBOptions{options},
1810
+ sopt, tc.get(), &wb, &wc,
1809
1811
  /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
1810
1812
  /*db_id=*/"", /*db_session_id=*/"",
1811
1813
  options.daily_offpeak_time_utc,
1812
- /*error_handler=*/nullptr, /*read_only=*/true);
1814
+ /*error_handler=*/nullptr, /*unchanging=*/true);
1813
1815
  std::vector<std::string> cf_name_list;
1814
1816
  s = versions.ListColumnFamilies(&cf_name_list, db_path,
1815
1817
  immutable_db_options.fs.get());
@@ -2660,7 +2662,8 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) {
2660
2662
  const InternalKeyComparator cmp(opt.comparator);
2661
2663
  WriteController wc(opt.delayed_write_rate);
2662
2664
  WriteBufferManager wb(opt.db_write_buffer_size);
2663
- VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc,
2665
+ VersionSet versions(db_path_, &db_options, MutableDBOptions{opt}, soptions,
2666
+ tc.get(), &wb, &wc,
2664
2667
  /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
2665
2668
  /*db_id=*/"", /*db_session_id=*/"",
2666
2669
  opt.daily_offpeak_time_utc,
@@ -208,8 +208,9 @@ class FileChecksumTestHelper {
208
208
  WriteController wc(options_.delayed_write_rate);
209
209
  WriteBufferManager wb(options_.db_write_buffer_size);
210
210
  ImmutableDBOptions immutable_db_options(options_);
211
- VersionSet versions(dbname_, &immutable_db_options, sopt, tc.get(), &wb,
212
- &wc, nullptr, nullptr, "", "",
211
+ VersionSet versions(dbname_, &immutable_db_options,
212
+ MutableDBOptions{options_}, sopt, tc.get(), &wb, &wc,
213
+ nullptr, nullptr, "", "",
213
214
  options_.daily_offpeak_time_utc, nullptr,
214
215
  /*read_only=*/false);
215
216
  std::vector<std::string> cf_name_list;