@nxtedition/rocksdb 7.1.14 → 7.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223) hide show
  1. package/binding.cc +1 -0
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +72 -18
  3. package/deps/rocksdb/rocksdb/Makefile +91 -11
  4. package/deps/rocksdb/rocksdb/TARGETS +8 -4
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +5 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +13 -8
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
  8. package/deps/rocksdb/rocksdb/cache/cache_test.cc +116 -57
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +958 -459
  10. package/deps/rocksdb/rocksdb/cache/clock_cache.h +407 -622
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +104 -40
  12. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +23 -8
  13. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +350 -184
  14. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +12 -2
  15. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +2 -0
  16. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +130 -43
  17. package/deps/rocksdb/rocksdb/cache/lru_cache.h +24 -2
  18. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +423 -98
  19. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +19 -2
  20. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +10 -7
  21. package/deps/rocksdb/rocksdb/crash_test.mk +2 -2
  22. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +46 -26
  23. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +9 -3
  24. package/deps/rocksdb/rocksdb/db/blob/blob_contents.cc +90 -0
  25. package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +56 -0
  26. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -10
  27. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +64 -59
  28. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +11 -8
  29. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +92 -62
  30. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +159 -136
  31. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +13 -13
  32. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +129 -57
  33. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +81 -3
  34. package/deps/rocksdb/rocksdb/db/c.cc +29 -0
  35. package/deps/rocksdb/rocksdb/db/column_family.cc +10 -1
  36. package/deps/rocksdb/rocksdb/db/column_family_test.cc +21 -0
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +42 -36
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +344 -102
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +163 -28
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +52 -17
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +35 -30
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -3
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +167 -11
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +8 -8
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +10 -13
  46. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +0 -117
  47. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +6 -49
  48. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +29 -4
  49. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +18 -11
  50. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +4 -10
  51. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +1 -1
  52. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +12 -0
  53. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +144 -93
  54. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +28 -32
  55. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +1 -1
  56. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -9
  57. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +2 -33
  58. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -5
  59. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +11 -0
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -2
  61. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +8 -0
  62. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +2 -1
  63. package/deps/rocksdb/rocksdb/db/db_iter.cc +76 -138
  64. package/deps/rocksdb/rocksdb/db/db_iter.h +26 -23
  65. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +1 -1
  66. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +931 -0
  67. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +2 -2
  68. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -0
  69. package/deps/rocksdb/rocksdb/db/db_test2.cc +44 -22
  70. package/deps/rocksdb/rocksdb/db/db_test_util.cc +6 -14
  71. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +155 -0
  72. package/deps/rocksdb/rocksdb/db/db_write_test.cc +45 -0
  73. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -1
  74. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +8 -0
  75. package/deps/rocksdb/rocksdb/db/experimental.cc +5 -1
  76. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +24 -12
  77. package/deps/rocksdb/rocksdb/db/internal_stats.cc +7 -1
  78. package/deps/rocksdb/rocksdb/db/internal_stats.h +3 -0
  79. package/deps/rocksdb/rocksdb/db/memtable.cc +79 -18
  80. package/deps/rocksdb/rocksdb/db/memtable.h +5 -0
  81. package/deps/rocksdb/rocksdb/db/memtable_list.cc +26 -4
  82. package/deps/rocksdb/rocksdb/db/memtable_list.h +2 -1
  83. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +113 -0
  84. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +110 -0
  85. package/deps/rocksdb/rocksdb/db/{periodic_work_scheduler_test.cc → periodic_task_scheduler_test.cc} +33 -39
  86. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +12 -20
  87. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +6 -5
  88. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +12 -8
  89. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc +20 -5
  90. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +14 -0
  91. package/deps/rocksdb/rocksdb/db/repair.cc +17 -8
  92. package/deps/rocksdb/rocksdb/db/repair_test.cc +2 -1
  93. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +49 -66
  94. package/deps/rocksdb/rocksdb/db/table_cache.cc +92 -63
  95. package/deps/rocksdb/rocksdb/db/table_cache.h +16 -9
  96. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
  97. package/deps/rocksdb/rocksdb/db/table_properties_collector.cc +2 -2
  98. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -3
  99. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
  100. package/deps/rocksdb/rocksdb/db/version_builder.cc +1 -1
  101. package/deps/rocksdb/rocksdb/db/version_edit.h +1 -2
  102. package/deps/rocksdb/rocksdb/db/version_set.cc +379 -145
  103. package/deps/rocksdb/rocksdb/db/version_set.h +26 -24
  104. package/deps/rocksdb/rocksdb/db/version_set_test.cc +9 -9
  105. package/deps/rocksdb/rocksdb/db/version_util.h +3 -2
  106. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +10 -2
  107. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +2 -0
  108. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +5 -8
  109. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +5 -8
  110. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress.cc +2 -0
  111. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +71 -0
  112. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +14 -0
  113. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +23 -0
  114. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +26 -1
  115. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +105 -34
  116. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +16 -8
  117. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +6 -0
  118. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +4 -8
  119. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -8
  120. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +282 -25
  121. package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
  122. package/deps/rocksdb/rocksdb/env/io_posix.cc +3 -1
  123. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +367 -177
  124. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +144 -56
  125. package/deps/rocksdb/rocksdb/file/filename.cc +3 -3
  126. package/deps/rocksdb/rocksdb/file/filename.h +4 -2
  127. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +415 -0
  128. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +2 -0
  129. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +36 -45
  130. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +21 -3
  131. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +11 -11
  132. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +15 -1
  133. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +163 -68
  134. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +26 -12
  135. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +23 -5
  136. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +21 -17
  137. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +17 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/persistent_cache.h +3 -3
  139. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +17 -6
  140. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +3 -0
  141. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -0
  142. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +3 -3
  143. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/option_change_migration.h +4 -0
  144. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  145. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +3 -0
  146. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -1
  147. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch_base.h +2 -1
  148. package/deps/rocksdb/rocksdb/logging/env_logger.h +2 -2
  149. package/deps/rocksdb/rocksdb/monitoring/histogram.cc +4 -2
  150. package/deps/rocksdb/rocksdb/monitoring/histogram.h +2 -0
  151. package/deps/rocksdb/rocksdb/monitoring/histogram_test.cc +15 -1
  152. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.cc +17 -0
  153. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +14 -3
  154. package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +3 -0
  155. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +50 -0
  156. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +1 -0
  157. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +31 -32
  158. package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -1
  159. package/deps/rocksdb/rocksdb/options/options.cc +2 -2
  160. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -1
  161. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +1 -0
  162. package/deps/rocksdb/rocksdb/src.mk +4 -2
  163. package/deps/rocksdb/rocksdb/table/block_based/block.h +9 -8
  164. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +110 -99
  165. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +12 -10
  166. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +11 -2
  167. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +138 -83
  168. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +25 -24
  169. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +31 -30
  170. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +16 -13
  171. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +4 -4
  172. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -3
  173. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +3 -3
  174. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +17 -19
  175. package/deps/rocksdb/rocksdb/table/block_fetcher.h +1 -1
  176. package/deps/rocksdb/rocksdb/table/format.cc +26 -29
  177. package/deps/rocksdb/rocksdb/table/format.h +44 -26
  178. package/deps/rocksdb/rocksdb/table/get_context.cc +17 -12
  179. package/deps/rocksdb/rocksdb/table/internal_iterator.h +7 -0
  180. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +4 -0
  181. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +950 -104
  182. package/deps/rocksdb/rocksdb/table/merging_iterator.h +28 -1
  183. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +3 -2
  184. package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -1
  185. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.cc +10 -9
  186. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.h +22 -20
  187. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +1 -1
  188. package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +1 -1
  189. package/deps/rocksdb/rocksdb/table/table_builder.h +9 -21
  190. package/deps/rocksdb/rocksdb/table/table_test.cc +12 -12
  191. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +4 -4
  192. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +1 -0
  193. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +116 -34
  194. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +6 -1
  195. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +1 -1
  196. package/deps/rocksdb/rocksdb/util/autovector.h +12 -0
  197. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +3 -2
  198. package/deps/rocksdb/rocksdb/util/stderr_logger.cc +30 -0
  199. package/deps/rocksdb/rocksdb/util/stderr_logger.h +5 -18
  200. package/deps/rocksdb/rocksdb/util/timer.h +2 -3
  201. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +9 -2
  202. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +1 -1
  203. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +1 -1
  204. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +34 -53
  205. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +9 -14
  206. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -4
  207. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +4 -0
  208. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +1 -1
  209. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +4 -3
  210. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +3 -1
  211. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +26 -8
  212. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +114 -16
  213. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc +1 -1
  214. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +59 -0
  215. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +3 -0
  216. package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +39 -0
  217. package/deps/rocksdb/rocksdb.gyp +0 -1
  218. package/index.js +6 -10
  219. package/package.json +1 -1
  220. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  221. package/prebuilds/linux-x64/node.napi.node +0 -0
  222. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +0 -168
  223. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +0 -90
@@ -511,6 +511,7 @@ class FilePickerMultiGet {
511
511
  MultiGetRange& GetRange() { return range_; }
512
512
 
513
513
  void ReplaceRange(const MultiGetRange& other) {
514
+ assert(hit_file_ == nullptr);
514
515
  range_ = other;
515
516
  current_level_range_ = other;
516
517
  }
@@ -940,17 +941,18 @@ namespace {
940
941
  class LevelIterator final : public InternalIterator {
941
942
  public:
942
943
  // @param read_options Must outlive this iterator.
943
- LevelIterator(TableCache* table_cache, const ReadOptions& read_options,
944
- const FileOptions& file_options,
945
- const InternalKeyComparator& icomparator,
946
- const LevelFilesBrief* flevel,
947
- const std::shared_ptr<const SliceTransform>& prefix_extractor,
948
- bool should_sample, HistogramImpl* file_read_hist,
949
- TableReaderCaller caller, bool skip_filters, int level,
950
- RangeDelAggregator* range_del_agg,
951
- const std::vector<AtomicCompactionUnitBoundary>*
952
- compaction_boundaries = nullptr,
953
- bool allow_unprepared_value = false)
944
+ LevelIterator(
945
+ TableCache* table_cache, const ReadOptions& read_options,
946
+ const FileOptions& file_options, const InternalKeyComparator& icomparator,
947
+ const LevelFilesBrief* flevel,
948
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
949
+ bool should_sample, HistogramImpl* file_read_hist,
950
+ TableReaderCaller caller, bool skip_filters, int level,
951
+ RangeDelAggregator* range_del_agg,
952
+ const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
953
+ nullptr,
954
+ bool allow_unprepared_value = false,
955
+ TruncatedRangeDelIterator**** range_tombstone_iter_ptr_ = nullptr)
954
956
  : table_cache_(table_cache),
955
957
  read_options_(read_options),
956
958
  file_options_(file_options),
@@ -968,13 +970,23 @@ class LevelIterator final : public InternalIterator {
968
970
  range_del_agg_(range_del_agg),
969
971
  pinned_iters_mgr_(nullptr),
970
972
  compaction_boundaries_(compaction_boundaries),
971
- is_next_read_sequential_(false) {
973
+ is_next_read_sequential_(false),
974
+ range_tombstone_iter_(nullptr),
975
+ to_return_sentinel_(false) {
972
976
  // Empty level is not supported.
973
977
  assert(flevel_ != nullptr && flevel_->num_files > 0);
978
+ if (range_tombstone_iter_ptr_) {
979
+ *range_tombstone_iter_ptr_ = &range_tombstone_iter_;
980
+ }
974
981
  }
975
982
 
976
983
  ~LevelIterator() override { delete file_iter_.Set(nullptr); }
977
984
 
985
+ // Seek to the first file with a key >= target.
986
+ // If range_tombstone_iter_ is not nullptr, then we pretend that file
987
+ // boundaries are fake keys (sentinel keys). These keys are used to keep range
988
+ // tombstones alive even when all point keys in an SST file are exhausted.
989
+ // These sentinel keys will be skipped in merging iterator.
978
990
  void Seek(const Slice& target) override;
979
991
  void SeekForPrev(const Slice& target) override;
980
992
  void SeekToFirst() override;
@@ -983,14 +995,29 @@ class LevelIterator final : public InternalIterator {
983
995
  bool NextAndGetResult(IterateResult* result) override;
984
996
  void Prev() override;
985
997
 
986
- bool Valid() const override { return file_iter_.Valid(); }
998
+ // In addition to valid and invalid state (!file_iter_.Valid() and
999
+ // status.ok()), a third state of the iterator is when !file_iter_.Valid() and
1000
+ // to_return_sentinel_. This means we are at the end of a file, and a sentinel
1001
+ // key (the file boundary that we pretend is a key) is to be returned next.
1002
+ // file_iter_.Valid() and to_return_sentinel_ should not both be true.
1003
+ bool Valid() const override {
1004
+ assert(!(file_iter_.Valid() && to_return_sentinel_));
1005
+ return file_iter_.Valid() || to_return_sentinel_;
1006
+ }
987
1007
  Slice key() const override {
988
1008
  assert(Valid());
1009
+ if (to_return_sentinel_) {
1010
+ // Sentinel should be returned after file_iter_ reaches the end of the
1011
+ // file
1012
+ assert(!file_iter_.Valid());
1013
+ return sentinel_;
1014
+ }
989
1015
  return file_iter_.key();
990
1016
  }
991
1017
 
992
1018
  Slice value() const override {
993
1019
  assert(Valid());
1020
+ assert(!to_return_sentinel_);
994
1021
  return file_iter_.value();
995
1022
  }
996
1023
 
@@ -1032,6 +1059,8 @@ class LevelIterator final : public InternalIterator {
1032
1059
  file_iter_.iter() && file_iter_.IsValuePinned();
1033
1060
  }
1034
1061
 
1062
+ bool IsDeleteRangeSentinelKey() const override { return to_return_sentinel_; }
1063
+
1035
1064
  private:
1036
1065
  // Return true if at least one invalid file is seen and skipped.
1037
1066
  bool SkipEmptyFileForward();
@@ -1044,6 +1073,11 @@ class LevelIterator final : public InternalIterator {
1044
1073
  return flevel_->files[file_index].smallest_key;
1045
1074
  }
1046
1075
 
1076
+ const Slice& file_largest_key(size_t file_index) {
1077
+ assert(file_index < flevel_->num_files);
1078
+ return flevel_->files[file_index].largest_key;
1079
+ }
1080
+
1047
1081
  bool KeyReachedUpperBound(const Slice& internal_key) {
1048
1082
  return read_options_.iterate_upper_bound != nullptr &&
1049
1083
  user_comparator_.CompareWithoutTimestamp(
@@ -1051,6 +1085,16 @@ class LevelIterator final : public InternalIterator {
1051
1085
  *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0;
1052
1086
  }
1053
1087
 
1088
+ void ClearRangeTombstoneIter() {
1089
+ if (range_tombstone_iter_ && *range_tombstone_iter_) {
1090
+ delete *range_tombstone_iter_;
1091
+ *range_tombstone_iter_ = nullptr;
1092
+ }
1093
+ }
1094
+
1095
+ // Move file_iter_ to the file at file_index_.
1096
+ // range_tombstone_iter_ is updated with a range tombstone iterator
1097
+ // into the new file. Old range tombstone iterator is cleared.
1054
1098
  InternalIterator* NewFileIterator() {
1055
1099
  assert(file_index_ < flevel_->num_files);
1056
1100
  auto file_meta = flevel_->files[file_index_];
@@ -1065,13 +1109,14 @@ class LevelIterator final : public InternalIterator {
1065
1109
  largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
1066
1110
  }
1067
1111
  CheckMayBeOutOfLowerBound();
1112
+ ClearRangeTombstoneIter();
1068
1113
  return table_cache_->NewIterator(
1069
1114
  read_options_, file_options_, icomparator_, *file_meta.file_metadata,
1070
1115
  range_del_agg_, prefix_extractor_,
1071
1116
  nullptr /* don't need reference to table */, file_read_hist_, caller_,
1072
1117
  /*arena=*/nullptr, skip_filters_, level_,
1073
1118
  /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key,
1074
- largest_compaction_key, allow_unprepared_value_);
1119
+ largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_);
1075
1120
  }
1076
1121
 
1077
1122
  // Check if the current file is fully within iterate_lower_bound.
@@ -1117,9 +1162,51 @@ class LevelIterator final : public InternalIterator {
1117
1162
  const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
1118
1163
 
1119
1164
  bool is_next_read_sequential_;
1165
+
1166
+ // This is set when this level iterator is used under a merging iterator
1167
+ // that processes range tombstones. range_tombstone_iter_ points to where the
1168
+ // merging iterator stores the range tombstones iterator for this level. When
1169
+ // this level iterator moves to a new SST file, it updates the range
1170
+ // tombstones accordingly through this pointer. So the merging iterator always
1171
+ // has access to the current SST file's range tombstones.
1172
+ //
1173
+ // The level iterator treats file boundary as fake keys (sentinel keys) to
1174
+ // keep range tombstones alive if needed and make upper level, i.e. merging
1175
+ // iterator, aware of file changes (when level iterator moves to a new SST
1176
+ // file, there is some bookkeeping work that needs to be done at merging
1177
+ // iterator end).
1178
+ //
1179
+ // *range_tombstone_iter_ points to range tombstones of the current SST file
1180
+ TruncatedRangeDelIterator** range_tombstone_iter_;
1181
+
1182
+ // Whether next/prev key is a sentinel key.
1183
+ bool to_return_sentinel_ = false;
1184
+ // The sentinel key to be returned
1185
+ Slice sentinel_;
1186
+ // Sets the flag indicating whether the sentinel key should be returned next.
1187
+ // The condition for returning sentinel is reaching the end of current
1188
+ // file_iter_: !Valid() && status().ok().
1189
+ void TrySetDeleteRangeSentinel(const Slice& boundary_key);
1190
+ void ClearSentinel() { to_return_sentinel_ = false; }
1191
+
1192
+ // Set in Seek() when a prefix seek reaches end of the current file,
1193
+ // and the next file has a different prefix. SkipEmptyFileForward()
1194
+ // will not move to next file when this flag is set.
1195
+ bool prefix_exhausted_ = false;
1120
1196
  };
1121
1197
 
1198
+ void LevelIterator::TrySetDeleteRangeSentinel(const Slice& boundary_key) {
1199
+ assert(range_tombstone_iter_);
1200
+ if (file_iter_.iter() != nullptr && !file_iter_.Valid() &&
1201
+ file_iter_.status().ok()) {
1202
+ to_return_sentinel_ = true;
1203
+ sentinel_ = boundary_key;
1204
+ }
1205
+ }
1206
+
1122
1207
  void LevelIterator::Seek(const Slice& target) {
1208
+ prefix_exhausted_ = false;
1209
+ ClearSentinel();
1123
1210
  // Check whether the seek key falls under the same file
1124
1211
  bool need_to_reseek = true;
1125
1212
  if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) {
@@ -1148,44 +1235,82 @@ void LevelIterator::Seek(const Slice& target) {
1148
1235
  if (file_iter_.status() == Status::TryAgain()) {
1149
1236
  return;
1150
1237
  }
1151
- }
1152
-
1153
- if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
1154
- !read_options_.total_order_seek && !read_options_.auto_prefix_mode &&
1155
- file_iter_.iter() != nullptr && file_iter_.Valid()) {
1156
- // We've skipped the file we initially positioned to. In the prefix
1157
- // seek case, it is likely that the file is skipped because of
1158
- // prefix bloom or hash, where more keys are skipped. We then check
1159
- // the current key and invalidate the iterator if the prefix is
1160
- // already passed.
1161
- // When doing prefix iterator seek, when keys for one prefix have
1162
- // been exhausted, it can jump to any key that is larger. Here we are
1163
- // enforcing a stricter contract than that, in order to make it easier for
1164
- // higher layers (merging and DB iterator) to reason the correctness:
1165
- // 1. Within the prefix, the result should be accurate.
1166
- // 2. If keys for the prefix is exhausted, it is either positioned to the
1167
- // next key after the prefix, or make the iterator invalid.
1168
- // A side benefit will be that it invalidates the iterator earlier so that
1169
- // the upper level merging iterator can merge fewer child iterators.
1170
- size_t ts_sz = user_comparator_.timestamp_size();
1171
- Slice target_user_key_without_ts =
1172
- ExtractUserKeyAndStripTimestamp(target, ts_sz);
1173
- Slice file_user_key_without_ts =
1174
- ExtractUserKeyAndStripTimestamp(file_iter_.key(), ts_sz);
1175
- if (prefix_extractor_->InDomain(target_user_key_without_ts) &&
1176
- (!prefix_extractor_->InDomain(file_user_key_without_ts) ||
1177
- user_comparator_.CompareWithoutTimestamp(
1178
- prefix_extractor_->Transform(target_user_key_without_ts), false,
1179
- prefix_extractor_->Transform(file_user_key_without_ts),
1180
- false) != 0)) {
1181
- SetFileIterator(nullptr);
1238
+ if (!file_iter_.Valid() && file_iter_.status().ok() &&
1239
+ prefix_extractor_ != nullptr && !read_options_.total_order_seek &&
1240
+ !read_options_.auto_prefix_mode &&
1241
+ file_index_ < flevel_->num_files - 1) {
1242
+ size_t ts_sz = user_comparator_.timestamp_size();
1243
+ Slice target_user_key_without_ts =
1244
+ ExtractUserKeyAndStripTimestamp(target, ts_sz);
1245
+ Slice next_file_first_user_key_without_ts =
1246
+ ExtractUserKeyAndStripTimestamp(file_smallest_key(file_index_ + 1),
1247
+ ts_sz);
1248
+ if (prefix_extractor_->InDomain(target_user_key_without_ts) &&
1249
+ (!prefix_extractor_->InDomain(next_file_first_user_key_without_ts) ||
1250
+ user_comparator_.CompareWithoutTimestamp(
1251
+ prefix_extractor_->Transform(target_user_key_without_ts), false,
1252
+ prefix_extractor_->Transform(
1253
+ next_file_first_user_key_without_ts),
1254
+ false) != 0)) {
1255
+ // SkipEmptyFileForward() will not advance to next file when this flag
1256
+ // is set for reason detailed below.
1257
+ //
1258
+ // The file we initially positioned to has no keys under the target
1259
+ // prefix, and the next file's smallest key has a different prefix than
1260
+ // target. When doing prefix iterator seek, when keys for one prefix
1261
+ // have been exhausted, it can jump to any key that is larger. Here we
1262
+ // are enforcing a stricter contract than that, in order to make it
1263
+ // easier for higher layers (merging and DB iterator) to reason about the
1264
+ // correctness:
1265
+ // 1. Within the prefix, the result should be accurate.
1266
+ // 2. If keys for the prefix are exhausted, it is either positioned to
1267
+ // the next key after the prefix, or make the iterator invalid.
1268
+ // A side benefit will be that it invalidates the iterator earlier so
1269
+ // that the upper level merging iterator can merge fewer child
1270
+ // iterators.
1271
+ //
1272
+ // The flag is cleared in Seek*() calls. There is no need to clear the
1273
+ // flag in Prev() since Prev() will not be called when the flag is set
1274
+ // for reasons explained below. If range_tombstone_iter_ is nullptr,
1275
+ // then there is no file boundary sentinel key. Since
1276
+ // !file_iter_.Valid() from the if condition above, this level iterator
1277
+ // is !Valid(), so Prev() will not be called. If range_tombstone_iter_
1278
+ // is not nullptr, there are two cases depending on if this level
1279
+ // iterator reaches top of the heap in merging iterator (the upper
1280
+ // layer).
1281
+ // If so, merging iterator will see the sentinel key, call
1282
+ // NextAndGetResult() and the call to NextAndGetResult() will skip the
1283
+ // sentinel key and make this level iterator invalid. If not, then it
1284
+ // could be because the upper layer is done before any method of this
1285
+ // level iterator is called or another Seek*() call is invoked. Either
1286
+ // way, Prev() is never called before Seek*().
1287
+ // The flag should not be cleared at the beginning of
1288
+ // Next/NextAndGetResult() since it is used in SkipEmptyFileForward()
1289
+ // called in Next/NextAndGetResult().
1290
+ prefix_exhausted_ = true;
1291
+ }
1292
+ }
1293
+
1294
+ if (range_tombstone_iter_) {
1295
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
1182
1296
  }
1183
1297
  }
1298
+ SkipEmptyFileForward();
1184
1299
  CheckMayBeOutOfLowerBound();
1185
1300
  }
1186
1301
 
1187
1302
  void LevelIterator::SeekForPrev(const Slice& target) {
1303
+ prefix_exhausted_ = false;
1304
+ ClearSentinel();
1188
1305
  size_t new_file_index = FindFile(icomparator_, *flevel_, target);
1306
+ // Seek beyond this level's smallest key
1307
+ if (new_file_index == 0 &&
1308
+ icomparator_.Compare(target, file_smallest_key(0)) < 0) {
1309
+ SetFileIterator(nullptr);
1310
+ ClearRangeTombstoneIter();
1311
+ CheckMayBeOutOfLowerBound();
1312
+ return;
1313
+ }
1189
1314
  if (new_file_index >= flevel_->num_files) {
1190
1315
  new_file_index = flevel_->num_files - 1;
1191
1316
  }
@@ -1193,24 +1318,47 @@ void LevelIterator::SeekForPrev(const Slice& target) {
1193
1318
  InitFileIterator(new_file_index);
1194
1319
  if (file_iter_.iter() != nullptr) {
1195
1320
  file_iter_.SeekForPrev(target);
1321
+ if (range_tombstone_iter_ &&
1322
+ icomparator_.Compare(target, file_smallest_key(file_index_)) >= 0) {
1323
+ // In SeekForPrev() case, it is possible that the target is less than
1324
+ // file's lower boundary since largest key is used to determine file index
1325
+ // (FindFile()). When target is less than file's lower boundary, sentinel
1326
+ // key should not be set so that SeekForPrev() does not result in a key
1327
+ // larger than target. This is correct in that there is no need to keep
1328
+ // the range tombstones in this file alive as they only cover keys
1329
+ // starting from the file's lower boundary, which is after `target`.
1330
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
1331
+ }
1196
1332
  SkipEmptyFileBackward();
1197
1333
  }
1198
1334
  CheckMayBeOutOfLowerBound();
1199
1335
  }
1200
1336
 
1201
1337
  void LevelIterator::SeekToFirst() {
1338
+ prefix_exhausted_ = false;
1339
+ ClearSentinel();
1202
1340
  InitFileIterator(0);
1203
1341
  if (file_iter_.iter() != nullptr) {
1204
1342
  file_iter_.SeekToFirst();
1343
+ if (range_tombstone_iter_) {
1344
+ // We do this in SeekToFirst() and SeekToLast() since
1345
+ // we could have an empty file with only range tombstones.
1346
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
1347
+ }
1205
1348
  }
1206
1349
  SkipEmptyFileForward();
1207
1350
  CheckMayBeOutOfLowerBound();
1208
1351
  }
1209
1352
 
1210
1353
  void LevelIterator::SeekToLast() {
1354
+ prefix_exhausted_ = false;
1355
+ ClearSentinel();
1211
1356
  InitFileIterator(flevel_->num_files - 1);
1212
1357
  if (file_iter_.iter() != nullptr) {
1213
1358
  file_iter_.SeekToLast();
1359
+ if (range_tombstone_iter_) {
1360
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
1361
+ }
1214
1362
  }
1215
1363
  SkipEmptyFileBackward();
1216
1364
  CheckMayBeOutOfLowerBound();
@@ -1218,25 +1366,47 @@ void LevelIterator::SeekToLast() {
1218
1366
 
1219
1367
  void LevelIterator::Next() {
1220
1368
  assert(Valid());
1221
- file_iter_.Next();
1369
+ if (to_return_sentinel_) {
1370
+ // file_iter_ is at EOF already when to_return_sentinel_
1371
+ ClearSentinel();
1372
+ } else {
1373
+ file_iter_.Next();
1374
+ if (range_tombstone_iter_) {
1375
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
1376
+ }
1377
+ }
1222
1378
  SkipEmptyFileForward();
1223
1379
  }
1224
1380
 
1225
1381
  bool LevelIterator::NextAndGetResult(IterateResult* result) {
1226
1382
  assert(Valid());
1227
- bool is_valid = file_iter_.NextAndGetResult(result);
1383
+ // file_iter_ is at EOF already when to_return_sentinel_
1384
+ bool is_valid = !to_return_sentinel_ && file_iter_.NextAndGetResult(result);
1228
1385
  if (!is_valid) {
1386
+ if (to_return_sentinel_) {
1387
+ ClearSentinel();
1388
+ } else if (range_tombstone_iter_) {
1389
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
1390
+ }
1229
1391
  is_next_read_sequential_ = true;
1230
1392
  SkipEmptyFileForward();
1231
1393
  is_next_read_sequential_ = false;
1232
1394
  is_valid = Valid();
1233
1395
  if (is_valid) {
1234
- result->key = key();
1235
- result->bound_check_result = file_iter_.UpperBoundCheckResult();
1236
- // Ideally, we should return the real file_iter_.value_prepared but the
1237
- // information is not here. It would cause an extra PrepareValue()
1238
- // for the first key of a file.
1239
- result->value_prepared = !allow_unprepared_value_;
1396
+ // This could be set in TrySetDeleteRangeSentinel() or
1397
+ // SkipEmptyFileForward() above.
1398
+ if (to_return_sentinel_) {
1399
+ result->key = sentinel_;
1400
+ result->bound_check_result = IterBoundCheck::kUnknown;
1401
+ result->value_prepared = true;
1402
+ } else {
1403
+ result->key = key();
1404
+ result->bound_check_result = file_iter_.UpperBoundCheckResult();
1405
+ // Ideally, we should return the real file_iter_.value_prepared but the
1406
+ // information is not here. It would cause an extra PrepareValue()
1407
+ // for the first key of a file.
1408
+ result->value_prepared = !allow_unprepared_value_;
1409
+ }
1240
1410
  }
1241
1411
  }
1242
1412
  return is_valid;
@@ -1244,47 +1414,81 @@ bool LevelIterator::NextAndGetResult(IterateResult* result) {
1244
1414
 
1245
1415
  void LevelIterator::Prev() {
1246
1416
  assert(Valid());
1247
- file_iter_.Prev();
1417
+ if (to_return_sentinel_) {
1418
+ ClearSentinel();
1419
+ } else {
1420
+ file_iter_.Prev();
1421
+ if (range_tombstone_iter_) {
1422
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
1423
+ }
1424
+ }
1248
1425
  SkipEmptyFileBackward();
1249
1426
  }
1250
1427
 
1251
1428
  bool LevelIterator::SkipEmptyFileForward() {
1252
1429
  bool seen_empty_file = false;
1253
- while (file_iter_.iter() == nullptr ||
1254
- (!file_iter_.Valid() && file_iter_.status().ok() &&
1255
- file_iter_.iter()->UpperBoundCheckResult() !=
1256
- IterBoundCheck::kOutOfBound)) {
1430
+ // Pause at sentinel key
1431
+ while (!to_return_sentinel_ &&
1432
+ (file_iter_.iter() == nullptr ||
1433
+ (!file_iter_.Valid() && file_iter_.status().ok() &&
1434
+ file_iter_.iter()->UpperBoundCheckResult() !=
1435
+ IterBoundCheck::kOutOfBound))) {
1257
1436
  seen_empty_file = true;
1258
1437
  // Move to next file
1259
- if (file_index_ >= flevel_->num_files - 1) {
1260
- // Already at the last file
1261
- SetFileIterator(nullptr);
1262
- break;
1263
- }
1264
- if (KeyReachedUpperBound(file_smallest_key(file_index_ + 1))) {
1438
+ if (file_index_ >= flevel_->num_files - 1 ||
1439
+ KeyReachedUpperBound(file_smallest_key(file_index_ + 1)) ||
1440
+ prefix_exhausted_) {
1265
1441
  SetFileIterator(nullptr);
1442
+ ClearRangeTombstoneIter();
1266
1443
  break;
1267
1444
  }
1445
+ // may init a new *range_tombstone_iter
1268
1446
  InitFileIterator(file_index_ + 1);
1447
+ // We moved to a new SST file
1448
+ // Seek range_tombstone_iter_ to reset its !Valid() default state.
1449
+ // We do not need to call range_tombstone_iter_.Seek* in
1450
+ // LevelIterator::Seek* since when the merging iterator calls
1451
+ // LevelIterator::Seek*, it should also call Seek* into the corresponding
1452
+ // range tombstone iterator.
1269
1453
  if (file_iter_.iter() != nullptr) {
1270
1454
  file_iter_.SeekToFirst();
1455
+ if (range_tombstone_iter_) {
1456
+ if (*range_tombstone_iter_) {
1457
+ (*range_tombstone_iter_)->SeekToFirst();
1458
+ }
1459
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
1460
+ }
1271
1461
  }
1272
1462
  }
1273
1463
  return seen_empty_file;
1274
1464
  }
1275
1465
 
1276
1466
  void LevelIterator::SkipEmptyFileBackward() {
1277
- while (file_iter_.iter() == nullptr ||
1278
- (!file_iter_.Valid() && file_iter_.status().ok())) {
1467
+ // Pause at sentinel key
1468
+ while (!to_return_sentinel_ &&
1469
+ (file_iter_.iter() == nullptr ||
1470
+ (!file_iter_.Valid() && file_iter_.status().ok()))) {
1279
1471
  // Move to previous file
1280
1472
  if (file_index_ == 0) {
1281
1473
  // Already the first file
1282
1474
  SetFileIterator(nullptr);
1475
+ ClearRangeTombstoneIter();
1283
1476
  return;
1284
1477
  }
1285
1478
  InitFileIterator(file_index_ - 1);
1479
+ // We moved to a new SST file
1480
+ // Seek range_tombstone_iter_ to reset its !Valid() default state.
1286
1481
  if (file_iter_.iter() != nullptr) {
1287
1482
  file_iter_.SeekToLast();
1483
+ if (range_tombstone_iter_) {
1484
+ if (*range_tombstone_iter_) {
1485
+ (*range_tombstone_iter_)->SeekToLast();
1486
+ }
1487
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
1488
+ if (to_return_sentinel_) {
1489
+ break;
1490
+ }
1491
+ }
1288
1492
  }
1289
1493
  }
1290
1494
  }
@@ -1312,6 +1516,7 @@ void LevelIterator::InitFileIterator(size_t new_file_index) {
1312
1516
  if (new_file_index >= flevel_->num_files) {
1313
1517
  file_index_ = new_file_index;
1314
1518
  SetFileIterator(nullptr);
1519
+ ClearRangeTombstoneIter();
1315
1520
  return;
1316
1521
  } else {
1317
1522
  // If the file iterator shows incomplete, we try it again if users seek
@@ -1337,7 +1542,7 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
1337
1542
  auto table_cache = cfd_->table_cache();
1338
1543
  auto ioptions = cfd_->ioptions();
1339
1544
  Status s = table_cache->GetTableProperties(
1340
- file_options_, cfd_->internal_comparator(), file_meta->fd, tp,
1545
+ file_options_, cfd_->internal_comparator(), *file_meta, tp,
1341
1546
  mutable_cf_options_.prefix_extractor, true /* no io */);
1342
1547
  if (s.ok()) {
1343
1548
  return s;
@@ -1530,7 +1735,8 @@ size_t Version::GetMemoryUsageByTableReaders() {
1530
1735
  for (auto& file_level : storage_info_.level_files_brief_) {
1531
1736
  for (size_t i = 0; i < file_level.num_files; i++) {
1532
1737
  total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
1533
- file_options_, cfd_->internal_comparator(), file_level.files[i].fd,
1738
+ file_options_, cfd_->internal_comparator(),
1739
+ *file_level.files[i].file_metadata,
1534
1740
  mutable_cf_options_.prefix_extractor);
1535
1741
  }
1536
1742
  }
@@ -1627,38 +1833,27 @@ void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
1627
1833
  *creation_time = oldest_time;
1628
1834
  }
1629
1835
 
1630
- Status Version::VerifySstUniqueIds() const {
1631
- for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
1632
- for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
1633
- if (meta->unique_id != kNullUniqueId64x2) {
1634
- std::shared_ptr<const TableProperties> props;
1635
- Status s =
1636
- GetTableProperties(&props, meta); // may open the file if it's not
1637
- if (!s.ok()) {
1638
- return s;
1639
- }
1640
- UniqueId64x2 id;
1641
- s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
1642
- props->orig_file_number, &id);
1643
- if (!s.ok() || id != meta->unique_id) {
1644
- std::ostringstream oss;
1645
- oss << "SST #" << meta->fd.GetNumber() << " unique ID mismatch. ";
1646
- oss << "Manifest: "
1647
- << InternalUniqueIdToHumanString(&(meta->unique_id)) << ", ";
1648
- if (s.ok()) {
1649
- oss << "Table Properties: " << InternalUniqueIdToHumanString(&id);
1650
- } else {
1651
- oss << "Failed to get Table Properties: " << s.ToString();
1652
- }
1653
- return Status::Corruption("VersionSet", oss.str());
1654
- }
1655
- TEST_SYNC_POINT_CALLBACK("Version::VerifySstUniqueIds::Passed", &id);
1656
- } else {
1657
- TEST_SYNC_POINT_CALLBACK("Version::VerifySstUniqueIds::Skipped", meta);
1658
- }
1659
- }
1836
+ InternalIterator* Version::TEST_GetLevelIterator(
1837
+ const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
1838
+ int level, bool allow_unprepared_value) {
1839
+ auto* arena = merge_iter_builder->GetArena();
1840
+ auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
1841
+ TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
1842
+ auto level_iter = new (mem) LevelIterator(
1843
+ cfd_->table_cache(), read_options, file_options_,
1844
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
1845
+ mutable_cf_options_.prefix_extractor, should_sample_file_read(),
1846
+ cfd_->internal_stats()->GetFileReadHist(level),
1847
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
1848
+ nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
1849
+ allow_unprepared_value, &tombstone_iter_ptr);
1850
+ if (read_options.ignore_range_deletions) {
1851
+ merge_iter_builder->AddIterator(level_iter);
1852
+ } else {
1853
+ merge_iter_builder->AddPointAndTombstoneIterator(
1854
+ level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
1660
1855
  }
1661
- return Status::OK();
1856
+ return level_iter;
1662
1857
  }
1663
1858
 
1664
1859
  uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
@@ -1711,22 +1906,19 @@ double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
1711
1906
  void Version::AddIterators(const ReadOptions& read_options,
1712
1907
  const FileOptions& soptions,
1713
1908
  MergeIteratorBuilder* merge_iter_builder,
1714
- RangeDelAggregator* range_del_agg,
1715
1909
  bool allow_unprepared_value) {
1716
1910
  assert(storage_info_.finalized_);
1717
1911
 
1718
1912
  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
1719
1913
  AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
1720
- range_del_agg, allow_unprepared_value);
1914
+ allow_unprepared_value);
1721
1915
  }
1722
1916
  }
1723
1917
 
1724
1918
  void Version::AddIteratorsForLevel(const ReadOptions& read_options,
1725
1919
  const FileOptions& soptions,
1726
1920
  MergeIteratorBuilder* merge_iter_builder,
1727
- int level,
1728
- RangeDelAggregator* range_del_agg,
1729
- bool allow_unprepared_value) {
1921
+ int level, bool allow_unprepared_value) {
1730
1922
  assert(storage_info_.finalized_);
1731
1923
  if (level >= storage_info_.num_non_empty_levels()) {
1732
1924
  // This is an empty level
@@ -1741,17 +1933,25 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
1741
1933
  auto* arena = merge_iter_builder->GetArena();
1742
1934
  if (level == 0) {
1743
1935
  // Merge all level zero files together since they may overlap
1936
+ TruncatedRangeDelIterator* tombstone_iter = nullptr;
1744
1937
  for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
1745
1938
  const auto& file = storage_info_.LevelFilesBrief(0).files[i];
1746
- merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
1939
+ auto table_iter = cfd_->table_cache()->NewIterator(
1747
1940
  read_options, soptions, cfd_->internal_comparator(),
1748
- *file.file_metadata, range_del_agg,
1941
+ *file.file_metadata, /*range_del_agg=*/nullptr,
1749
1942
  mutable_cf_options_.prefix_extractor, nullptr,
1750
1943
  cfd_->internal_stats()->GetFileReadHist(0),
1751
1944
  TableReaderCaller::kUserIterator, arena,
1752
1945
  /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
1753
1946
  /*smallest_compaction_key=*/nullptr,
1754
- /*largest_compaction_key=*/nullptr, allow_unprepared_value));
1947
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value,
1948
+ &tombstone_iter);
1949
+ if (read_options.ignore_range_deletions) {
1950
+ merge_iter_builder->AddIterator(table_iter);
1951
+ } else {
1952
+ merge_iter_builder->AddPointAndTombstoneIterator(table_iter,
1953
+ tombstone_iter);
1954
+ }
1755
1955
  }
1756
1956
  if (should_sample) {
1757
1957
  // Count ones for every L0 files. This is done per iterator creation
@@ -1767,14 +1967,21 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
1767
1967
  // walks through the non-overlapping files in the level, opening them
1768
1968
  // lazily.
1769
1969
  auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
1770
- merge_iter_builder->AddIterator(new (mem) LevelIterator(
1970
+ TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
1971
+ auto level_iter = new (mem) LevelIterator(
1771
1972
  cfd_->table_cache(), read_options, soptions,
1772
1973
  cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
1773
1974
  mutable_cf_options_.prefix_extractor, should_sample_file_read(),
1774
1975
  cfd_->internal_stats()->GetFileReadHist(level),
1775
1976
  TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
1776
- range_del_agg,
1777
- /*compaction_boundaries=*/nullptr, allow_unprepared_value));
1977
+ /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr,
1978
+ allow_unprepared_value, &tombstone_iter_ptr);
1979
+ if (read_options.ignore_range_deletions) {
1980
+ merge_iter_builder->AddIterator(level_iter);
1981
+ } else {
1982
+ merge_iter_builder->AddPointAndTombstoneIterator(
1983
+ level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
1984
+ }
1778
1985
  }
1779
1986
  }
1780
1987
 
@@ -2465,8 +2672,8 @@ Status Version::ProcessBatch(
2465
2672
  std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
2466
2673
  autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
2467
2674
  std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
2468
- uint64_t& num_filter_read, uint64_t& num_index_read,
2469
- uint64_t& num_sst_read) {
2675
+ std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
2676
+ mget_stats) {
2470
2677
  FilePickerMultiGet& fp = *batch;
2471
2678
  MultiGetRange range = fp.GetRange();
2472
2679
  // Initialize a new empty range. Any keys that are not in this level will
@@ -2516,19 +2723,29 @@ Status Version::ProcessBatch(
2516
2723
  leftover += ~file_range;
2517
2724
  range -= ~file_range;
2518
2725
  if (!file_range.empty()) {
2726
+ int level = fp.GetHitFileLevel();
2727
+ auto stat = mget_stats.find(level);
2728
+ if (stat == mget_stats.end()) {
2729
+ auto entry = mget_stats.insert({level, {0, 0, 0}});
2730
+ assert(entry.second);
2731
+ stat = entry.first;
2732
+ }
2733
+
2519
2734
  if (waiting.empty() && to_process.empty() &&
2520
2735
  !fp.RemainingOverlapInLevel() && leftover.empty() &&
2521
2736
  mget_tasks.empty()) {
2522
2737
  // All keys are in one SST file, so take the fast path
2523
2738
  s = MultiGetFromSST(read_options, file_range, fp.GetHitFileLevel(),
2524
2739
  skip_filters, skip_range_deletions, f, *blob_ctxs,
2525
- table_handle, num_filter_read, num_index_read,
2526
- num_sst_read);
2740
+ table_handle, std::get<0>(stat->second),
2741
+ std::get<1>(stat->second),
2742
+ std::get<2>(stat->second));
2527
2743
  } else {
2528
2744
  mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
2529
2745
  read_options, file_range, fp.GetHitFileLevel(), skip_filters,
2530
- skip_range_deletions, f, *blob_ctxs, table_handle, num_filter_read,
2531
- num_index_read, num_sst_read));
2746
+ skip_range_deletions, f, *blob_ctxs, table_handle,
2747
+ std::get<0>(stat->second), std::get<1>(stat->second),
2748
+ std::get<2>(stat->second)));
2532
2749
  ++num_tasks_queued;
2533
2750
  }
2534
2751
  }
@@ -2538,8 +2755,9 @@ Status Version::ProcessBatch(
2538
2755
  f = fp.GetNextFileInLevel();
2539
2756
  }
2540
2757
  // Split the current batch only if some keys are likely in this level and
2541
- // some are not.
2542
- if (s.ok() && !leftover.empty() && !range.empty()) {
2758
+ // some are not. Only split if we're done with this level, i.e., f is null.
2759
+ // Otherwise, it means there are more files in this level to look at.
2760
+ if (s.ok() && !f && !leftover.empty() && !range.empty()) {
2543
2761
  fp.ReplaceRange(range);
2544
2762
  batches.emplace_back(&leftover, fp);
2545
2763
  to_process.emplace_back(batches.size() - 1);
@@ -2565,9 +2783,7 @@ Status Version::MultiGetAsync(
2565
2783
  std::deque<size_t> to_process;
2566
2784
  Status s;
2567
2785
  std::vector<folly::coro::Task<Status>> mget_tasks;
2568
- uint64_t num_filter_read = 0;
2569
- uint64_t num_index_read = 0;
2570
- uint64_t num_sst_read = 0;
2786
+ std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>> mget_stats;
2571
2787
 
2572
2788
  // Create the initial batch with the input range
2573
2789
  batches.emplace_back(range, &storage_info_.level_files_brief_,
@@ -2577,6 +2793,11 @@ Status Version::MultiGetAsync(
2577
2793
  to_process.emplace_back(0);
2578
2794
 
2579
2795
  while (!to_process.empty()) {
2796
+ // As we process a batch, it may get split into two. So reserve space for
2797
+ // an additional batch in the autovector in order to prevent later moves
2798
+ // of elements in ProcessBatch().
2799
+ batches.reserve(batches.size() + 1);
2800
+
2580
2801
  size_t idx = to_process.front();
2581
2802
  FilePickerMultiGet* batch = &batches.at(idx);
2582
2803
  unsigned int num_tasks_queued = 0;
@@ -2589,20 +2810,10 @@ Status Version::MultiGetAsync(
2589
2810
  // Look through one level. This may split the batch and enqueue it to
2590
2811
  // to_process
2591
2812
  s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
2592
- to_process, num_tasks_queued, num_filter_read,
2593
- num_index_read, num_sst_read);
2813
+ to_process, num_tasks_queued, mget_stats);
2594
2814
  if (!s.ok()) {
2595
2815
  break;
2596
2816
  }
2597
- // Dump the stats since the search has moved to the next level
2598
- if (num_filter_read + num_index_read) {
2599
- RecordInHistogram(db_statistics_,
2600
- NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2601
- num_index_read + num_filter_read);
2602
- }
2603
- if (num_sst_read) {
2604
- RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
2605
- }
2606
2817
  // If ProcessBatch didn't enqueue any coroutine tasks, it means all
2607
2818
  // keys were filtered out. So put the batch back in to_process to
2608
2819
  // lookup in the next level
@@ -2649,6 +2860,30 @@ Status Version::MultiGetAsync(
2649
2860
  }
2650
2861
  }
2651
2862
 
2863
+ uint64_t num_levels = 0;
2864
+ for (auto& stat : mget_stats) {
2865
+ if (stat.first == 0) {
2866
+ num_levels += std::get<2>(stat.second);
2867
+ } else {
2868
+ num_levels++;
2869
+ }
2870
+
2871
+ uint64_t num_meta_reads =
2872
+ std::get<0>(stat.second) + std::get<1>(stat.second);
2873
+ uint64_t num_sst_reads = std::get<2>(stat.second);
2874
+ if (num_meta_reads > 0) {
2875
+ RecordInHistogram(db_statistics_,
2876
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2877
+ num_meta_reads);
2878
+ }
2879
+ if (num_sst_reads > 0) {
2880
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_reads);
2881
+ }
2882
+ }
2883
+ if (num_levels > 0) {
2884
+ RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET, num_levels);
2885
+ }
2886
+
2652
2887
  return s;
2653
2888
  }
2654
2889
  #endif
@@ -4562,7 +4797,7 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
4562
4797
 
4563
4798
  Status VersionSet::ProcessManifestWrites(
4564
4799
  std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
4565
- FSDirectory* db_directory, bool new_descriptor_log,
4800
+ FSDirectory* dir_contains_current_file, bool new_descriptor_log,
4566
4801
  const ColumnFamilyOptions* new_cf_options) {
4567
4802
  mu->AssertHeld();
4568
4803
  assert(!writers.empty());
@@ -4893,7 +5128,7 @@ Status VersionSet::ProcessManifestWrites(
4893
5128
  }
4894
5129
  if (s.ok() && new_descriptor_log) {
4895
5130
  io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
4896
- db_directory);
5131
+ dir_contains_current_file);
4897
5132
  if (!io_s.ok()) {
4898
5133
  s = io_s;
4899
5134
  }
@@ -5120,8 +5355,8 @@ Status VersionSet::LogAndApply(
5120
5355
  const autovector<ColumnFamilyData*>& column_family_datas,
5121
5356
  const autovector<const MutableCFOptions*>& mutable_cf_options_list,
5122
5357
  const autovector<autovector<VersionEdit*>>& edit_lists,
5123
- InstrumentedMutex* mu, FSDirectory* db_directory, bool new_descriptor_log,
5124
- const ColumnFamilyOptions* new_cf_options,
5358
+ InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
5359
+ bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options,
5125
5360
  const std::vector<std::function<void(const Status&)>>& manifest_wcbs) {
5126
5361
  mu->AssertHeld();
5127
5362
  int num_edits = 0;
@@ -5195,9 +5430,8 @@ Status VersionSet::LogAndApply(
5195
5430
  }
5196
5431
  return Status::ColumnFamilyDropped();
5197
5432
  }
5198
-
5199
- return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log,
5200
- new_cf_options);
5433
+ return ProcessManifestWrites(writers, mu, dir_contains_current_file,
5434
+ new_descriptor_log, new_cf_options);
5201
5435
  }
5202
5436
 
5203
5437
  void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
@@ -6079,7 +6313,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
6079
6313
  TableCache* table_cache = v->cfd_->table_cache();
6080
6314
  if (table_cache != nullptr) {
6081
6315
  result = table_cache->ApproximateOffsetOf(
6082
- key, f.file_metadata->fd, caller, icmp,
6316
+ key, *f.file_metadata, caller, icmp,
6083
6317
  v->GetMutableCFOptions().prefix_extractor);
6084
6318
  }
6085
6319
  }
@@ -6119,7 +6353,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
6119
6353
  return 0;
6120
6354
  }
6121
6355
  return table_cache->ApproximateSize(
6122
- start, end, f.file_metadata->fd, caller, icmp,
6356
+ start, end, *f.file_metadata, caller, icmp,
6123
6357
  v->GetMutableCFOptions().prefix_extractor);
6124
6358
  }
6125
6359
 
@@ -6245,16 +6479,16 @@ InternalIterator* VersionSet::MakeInputIterator(
6245
6479
  for (size_t i = 0; i < flevel->num_files; i++) {
6246
6480
  const FileMetaData& fmd = *flevel->files[i].file_metadata;
6247
6481
  if (start.has_value() &&
6248
- cfd->user_comparator()->Compare(start.value(),
6249
- fmd.largest.user_key()) > 0) {
6482
+ cfd->user_comparator()->CompareWithoutTimestamp(
6483
+ start.value(), fmd.largest.user_key()) > 0) {
6250
6484
  continue;
6251
6485
  }
6252
6486
  // We should be able to filter out the case where the end key
6253
6487
  // equals to the end boundary, since the end key is exclusive.
6254
6488
  // We try to be extra safe here.
6255
6489
  if (end.has_value() &&
6256
- cfd->user_comparator()->Compare(end.value(),
6257
- fmd.smallest.user_key()) < 0) {
6490
+ cfd->user_comparator()->CompareWithoutTimestamp(
6491
+ end.value(), fmd.smallest.user_key()) < 0) {
6258
6492
  continue;
6259
6493
  }
6260
6494