@nxtedition/rocksdb 13.1.4 → 13.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/binding.cc +43 -16
  2. package/deps/rocksdb/rocksdb/{TARGETS → BUCK} +27 -0
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -1
  4. package/deps/rocksdb/rocksdb/Makefile +2 -2
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +3 -1
  6. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
  7. package/deps/rocksdb/rocksdb/db/attribute_group_iterator_impl.h +34 -9
  8. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +7 -6
  9. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +5 -1
  10. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +22 -14
  11. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
  12. package/deps/rocksdb/rocksdb/db/builder.cc +13 -24
  13. package/deps/rocksdb/rocksdb/db/coalescing_iterator.h +35 -10
  14. package/deps/rocksdb/rocksdb/db/column_family.cc +21 -10
  15. package/deps/rocksdb/rocksdb/db/column_family.h +15 -8
  16. package/deps/rocksdb/rocksdb/db/column_family_test.cc +98 -7
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +126 -16
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +51 -5
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +2 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -8
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +24 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +52 -22
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +9 -7
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +36 -9
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +6 -0
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +30 -17
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +26 -23
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +43 -33
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +6 -5
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +19 -9
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +6 -5
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +632 -411
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +171 -51
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +7 -5
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +37 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +51 -11
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +10 -3
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +350 -154
  39. package/deps/rocksdb/rocksdb/db/convenience.cc +1 -1
  40. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +62 -27
  41. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +68 -1
  42. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +91 -0
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +134 -70
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +71 -23
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -16
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +47 -33
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +27 -19
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +38 -25
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -3
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +7 -4
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +258 -42
  52. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +161 -9
  53. package/deps/rocksdb/rocksdb/db/db_iter.cc +118 -86
  54. package/deps/rocksdb/rocksdb/db/db_iter.h +44 -17
  55. package/deps/rocksdb/rocksdb/db/db_options_test.cc +27 -6
  56. package/deps/rocksdb/rocksdb/db/db_test.cc +48 -16
  57. package/deps/rocksdb/rocksdb/db/db_test2.cc +60 -15
  58. package/deps/rocksdb/rocksdb/db/db_test_util.cc +97 -44
  59. package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -1
  60. package/deps/rocksdb/rocksdb/db/dbformat.cc +15 -5
  61. package/deps/rocksdb/rocksdb/db/dbformat.h +137 -55
  62. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  63. package/deps/rocksdb/rocksdb/db/experimental.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +663 -8
  65. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +152 -91
  66. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +134 -11
  67. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +55 -9
  68. package/deps/rocksdb/rocksdb/db/flush_job.cc +52 -29
  69. package/deps/rocksdb/rocksdb/db/flush_job.h +5 -3
  70. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +18 -12
  71. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +23 -29
  72. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +3 -2
  73. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +2 -0
  74. package/deps/rocksdb/rocksdb/db/internal_stats.cc +9 -6
  75. package/deps/rocksdb/rocksdb/db/internal_stats.h +54 -0
  76. package/deps/rocksdb/rocksdb/db/job_context.h +1 -1
  77. package/deps/rocksdb/rocksdb/db/log_reader.cc +6 -7
  78. package/deps/rocksdb/rocksdb/db/manifest_ops.cc +47 -0
  79. package/deps/rocksdb/rocksdb/db/manifest_ops.h +20 -0
  80. package/deps/rocksdb/rocksdb/db/memtable.cc +165 -64
  81. package/deps/rocksdb/rocksdb/db/memtable.h +422 -243
  82. package/deps/rocksdb/rocksdb/db/memtable_list.cc +99 -68
  83. package/deps/rocksdb/rocksdb/db/memtable_list.h +63 -38
  84. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +28 -25
  85. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +118 -60
  86. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_test.cc +344 -89
  87. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +2 -3
  88. package/deps/rocksdb/rocksdb/db/repair.cc +15 -14
  89. package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -13
  90. package/deps/rocksdb/rocksdb/db/snapshot_checker.h +7 -0
  91. package/deps/rocksdb/rocksdb/db/table_cache.cc +62 -65
  92. package/deps/rocksdb/rocksdb/db/table_cache.h +70 -76
  93. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +5 -6
  94. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
  95. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +8 -7
  96. package/deps/rocksdb/rocksdb/db/version_builder.cc +17 -19
  97. package/deps/rocksdb/rocksdb/db/version_builder.h +13 -12
  98. package/deps/rocksdb/rocksdb/db/version_edit.h +30 -0
  99. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +3 -5
  100. package/deps/rocksdb/rocksdb/db/version_set.cc +89 -129
  101. package/deps/rocksdb/rocksdb/db/version_set.h +12 -4
  102. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -2
  103. package/deps/rocksdb/rocksdb/db/version_set_test.cc +12 -8
  104. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +0 -15
  105. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -2
  106. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +9 -7
  107. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +0 -8
  108. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +28 -2
  109. package/deps/rocksdb/rocksdb/db/write_batch.cc +32 -10
  110. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +9 -0
  111. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
  112. package/deps/rocksdb/rocksdb/db/write_thread.cc +3 -1
  113. package/deps/rocksdb/rocksdb/db/write_thread.h +6 -2
  114. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +15 -0
  115. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +7 -0
  116. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  117. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -2
  118. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +100 -22
  119. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -4
  120. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +34 -8
  121. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +223 -78
  122. package/deps/rocksdb/rocksdb/env/file_system.cc +6 -1
  123. package/deps/rocksdb/rocksdb/env/fs_posix.cc +53 -0
  124. package/deps/rocksdb/rocksdb/env/io_posix.cc +63 -17
  125. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  126. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +132 -48
  127. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +92 -24
  128. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +727 -109
  129. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +3 -4
  130. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +1 -1
  131. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
  132. package/deps/rocksdb/rocksdb/include/rocksdb/attribute_groups.h +20 -1
  133. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +9 -0
  134. package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +9 -5
  135. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +2 -0
  136. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +10 -2
  137. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +7 -0
  139. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +34 -37
  140. package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +21 -0
  141. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +56 -28
  142. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -0
  143. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +36 -28
  144. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +11 -0
  145. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
  146. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +84 -60
  147. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +102 -0
  148. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +89 -2
  149. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +32 -0
  150. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +30 -1
  151. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +23 -2
  152. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  153. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -0
  154. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +79 -21
  155. package/deps/rocksdb/rocksdb/memtable/skiplist.h +41 -18
  156. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +1 -5
  157. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +169 -0
  158. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +400 -0
  159. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -0
  160. package/deps/rocksdb/rocksdb/options/cf_options.cc +137 -82
  161. package/deps/rocksdb/rocksdb/options/cf_options.h +18 -6
  162. package/deps/rocksdb/rocksdb/options/configurable.cc +31 -17
  163. package/deps/rocksdb/rocksdb/options/configurable_helper.h +7 -6
  164. package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -8
  165. package/deps/rocksdb/rocksdb/options/options_parser.cc +74 -54
  166. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +89 -0
  167. package/deps/rocksdb/rocksdb/options/options_test.cc +112 -26
  168. package/deps/rocksdb/rocksdb/port/port.h +5 -9
  169. package/deps/rocksdb/rocksdb/src.mk +8 -0
  170. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +4 -0
  171. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -7
  172. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -0
  173. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +62 -80
  174. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +13 -3
  175. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +16 -5
  176. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +38 -7
  177. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -4
  178. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -1
  179. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +4 -1
  180. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +204 -1
  181. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -3
  182. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
  183. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +4 -0
  184. package/deps/rocksdb/rocksdb/table/format.cc +3 -3
  185. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +4 -1
  186. package/deps/rocksdb/rocksdb/table/mock_table.cc +0 -50
  187. package/deps/rocksdb/rocksdb/table/mock_table.h +53 -0
  188. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +4 -0
  189. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  190. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -5
  191. package/deps/rocksdb/rocksdb/table/table_builder.h +3 -1
  192. package/deps/rocksdb/rocksdb/table/table_properties.cc +181 -0
  193. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +5 -5
  194. package/deps/rocksdb/rocksdb/table/table_test.cc +71 -64
  195. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +45 -45
  196. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +35 -35
  197. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +43 -43
  198. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +41 -4
  199. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +1 -0
  200. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +1 -1
  201. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +13 -0
  202. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +24 -5
  203. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +7 -0
  204. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +0 -52
  205. package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +1 -10
  206. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +92 -0
  207. package/deps/rocksdb/rocksdb/util/thread_operation.h +1 -0
  208. package/deps/rocksdb/rocksdb/util/udt_util.cc +50 -4
  209. package/deps/rocksdb/rocksdb/util/udt_util.h +24 -11
  210. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +26 -13
  211. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +1 -16
  212. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +2 -0
  213. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +214 -0
  214. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.h +60 -0
  215. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +124 -0
  216. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +441 -0
  217. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +34 -3
  218. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +7 -2
  219. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +437 -0
  220. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +34 -11
  221. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +14 -7
  222. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +7 -1
  223. package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +17 -0
  224. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +69 -0
  225. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +20 -0
  226. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1290 -0
  227. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +324 -0
  228. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -1
  229. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +8 -1
  230. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +57 -12
  231. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +32 -3
  232. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +33 -2
  233. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +721 -9
  234. package/deps/rocksdb/rocksdb.gyp +2 -0
  235. package/package.json +1 -1
  236. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  237. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -67,7 +67,6 @@ Status ExternalSstFileIngestionJob::Prepare(
67
67
  files_to_ingest_.emplace_back(std::move(file_to_ingest));
68
68
  }
69
69
 
70
- const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
71
70
  auto num_files = files_to_ingest_.size();
72
71
  if (num_files == 0) {
73
72
  return Status::InvalidArgument("The list of files is empty");
@@ -78,16 +77,12 @@ Status ExternalSstFileIngestionJob::Prepare(
78
77
  sorted_files.push_back(&files_to_ingest_[i]);
79
78
  }
80
79
 
81
- std::sort(
82
- sorted_files.begin(), sorted_files.end(),
83
- [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
84
- return sstableKeyCompare(ucmp, info1->smallest_internal_key,
85
- info2->smallest_internal_key) < 0;
86
- });
80
+ std::sort(sorted_files.begin(), sorted_files.end(), file_range_checker_);
87
81
 
88
82
  for (size_t i = 0; i + 1 < num_files; i++) {
89
- if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
90
- sorted_files[i + 1]->smallest_internal_key) >= 0) {
83
+ if (file_range_checker_.OverlapsWithPrev(sorted_files[i],
84
+ sorted_files[i + 1],
85
+ /* ranges_sorted= */ true)) {
91
86
  files_overlap_ = true;
92
87
  break;
93
88
  }
@@ -100,7 +95,15 @@ Status ExternalSstFileIngestionJob::Prepare(
100
95
  "behind mode.");
101
96
  }
102
97
 
103
- if (ucmp->timestamp_size() > 0 && files_overlap_) {
98
+ // Overlapping files need at least two different sequence numbers. If settings
99
+ // disables global seqno, ingestion will fail anyway, so fail fast in prepare.
100
+ if (!ingestion_options_.allow_global_seqno && files_overlap_) {
101
+ return Status::InvalidArgument(
102
+ "Global seqno is required, but disabled (because external files key "
103
+ "range overlaps).");
104
+ }
105
+
106
+ if (ucmp_->timestamp_size() > 0 && files_overlap_) {
104
107
  return Status::NotSupported(
105
108
  "Files with overlapping ranges cannot be ingested to column "
106
109
  "family with user-defined timestamp enabled.");
@@ -336,9 +339,35 @@ Status ExternalSstFileIngestionJob::Prepare(
336
339
  }
337
340
  }
338
341
 
342
+ if (status.ok()) {
343
+ DivideInputFilesIntoBatches();
344
+ }
345
+
339
346
  return status;
340
347
  }
341
348
 
349
+ void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() {
350
+ if (!files_overlap_) {
351
+ // No overlap, treat as one batch without the need of tracking overall batch
352
+ // range.
353
+ file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ false);
354
+ for (auto& file : files_to_ingest_) {
355
+ file_batches_to_ingest_.back().AddFile(&file, file_range_checker_);
356
+ }
357
+ return;
358
+ }
359
+
360
+ file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
361
+ for (auto& file : files_to_ingest_) {
362
+ if (file_range_checker_.OverlapsWithPrev(&file_batches_to_ingest_.back(),
363
+ &file,
364
+ /* ranges_sorted= */ false)) {
365
+ file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
366
+ }
367
+ file_batches_to_ingest_.back().AddFile(&file, file_range_checker_);
368
+ }
369
+ }
370
+
342
371
  Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
343
372
  SuperVersion* super_version) {
344
373
  size_t n = files_to_ingest_.size();
@@ -353,9 +382,7 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
353
382
  if (!ingestion_options_.allow_blocking_flush) {
354
383
  status = Status::InvalidArgument("External file requires flush");
355
384
  }
356
- auto ucmp = cfd_->user_comparator();
357
- assert(ucmp);
358
- if (ucmp->timestamp_size() > 0) {
385
+ if (ucmp_->timestamp_size() > 0) {
359
386
  status = Status::InvalidArgument(
360
387
  "Column family enables user-defined timestamps, please make "
361
388
  "sure the key range (without timestamp) of external file does not "
@@ -368,8 +395,16 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
368
395
  // REQUIRES: we have become the only writer by entering both write_thread_ and
369
396
  // nonmem_write_thread_
370
397
  Status ExternalSstFileIngestionJob::Run() {
371
- Status status;
372
398
  SuperVersion* super_version = cfd_->GetSuperVersion();
399
+ // If column family is flushed after Prepare and before Run, we should have a
400
+ // specific state of Memtables. The mutable Memtable should be empty, and the
401
+ // immutable Memtable list should be empty.
402
+ if (flushed_before_run_ && (super_version->imm->NumNotFlushed() != 0 ||
403
+ !super_version->mem->IsEmpty())) {
404
+ return Status::TryAgain(
405
+ "Inconsistent memtable state detected when flushed before run.");
406
+ }
407
+ Status status;
373
408
  #ifndef NDEBUG
374
409
  // We should never run the job with a memtable that is overlapping
375
410
  // with the files we are ingesting
@@ -397,14 +432,39 @@ Status ExternalSstFileIngestionJob::Run() {
397
432
  edit_.SetColumnFamily(cfd_->GetID());
398
433
  // The levels that the files will be ingested into
399
434
 
400
- for (IngestedFileInfo& f : files_to_ingest_) {
435
+ std::optional<int> prev_batch_uppermost_level;
436
+ for (auto& batch : file_batches_to_ingest_) {
437
+ int batch_uppermost_level = 0;
438
+ status = AssignLevelsForOneBatch(batch, super_version, force_global_seqno,
439
+ &last_seqno, &batch_uppermost_level,
440
+ prev_batch_uppermost_level);
441
+ if (!status.ok()) {
442
+ return status;
443
+ }
444
+
445
+ prev_batch_uppermost_level = batch_uppermost_level;
446
+ }
447
+
448
+ CreateEquivalentFileIngestingCompactions();
449
+ return status;
450
+ }
451
+
452
+ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
453
+ FileBatchInfo& batch, SuperVersion* super_version, bool force_global_seqno,
454
+ SequenceNumber* last_seqno, int* batch_uppermost_level,
455
+ std::optional<int> prev_batch_uppermost_level) {
456
+ Status status;
457
+ assert(batch_uppermost_level);
458
+ *batch_uppermost_level = std::numeric_limits<int>::max();
459
+ for (IngestedFileInfo* file : batch.files) {
460
+ assert(file);
401
461
  SequenceNumber assigned_seqno = 0;
402
462
  if (ingestion_options_.ingest_behind) {
403
- status = CheckLevelForIngestedBehindFile(&f);
463
+ status = CheckLevelForIngestedBehindFile(file);
404
464
  } else {
405
465
  status = AssignLevelAndSeqnoForIngestedFile(
406
466
  super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
407
- last_seqno, &f, &assigned_seqno);
467
+ *last_seqno, file, &assigned_seqno, prev_batch_uppermost_level);
408
468
  }
409
469
 
410
470
  // Modify the smallest/largest internal key to include the sequence number
@@ -413,38 +473,38 @@ Status ExternalSstFileIngestionJob::Run() {
413
473
  // exclusive endpoint.
414
474
  ParsedInternalKey smallest_parsed, largest_parsed;
415
475
  if (status.ok()) {
416
- status = ParseInternalKey(*f.smallest_internal_key.rep(),
476
+ status = ParseInternalKey(*(file->smallest_internal_key.rep()),
417
477
  &smallest_parsed, false /* log_err_key */);
418
478
  }
419
479
  if (status.ok()) {
420
- status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed,
421
- false /* log_err_key */);
480
+ status = ParseInternalKey(*(file->largest_internal_key.rep()),
481
+ &largest_parsed, false /* log_err_key */);
422
482
  }
423
483
  if (!status.ok()) {
424
484
  return status;
425
485
  }
426
486
  if (smallest_parsed.sequence == 0 && assigned_seqno != 0) {
427
- UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno,
487
+ UpdateInternalKey(file->smallest_internal_key.rep(), assigned_seqno,
428
488
  smallest_parsed.type);
429
489
  }
430
490
  if (largest_parsed.sequence == 0 && assigned_seqno != 0) {
431
- UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno,
491
+ UpdateInternalKey(file->largest_internal_key.rep(), assigned_seqno,
432
492
  largest_parsed.type);
433
493
  }
434
494
 
435
- status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
495
+ status = AssignGlobalSeqnoForIngestedFile(file, assigned_seqno);
436
496
  if (!status.ok()) {
437
497
  return status;
438
498
  }
439
499
  TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
440
500
  &assigned_seqno);
441
- assert(assigned_seqno == 0 || assigned_seqno == last_seqno + 1);
442
- if (assigned_seqno > last_seqno) {
443
- last_seqno = assigned_seqno;
501
+ assert(assigned_seqno == 0 || assigned_seqno == *last_seqno + 1);
502
+ if (assigned_seqno > *last_seqno) {
503
+ *last_seqno = assigned_seqno;
444
504
  ++consumed_seqno_count_;
445
505
  }
446
506
 
447
- status = GenerateChecksumForIngestedFile(&f);
507
+ status = GenerateChecksumForIngestedFile(file);
448
508
  if (!status.ok()) {
449
509
  return status;
450
510
  }
@@ -459,31 +519,40 @@ Status ExternalSstFileIngestionJob::Run() {
459
519
  static_cast<uint64_t>(temp_current_time);
460
520
  }
461
521
  uint64_t tail_size = 0;
462
- bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
463
- (f.table_properties.num_entries ==
464
- f.table_properties.num_range_deletions);
465
- if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
466
- uint64_t file_size = f.fd.GetFileSize();
467
- assert(f.table_properties.tail_start_offset <= file_size);
468
- tail_size = file_size - f.table_properties.tail_start_offset;
522
+ bool contain_no_data_blocks = file->table_properties.num_entries > 0 &&
523
+ (file->table_properties.num_entries ==
524
+ file->table_properties.num_range_deletions);
525
+ if (file->table_properties.tail_start_offset > 0 ||
526
+ contain_no_data_blocks) {
527
+ uint64_t file_size = file->fd.GetFileSize();
528
+ assert(file->table_properties.tail_start_offset <= file_size);
529
+ tail_size = file_size - file->table_properties.tail_start_offset;
469
530
  }
470
531
 
532
+ bool marked_for_compaction =
533
+ file->table_properties.num_range_deletions == 1 &&
534
+ (file->table_properties.num_entries ==
535
+ file->table_properties.num_range_deletions);
471
536
  FileMetaData f_metadata(
472
- f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
473
- f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
474
- f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber,
475
- oldest_ancester_time, current_time,
537
+ file->fd.GetNumber(), file->fd.GetPathId(), file->fd.GetFileSize(),
538
+ file->smallest_internal_key, file->largest_internal_key,
539
+ file->assigned_seqno, file->assigned_seqno, false,
540
+ file->file_temperature, kInvalidBlobFileNumber, oldest_ancester_time,
541
+ current_time,
476
542
  ingestion_options_.ingest_behind
477
543
  ? kReservedEpochNumberForFileIngestedBehind
478
544
  : cfd_->NewEpochNumber(),
479
- f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size,
480
- f.user_defined_timestamps_persisted);
481
- f_metadata.temperature = f.file_temperature;
482
- edit_.AddFile(f.picked_level, f_metadata);
545
+ file->file_checksum, file->file_checksum_func_name, file->unique_id, 0,
546
+ tail_size, file->user_defined_timestamps_persisted);
547
+ f_metadata.temperature = file->file_temperature;
548
+ f_metadata.marked_for_compaction = marked_for_compaction;
549
+ edit_.AddFile(file->picked_level, f_metadata);
550
+
551
+ *batch_uppermost_level =
552
+ std::min(*batch_uppermost_level, file->picked_level);
483
553
  }
484
554
 
485
- CreateEquivalentFileIngestingCompactions();
486
- return status;
555
+ return Status::OK();
487
556
  }
488
557
 
489
558
  void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() {
@@ -519,20 +588,17 @@ void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() {
519
588
  file_ingesting_compactions_.push_back(new Compaction(
520
589
  cfd_->current()->storage_info(), *cfd_->ioptions(), mutable_cf_options,
521
590
  mutable_db_options_, {input}, output_level,
522
- MaxFileSizeForLevel(
523
- mutable_cf_options, output_level,
524
- cfd_->ioptions()->compaction_style) /* output file size
525
- limit,
526
- * not applicable
527
- */
528
- ,
591
+ /* output file size limit not applicable */
592
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
593
+ cfd_->ioptions()->compaction_style),
529
594
  LLONG_MAX /* max compaction bytes, not applicable */,
530
595
  0 /* output path ID, not applicable */, mutable_cf_options.compression,
531
596
  mutable_cf_options.compression_opts,
532
597
  mutable_cf_options.default_write_temperature,
533
598
  0 /* max_subcompaction, not applicable */,
534
- {} /* grandparents, not applicable */, false /* is manual */,
535
- "" /* trim_ts */, -1 /* score, not applicable */,
599
+ {} /* grandparents, not applicable */,
600
+ std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
601
+ false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */,
536
602
  false /* is deletion compaction, not applicable */,
537
603
  files_overlap_ /* l0_files_might_overlap, not applicable */,
538
604
  CompactionReason::kExternalSstIngestion));
@@ -679,7 +745,10 @@ Status ExternalSstFileIngestionJob::ResetTableReader(
679
745
  new RandomAccessFileReader(std::move(sst_file), external_file,
680
746
  nullptr /*Env*/, io_tracer_));
681
747
  table_reader->reset();
682
- status = cfd_->ioptions()->table_factory->NewTableReader(
748
+ ReadOptions ro;
749
+ ro.fill_cache = ingestion_options_.fill_cache;
750
+ status = sv->mutable_cf_options.table_factory->NewTableReader(
751
+ ro,
683
752
  TableReaderOptions(
684
753
  *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
685
754
  env_options_, cfd_->internal_comparator(),
@@ -691,7 +760,9 @@ Status ExternalSstFileIngestionJob::ResetTableReader(
691
760
  /*cur_file_num*/ new_file_number,
692
761
  /* unique_id */ {}, /* largest_seqno */ 0,
693
762
  /* tail_size */ 0, user_defined_timestamps_persisted),
694
- std::move(sst_file_reader), file_to_ingest->file_size, table_reader);
763
+ std::move(sst_file_reader), file_to_ingest->file_size, table_reader,
764
+ // No need to prefetch index/filter if caching is not needed.
765
+ /*prefetch_index_and_filter_in_cache=*/ingestion_options_.fill_cache);
695
766
  return status;
696
767
  }
697
768
 
@@ -707,6 +778,7 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
707
778
  // Get table version
708
779
  auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
709
780
  if (version_iter == uprops.end()) {
781
+ assert(!SstFileWriter::CreatedBySstFileWriter(*props));
710
782
  if (!ingestion_options_.allow_db_generated_files) {
711
783
  return Status::Corruption("External file version not found");
712
784
  } else {
@@ -715,6 +787,7 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
715
787
  file_to_ingest->version = 0;
716
788
  }
717
789
  } else {
790
+ assert(SstFileWriter::CreatedBySstFileWriter(*props));
718
791
  file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
719
792
  }
720
793
 
@@ -787,9 +860,7 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
787
860
  // `TableReader` is initialized with `user_defined_timestamps_persisted` flag
788
861
  // to be true. If its value changed to false after this sanity check, we
789
862
  // need to reset the `TableReader`.
790
- auto ucmp = cfd_->user_comparator();
791
- assert(ucmp);
792
- if (ucmp->timestamp_size() > 0 &&
863
+ if (ucmp_->timestamp_size() > 0 &&
793
864
  !file_to_ingest->user_defined_timestamps_persisted) {
794
865
  s = ResetTableReader(external_file, new_file_number,
795
866
  file_to_ingest->user_defined_timestamps_persisted, sv,
@@ -839,6 +910,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
839
910
  // TODO: plumb Env::IOActivity, Env::IOPriority
840
911
  ReadOptions ro;
841
912
  ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
913
+ ro.fill_cache = ingestion_options_.fill_cache;
842
914
  status = table_reader->VerifyChecksum(
843
915
  ro, TableReaderCaller::kExternalSSTIngestion);
844
916
  if (!status.ok()) {
@@ -849,16 +921,12 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
849
921
  ParsedInternalKey key;
850
922
  // TODO: plumb Env::IOActivity, Env::IOPriority
851
923
  ReadOptions ro;
924
+ ro.fill_cache = ingestion_options_.fill_cache;
852
925
  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
853
926
  ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
854
927
  /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
855
928
 
856
929
  // Get first (smallest) and last (largest) key from file.
857
- file_to_ingest->smallest_internal_key =
858
- InternalKey("", 0, ValueType::kTypeValue);
859
- file_to_ingest->largest_internal_key =
860
- InternalKey("", 0, ValueType::kTypeValue);
861
- bool bounds_set = false;
862
930
  bool allow_data_in_errors = db_options_.allow_data_in_errors;
863
931
  iter->SeekToFirst();
864
932
  if (iter->Valid()) {
@@ -874,7 +942,8 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
874
942
  file_to_ingest->smallest_internal_key.SetFrom(key);
875
943
 
876
944
  Slice largest;
877
- if (strcmp(cfd_->ioptions()->table_factory->Name(), "PlainTable") == 0) {
945
+ if (strcmp(sv->mutable_cf_options.table_factory->Name(), "PlainTable") ==
946
+ 0) {
878
947
  // PlainTable iterator does not support SeekToLast().
879
948
  largest = iter->key();
880
949
  for (; iter->Valid(); iter->Next()) {
@@ -908,8 +977,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
908
977
  return Status::Corruption("External file has non zero sequence number");
909
978
  }
910
979
  file_to_ingest->largest_internal_key.SetFrom(key);
911
-
912
- bounds_set = true;
913
980
  } else if (!iter->status().ok()) {
914
981
  return iter->status();
915
982
  }
@@ -946,7 +1013,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
946
1013
  table_reader->NewRangeTombstoneIterator(ro));
947
1014
  // We may need to adjust these key bounds, depending on whether any range
948
1015
  // deletion tombstones extend past them.
949
- const Comparator* ucmp = cfd_->user_comparator();
950
1016
  if (range_del_iter != nullptr) {
951
1017
  for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
952
1018
  range_del_iter->Next()) {
@@ -962,24 +1028,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
962
1028
  "number.");
963
1029
  }
964
1030
  RangeTombstone tombstone(key, range_del_iter->value());
965
-
966
- InternalKey start_key = tombstone.SerializeKey();
967
- if (!bounds_set ||
968
- sstableKeyCompare(ucmp, start_key,
969
- file_to_ingest->smallest_internal_key) < 0) {
970
- file_to_ingest->smallest_internal_key = start_key;
971
- }
972
- InternalKey end_key = tombstone.SerializeEndKey();
973
- if (!bounds_set ||
974
- sstableKeyCompare(ucmp, end_key,
975
- file_to_ingest->largest_internal_key) > 0) {
976
- file_to_ingest->largest_internal_key = end_key;
977
- }
978
- bounds_set = true;
1031
+ file_range_checker_.MaybeUpdateRange(tombstone.SerializeKey(),
1032
+ tombstone.SerializeEndKey(),
1033
+ file_to_ingest);
979
1034
  }
980
1035
  }
981
1036
 
982
- const size_t ts_sz = ucmp->timestamp_size();
1037
+ const size_t ts_sz = ucmp_->timestamp_size();
983
1038
  Slice smallest = file_to_ingest->smallest_internal_key.user_key();
984
1039
  Slice largest = file_to_ingest->largest_internal_key.user_key();
985
1040
  if (ts_sz > 0) {
@@ -1008,16 +1063,19 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
1008
1063
  Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
1009
1064
  SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
1010
1065
  SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
1011
- SequenceNumber* assigned_seqno) {
1066
+ SequenceNumber* assigned_seqno,
1067
+ std::optional<int> prev_batch_uppermost_level) {
1012
1068
  Status status;
1013
1069
  *assigned_seqno = 0;
1014
- auto ucmp = cfd_->user_comparator();
1015
- const size_t ts_sz = ucmp->timestamp_size();
1070
+ const size_t ts_sz = ucmp_->timestamp_size();
1071
+ assert(!prev_batch_uppermost_level.has_value() ||
1072
+ prev_batch_uppermost_level.value() < cfd_->NumberLevels());
1073
+ bool must_assign_to_l0 = prev_batch_uppermost_level.has_value() &&
1074
+ prev_batch_uppermost_level.value() == 0;
1016
1075
  if (force_global_seqno || files_overlap_ ||
1017
- compaction_style == kCompactionStyleFIFO) {
1076
+ compaction_style == kCompactionStyleFIFO || must_assign_to_l0) {
1018
1077
  *assigned_seqno = last_seqno + 1;
1019
- // If files overlap, we have to ingest them at level 0.
1020
- if (files_overlap_ || compaction_style == kCompactionStyleFIFO) {
1078
+ if (compaction_style == kCompactionStyleFIFO || must_assign_to_l0) {
1021
1079
  assert(ts_sz == 0);
1022
1080
  file_to_ingest->picked_level = 0;
1023
1081
  if (ingestion_options_.fail_if_not_bottommost_level &&
@@ -1034,11 +1092,16 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
1034
1092
  Arena arena;
1035
1093
  // TODO: plumb Env::IOActivity, Env::IOPriority
1036
1094
  ReadOptions ro;
1095
+ ro.fill_cache = ingestion_options_.fill_cache;
1037
1096
  ro.total_order_seek = true;
1038
1097
  int target_level = 0;
1039
1098
  auto* vstorage = cfd_->current()->storage_info();
1099
+ assert(!must_assign_to_l0);
1100
+ int exclusive_end_level = prev_batch_uppermost_level.has_value()
1101
+ ? prev_batch_uppermost_level.value()
1102
+ : cfd_->NumberLevels();
1040
1103
 
1041
- for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
1104
+ for (int lvl = 0; lvl < exclusive_end_level; lvl++) {
1042
1105
  if (lvl > 0 && lvl < vstorage->base_level()) {
1043
1106
  continue;
1044
1107
  }
@@ -1065,8 +1128,6 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
1065
1128
  overlap_with_db = true;
1066
1129
  break;
1067
1130
  }
1068
- } else if (compaction_style == kCompactionStyleUniversal) {
1069
- continue;
1070
1131
  }
1071
1132
 
1072
1133
  // We don't overlap with any keys in this level, but we still need to check
@@ -25,13 +25,74 @@ namespace ROCKSDB_NAMESPACE {
25
25
  class Directories;
26
26
  class SystemClock;
27
27
 
28
- struct IngestedFileInfo {
29
- // External file path
30
- std::string external_file_path;
31
- // Smallest internal key in external file
28
+ struct KeyRangeInfo {
29
+ // Smallest internal key in an external file or for a batch of external files.
32
30
  InternalKey smallest_internal_key;
33
- // Largest internal key in external file
31
+ // Largest internal key in an external file or for a batch of external files.
34
32
  InternalKey largest_internal_key;
33
+
34
+ bool empty() const {
35
+ return smallest_internal_key.size() == 0 &&
36
+ largest_internal_key.size() == 0;
37
+ }
38
+ };
39
+
40
+ // Helper class to apply SST file key range checks to the external files.
41
+ class ExternalFileRangeChecker {
42
+ public:
43
+ explicit ExternalFileRangeChecker(const Comparator* ucmp) : ucmp_(ucmp) {}
44
+
45
+ // Operator used for sorting ranges.
46
+ bool operator()(const KeyRangeInfo* prev_range,
47
+ const KeyRangeInfo* range) const {
48
+ assert(prev_range);
49
+ assert(range);
50
+ return sstableKeyCompare(ucmp_, prev_range->smallest_internal_key,
51
+ range->smallest_internal_key) < 0;
52
+ }
53
+
54
+ // Check whether `range` overlaps with `prev_range`. `ranges_sorted` can be
55
+ // set to true when the inputs are already sorted based on the sorting logic
56
+ // provided by this checker's operator(), which can help simplify the check.
57
+ bool OverlapsWithPrev(const KeyRangeInfo* prev_range,
58
+ const KeyRangeInfo* range,
59
+ bool ranges_sorted = false) const {
60
+ assert(prev_range);
61
+ assert(range);
62
+ if (prev_range->empty() || range->empty()) {
63
+ return false;
64
+ }
65
+ if (ranges_sorted) {
66
+ return sstableKeyCompare(ucmp_, prev_range->largest_internal_key,
67
+ range->smallest_internal_key) >= 0;
68
+ }
69
+
70
+ return sstableKeyCompare(ucmp_, prev_range->largest_internal_key,
71
+ range->smallest_internal_key) >= 0 &&
72
+ sstableKeyCompare(ucmp_, prev_range->smallest_internal_key,
73
+ range->largest_internal_key) <= 0;
74
+ }
75
+
76
+ void MaybeUpdateRange(const InternalKey& start_key,
77
+ const InternalKey& end_key, KeyRangeInfo* range) const {
78
+ assert(range);
79
+ if (range->smallest_internal_key.size() == 0 ||
80
+ sstableKeyCompare(ucmp_, start_key, range->smallest_internal_key) < 0) {
81
+ range->smallest_internal_key = start_key;
82
+ }
83
+ if (range->largest_internal_key.size() == 0 ||
84
+ sstableKeyCompare(ucmp_, end_key, range->largest_internal_key) > 0) {
85
+ range->largest_internal_key = end_key;
86
+ }
87
+ }
88
+
89
+ private:
90
+ const Comparator* ucmp_;
91
+ };
92
+
93
+ struct IngestedFileInfo : public KeyRangeInfo {
94
+ // External file path
95
+ std::string external_file_path;
35
96
  // NOTE: use below two fields for all `*Overlap*` types of checks instead of
36
97
  // smallest_internal_key.user_key() and largest_internal_key.user_key().
37
98
  // The smallest / largest user key contained in the file for key range checks.
@@ -94,6 +155,30 @@ struct IngestedFileInfo {
94
155
  bool user_defined_timestamps_persisted = true;
95
156
  };
96
157
 
158
+ // A batch of files.
159
+ struct FileBatchInfo : public KeyRangeInfo {
160
+ autovector<IngestedFileInfo*> files;
161
+ // When true, `smallest_internal_key` and `largest_internal_key` will be
162
+ // tracked and updated as new files get added via `AddFile`. When false, we
163
+ // bypass this tracking. This is used when all the input external files
164
+ // are already checked and not overlapping, and they just need to be added
165
+ // into one default batch.
166
+ bool track_batch_range;
167
+
168
+ void AddFile(IngestedFileInfo* file,
169
+ const ExternalFileRangeChecker& key_range_checker) {
170
+ assert(file);
171
+ files.push_back(file);
172
+ if (track_batch_range) {
173
+ key_range_checker.MaybeUpdateRange(file->smallest_internal_key,
174
+ file->largest_internal_key, this);
175
+ }
176
+ }
177
+
178
+ explicit FileBatchInfo(bool _track_batch_range)
179
+ : track_batch_range(_track_batch_range) {}
180
+ };
181
+
97
182
  class ExternalSstFileIngestionJob {
98
183
  public:
99
184
  ExternalSstFileIngestionJob(
@@ -108,6 +193,8 @@ class ExternalSstFileIngestionJob {
108
193
  fs_(db_options.fs, io_tracer),
109
194
  versions_(versions),
110
195
  cfd_(cfd),
196
+ ucmp_(cfd ? cfd->user_comparator() : nullptr),
197
+ file_range_checker_(ucmp_),
111
198
  db_options_(db_options),
112
199
  mutable_db_options_(mutable_db_options),
113
200
  env_options_(env_options),
@@ -119,10 +206,14 @@ class ExternalSstFileIngestionJob {
119
206
  consumed_seqno_count_(0),
120
207
  io_tracer_(io_tracer) {
121
208
  assert(directories != nullptr);
209
+ assert(cfd_);
210
+ assert(ucmp_);
122
211
  }
123
212
 
124
213
  ~ExternalSstFileIngestionJob() { UnregisterRange(); }
125
214
 
215
+ ColumnFamilyData* GetColumnFamilyData() const { return cfd_; }
216
+
126
217
  // Prepare the job by copying external files into the DB.
127
218
  Status Prepare(const std::vector<std::string>& external_files_paths,
128
219
  const std::vector<std::string>& files_checksums,
@@ -140,6 +231,8 @@ class ExternalSstFileIngestionJob {
140
231
  // Thread-safe
141
232
  Status NeedsFlush(bool* flush_needed, SuperVersion* super_version);
142
233
 
234
+ void SetFlushedBeforeRun() { flushed_before_run_ = true; }
235
+
143
236
  // Will execute the ingestion job and prepare edit() to be applied.
144
237
  // REQUIRES: Mutex held
145
238
  Status Run();
@@ -194,15 +287,38 @@ class ExternalSstFileIngestionJob {
194
287
  IngestedFileInfo* file_to_ingest,
195
288
  SuperVersion* sv);
196
289
 
290
+ // If the input files' key ranges overlap each other, this function divides
291
+ // them in the user-specified order into multiple batches, where the files
292
+ // within a batch do not overlap with each other, but key range could overlap
293
+ // between batches.
294
+ // If the input files' key ranges don't overlap each other, they always just
295
+ // make one batch.
296
+ void DivideInputFilesIntoBatches();
297
+
298
+ // Assign level for the files in one batch. The files within one batch are not
299
+ // overlapping, and we assign level to each file one after another.
300
+ // If `prev_batch_uppermost_level` is specified, all files in this batch will
301
+ // be assigned to levels that are higher than `prev_batch_uppermost_level`.
302
+ // The uppermost level used by this batch of files is tracked too, so that it
303
+ // can be used by the next batch.
304
+ // REQUIRES: Mutex held
305
+ Status AssignLevelsForOneBatch(FileBatchInfo& batch,
306
+ SuperVersion* super_version,
307
+ bool force_global_seqno,
308
+ SequenceNumber* last_seqno,
309
+ int* batch_uppermost_level,
310
+ std::optional<int> prev_batch_uppermost_level);
311
+
197
312
  // Assign `file_to_ingest` the appropriate sequence number and the lowest
198
313
  // possible level that it can be ingested to according to compaction_style.
314
+ // If `prev_batch_uppermost_level` is specified, the file will only be
315
+ // assigned to levels that are higher than `prev_batch_uppermost_level`.
199
316
  // REQUIRES: Mutex held
200
- Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv,
201
- bool force_global_seqno,
202
- CompactionStyle compaction_style,
203
- SequenceNumber last_seqno,
204
- IngestedFileInfo* file_to_ingest,
205
- SequenceNumber* assigned_seqno);
317
+ Status AssignLevelAndSeqnoForIngestedFile(
318
+ SuperVersion* sv, bool force_global_seqno,
319
+ CompactionStyle compaction_style, SequenceNumber last_seqno,
320
+ IngestedFileInfo* file_to_ingest, SequenceNumber* assigned_seqno,
321
+ std::optional<int> prev_batch_uppermost_level);
206
322
 
207
323
  // File that we want to ingest behind always goes to the lowest level;
208
324
  // we just check that it fits in the level, that DB allows ingest_behind,
@@ -237,11 +353,14 @@ class ExternalSstFileIngestionJob {
237
353
  FileSystemPtr fs_;
238
354
  VersionSet* versions_;
239
355
  ColumnFamilyData* cfd_;
356
+ const Comparator* ucmp_;
357
+ ExternalFileRangeChecker file_range_checker_;
240
358
  const ImmutableDBOptions& db_options_;
241
359
  const MutableDBOptions& mutable_db_options_;
242
360
  const EnvOptions& env_options_;
243
361
  SnapshotList* db_snapshots_;
244
362
  autovector<IngestedFileInfo> files_to_ingest_;
363
+ std::vector<FileBatchInfo> file_batches_to_ingest_;
245
364
  const IngestExternalFileOptions& ingestion_options_;
246
365
  Directories* directories_;
247
366
  EventLogger* event_logger_;
@@ -256,6 +375,10 @@ class ExternalSstFileIngestionJob {
256
375
  bool need_generate_file_checksum_{true};
257
376
  std::shared_ptr<IOTracer> io_tracer_;
258
377
 
378
+ // Flag indicating whether the column family is flushed after `Prepare` and
379
+ // before `Run`.
380
+ bool flushed_before_run_{false};
381
+
259
382
  // Below are variables used in (un)registering range for this ingestion job
260
383
  //
261
384
  // FileMetaData used in inputs of compactions equivalent to this ingestion