@nxtedition/rocksdb 13.5.13 → 15.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232)
  1. package/binding.cc +55 -180
  2. package/binding.gyp +2 -2
  3. package/chained-batch.js +9 -16
  4. package/deps/rocksdb/rocksdb/BUCK +18 -1
  5. package/deps/rocksdb/rocksdb/CMakeLists.txt +10 -3
  6. package/deps/rocksdb/rocksdb/Makefile +20 -9
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +90 -13
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +88 -75
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.h +44 -36
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +184 -148
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +5 -11
  12. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +116 -47
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +3 -6
  15. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +1 -1
  16. package/deps/rocksdb/rocksdb/db/builder.cc +4 -2
  17. package/deps/rocksdb/rocksdb/db/c.cc +207 -0
  18. package/deps/rocksdb/rocksdb/db/c_test.c +72 -0
  19. package/deps/rocksdb/rocksdb/db/column_family.cc +3 -2
  20. package/deps/rocksdb/rocksdb/db/column_family.h +5 -0
  21. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +2 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +51 -38
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +29 -12
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +5 -10
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +566 -366
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +131 -4
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +1 -0
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +7 -0
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +4 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +13 -14
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +12 -7
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -10
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +97 -76
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +11 -14
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +1 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +16 -3
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +1 -0
  40. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +448 -1
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +22 -20
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +4 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +5 -5
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +7 -3
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -1
  46. package/deps/rocksdb/rocksdb/db/db_iter.cc +104 -0
  47. package/deps/rocksdb/rocksdb/db/db_iter.h +4 -11
  48. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +331 -58
  49. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +129 -0
  50. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +64 -0
  51. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +40 -0
  52. package/deps/rocksdb/rocksdb/db/db_test2.cc +25 -15
  53. package/deps/rocksdb/rocksdb/db/db_test_util.cc +42 -24
  54. package/deps/rocksdb/rocksdb/db/db_test_util.h +29 -14
  55. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +69 -36
  56. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +0 -1
  57. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  58. package/deps/rocksdb/rocksdb/db/experimental.cc +5 -4
  59. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +8 -1
  60. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +275 -79
  61. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +23 -5
  62. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +591 -175
  63. package/deps/rocksdb/rocksdb/db/flush_job.cc +3 -4
  64. package/deps/rocksdb/rocksdb/db/log_reader.cc +5 -2
  65. package/deps/rocksdb/rocksdb/db/memtable.cc +84 -35
  66. package/deps/rocksdb/rocksdb/db/memtable.h +39 -34
  67. package/deps/rocksdb/rocksdb/db/merge_helper.cc +1 -0
  68. package/deps/rocksdb/rocksdb/db/merge_operator.cc +1 -1
  69. package/deps/rocksdb/rocksdb/db/multi_scan.cc +11 -5
  70. package/deps/rocksdb/rocksdb/db/version_edit.cc +1 -1
  71. package/deps/rocksdb/rocksdb/db/version_edit.h +1 -1
  72. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +34 -14
  73. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +28 -5
  74. package/deps/rocksdb/rocksdb/db/version_set.cc +159 -14
  75. package/deps/rocksdb/rocksdb/db/version_set.h +2 -0
  76. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -1
  77. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +60 -0
  78. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +16 -1
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +75 -10
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.cc +28 -0
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +2 -0
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +31 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +50 -2
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +57 -0
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +0 -4
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +266 -35
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +5 -0
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +0 -6
  89. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +18 -2
  90. package/deps/rocksdb/rocksdb/env/env.cc +12 -0
  91. package/deps/rocksdb/rocksdb/env/env_test.cc +18 -0
  92. package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +2 -0
  93. package/deps/rocksdb/rocksdb/env/fs_posix.cc +9 -5
  94. package/deps/rocksdb/rocksdb/env/io_posix.cc +4 -2
  95. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +19 -0
  96. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +33 -31
  97. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +42 -9
  98. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +93 -0
  99. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +43 -49
  100. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +4 -3
  101. package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +8 -6
  102. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +487 -0
  103. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +11 -12
  104. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +135 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -0
  106. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +12 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -1
  108. package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +8 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +12 -8
  110. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +3 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +19 -9
  112. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +219 -24
  113. package/deps/rocksdb/rocksdb/include/rocksdb/point_lock_bench_tool.h +14 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +2 -2
  115. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +1 -1
  116. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +7 -0
  117. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +16 -0
  118. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +16 -4
  119. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +13 -0
  120. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +4 -0
  121. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +0 -2
  122. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +45 -0
  123. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +1 -1
  124. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +1 -1
  125. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +6 -1
  126. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +21 -0
  127. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  128. package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +3 -3
  129. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +77 -51
  130. package/deps/rocksdb/rocksdb/memtable/skiplist.h +10 -13
  131. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +16 -7
  132. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +9 -4
  133. package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +2 -0
  134. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +6 -0
  135. package/deps/rocksdb/rocksdb/options/cf_options.cc +13 -1
  136. package/deps/rocksdb/rocksdb/options/cf_options.h +6 -2
  137. package/deps/rocksdb/rocksdb/options/options.cc +2 -0
  138. package/deps/rocksdb/rocksdb/options/options_helper.cc +9 -8
  139. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +9 -5
  140. package/deps/rocksdb/rocksdb/port/mmap.cc +1 -1
  141. package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +51 -0
  142. package/deps/rocksdb/rocksdb/port/win/xpress_win.h +4 -0
  143. package/deps/rocksdb/rocksdb/src.mk +8 -2
  144. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1125 -765
  145. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +35 -24
  146. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +29 -4
  147. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +732 -256
  148. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +225 -16
  149. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +102 -26
  150. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -1
  151. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +2 -75
  152. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +433 -141
  153. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +2 -0
  154. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +17 -10
  155. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy_impl.h +20 -0
  156. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +112 -85
  157. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +191 -36
  158. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +2 -2
  159. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  160. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +108 -31
  161. package/deps/rocksdb/rocksdb/table/external_table.cc +7 -3
  162. package/deps/rocksdb/rocksdb/table/format.cc +6 -12
  163. package/deps/rocksdb/rocksdb/table/format.h +10 -0
  164. package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
  165. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +1 -1
  166. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -1
  167. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +5 -0
  168. package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -1
  169. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +118 -46
  170. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +9 -8
  171. package/deps/rocksdb/rocksdb/table/table_builder.h +5 -0
  172. package/deps/rocksdb/rocksdb/table/table_properties.cc +16 -0
  173. package/deps/rocksdb/rocksdb/table/table_test.cc +1540 -155
  174. package/deps/rocksdb/rocksdb/test_util/testutil.h +21 -5
  175. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +26 -5
  176. package/deps/rocksdb/rocksdb/tools/ldb.cc +1 -2
  177. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +2 -0
  178. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -3
  179. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +133 -165
  180. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +173 -64
  181. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +69 -0
  182. package/deps/rocksdb/rocksdb/util/atomic.h +6 -0
  183. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +29 -20
  184. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +10 -6
  185. package/deps/rocksdb/rocksdb/util/bit_fields.h +338 -0
  186. package/deps/rocksdb/rocksdb/util/coding.h +3 -3
  187. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -2
  188. package/deps/rocksdb/rocksdb/util/compression.cc +777 -82
  189. package/deps/rocksdb/rocksdb/util/compression.h +5 -0
  190. package/deps/rocksdb/rocksdb/util/compression_test.cc +5 -3
  191. package/deps/rocksdb/rocksdb/util/dynamic_bloom.cc +2 -2
  192. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +15 -14
  193. package/deps/rocksdb/rocksdb/util/interval_test.cc +102 -0
  194. package/deps/rocksdb/rocksdb/util/semaphore.h +164 -0
  195. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +10 -6
  196. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -2
  197. package/deps/rocksdb/rocksdb/util/slice_test.cc +136 -0
  198. package/deps/rocksdb/rocksdb/util/status.cc +1 -0
  199. package/deps/rocksdb/rocksdb/util/string_util.cc +2 -16
  200. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +1 -1
  201. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -1
  202. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +7 -4
  203. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +35 -14
  204. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_test.cc +2 -0
  205. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +5 -2
  206. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/any_lock_manager_test.h +244 -0
  207. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench.cc +18 -0
  208. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench_tool.cc +159 -0
  209. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +1244 -161
  210. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +66 -12
  211. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_stress_test.cc +103 -0
  212. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +1275 -8
  213. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +40 -262
  214. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test_common.h +78 -0
  215. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_validation_test_runner.h +469 -0
  216. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +2 -6
  217. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +4 -0
  218. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +9 -1
  219. package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +18 -9
  220. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +2 -0
  221. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +2 -1
  222. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +72 -44
  223. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +92 -15
  224. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +6 -20
  225. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +143 -112
  226. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +23 -16
  227. package/index.js +18 -42
  228. package/package.json +1 -1
  229. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  230. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  231. package/util.h +38 -12
  232. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc +0 -17
@@ -170,7 +170,11 @@ CompactionJob::CompactionJob(
170
170
  blob_output_directory_(blob_output_directory),
171
171
  db_mutex_(db_mutex),
172
172
  db_error_handler_(db_error_handler),
173
- earliest_snapshot_(job_context->GetEarliestSnapshotSequence()),
173
+ // job_context cannot be nullptr, but we will assert later in the body of
174
+ // the constructor.
175
+ earliest_snapshot_(job_context
176
+ ? job_context->GetEarliestSnapshotSequence()
177
+ : kMaxSequenceNumber),
174
178
  job_context_(job_context),
175
179
  table_cache_(std::move(table_cache)),
176
180
  event_logger_(event_logger),
@@ -185,6 +189,7 @@ CompactionJob::CompactionJob(
185
189
  bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
186
190
  assert(job_stats_ != nullptr);
187
191
  assert(log_buffer_ != nullptr);
192
+ assert(job_context);
188
193
  assert(job_context->snapshot_context_initialized);
189
194
 
190
195
  const auto* cfd = compact_->compaction->column_family_data();
@@ -668,16 +673,17 @@ void CompactionJob::GenSubcompactionBoundaries() {
668
673
  extra_num_subcompaction_threads_reserved_));
669
674
  }
670
675
 
671
- Status CompactionJob::Run() {
676
+ void CompactionJob::InitializeCompactionRun() {
672
677
  AutoThreadOperationStageUpdater stage_updater(
673
678
  ThreadStatus::STAGE_COMPACTION_RUN);
674
679
  TEST_SYNC_POINT("CompactionJob::Run():Start");
675
680
  log_buffer_->FlushBufferToLog();
676
681
  LogCompaction();
682
+ }
677
683
 
684
+ void CompactionJob::RunSubcompactions() {
678
685
  const size_t num_threads = compact_->sub_compact_states.size();
679
686
  assert(num_threads > 0);
680
- const uint64_t start_micros = db_options_.clock->NowMicros();
681
687
  compact_->compaction->GetOrInitInputTableProperties();
682
688
 
683
689
  // Launch a thread for each of subcompactions 1...num_threads-1
@@ -696,25 +702,43 @@ Status CompactionJob::Run() {
696
702
  for (auto& thread : thread_pool) {
697
703
  thread.join();
698
704
  }
705
+ RemoveEmptyOutputs();
706
+
707
+ ReleaseSubcompactionResources();
708
+ TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources");
709
+ }
699
710
 
711
+ void CompactionJob::UpdateTimingStats(uint64_t start_micros) {
700
712
  internal_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
701
713
 
702
714
  for (auto& state : compact_->sub_compact_states) {
703
715
  internal_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
704
- state.RemoveLastEmptyOutput();
705
716
  }
706
717
 
707
718
  RecordTimeToHistogram(stats_, COMPACTION_TIME,
708
719
  internal_stats_.output_level_stats.micros);
709
720
  RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
710
721
  internal_stats_.output_level_stats.cpu_micros);
722
+ }
711
723
 
712
- TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
724
+ void CompactionJob::RemoveEmptyOutputs() {
725
+ for (auto& state : compact_->sub_compact_states) {
726
+ state.RemoveLastEmptyOutput();
727
+ }
728
+ }
729
+
730
+ bool CompactionJob::HasNewBlobFiles() const {
731
+ for (const auto& state : compact_->sub_compact_states) {
732
+ if (state.Current().HasBlobFileAdditions()) {
733
+ return true;
734
+ }
735
+ }
736
+ return false;
737
+ }
713
738
 
714
- // Check if any thread encountered an error during execution
739
+ Status CompactionJob::CollectSubcompactionErrors() {
715
740
  Status status;
716
741
  IOStatus io_s;
717
- bool wrote_new_blob_files = false;
718
742
 
719
743
  for (const auto& state : compact_->sub_compact_states) {
720
744
  if (!state.status.ok()) {
@@ -722,125 +746,131 @@ Status CompactionJob::Run() {
722
746
  io_s = state.io_status;
723
747
  break;
724
748
  }
725
-
726
- if (state.Current().HasBlobFileAdditions()) {
727
- wrote_new_blob_files = true;
728
- }
729
749
  }
730
750
 
731
751
  if (io_status_.ok()) {
732
752
  io_status_ = io_s;
733
753
  }
734
- if (status.ok()) {
735
- constexpr IODebugContext* dbg = nullptr;
736
754
 
737
- if (output_directory_) {
738
- io_s = output_directory_->FsyncWithDirOptions(
739
- IOOptions(), dbg,
740
- DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
741
- }
755
+ return status;
756
+ }
742
757
 
743
- if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
744
- blob_output_directory_ != output_directory_) {
745
- io_s = blob_output_directory_->FsyncWithDirOptions(
746
- IOOptions(), dbg,
747
- DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
748
- }
758
+ Status CompactionJob::SyncOutputDirectories() {
759
+ Status status;
760
+ IOStatus io_s;
761
+ constexpr IODebugContext* dbg = nullptr;
762
+ const bool wrote_new_blob_files = HasNewBlobFiles();
763
+ if (output_directory_) {
764
+ io_s = output_directory_->FsyncWithDirOptions(
765
+ IOOptions(), dbg,
766
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
749
767
  }
768
+
769
+ if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
770
+ blob_output_directory_ != output_directory_) {
771
+ io_s = blob_output_directory_->FsyncWithDirOptions(
772
+ IOOptions(), dbg,
773
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
774
+ }
775
+
750
776
  if (io_status_.ok()) {
751
777
  io_status_ = io_s;
752
778
  }
753
779
  if (status.ok()) {
754
780
  status = io_s;
755
781
  }
756
- if (status.ok()) {
757
- thread_pool.clear();
758
- std::vector<const CompactionOutputs::Output*> files_output;
759
- for (const auto& state : compact_->sub_compact_states) {
760
- for (const auto& output : state.GetOutputs()) {
761
- files_output.emplace_back(&output);
762
- }
782
+
783
+ return status;
784
+ }
785
+
786
+ Status CompactionJob::VerifyOutputFiles() {
787
+ Status status;
788
+ std::vector<port::Thread> thread_pool;
789
+ std::vector<const CompactionOutputs::Output*> files_output;
790
+ for (const auto& state : compact_->sub_compact_states) {
791
+ for (const auto& output : state.GetOutputs()) {
792
+ files_output.emplace_back(&output);
763
793
  }
764
- ColumnFamilyData* cfd = compact_->compaction->column_family_data();
765
- std::atomic<size_t> next_file_idx(0);
766
- auto verify_table = [&](Status& output_status) {
767
- while (true) {
768
- size_t file_idx = next_file_idx.fetch_add(1);
769
- if (file_idx >= files_output.size()) {
770
- break;
771
- }
772
- // Verify that the table is usable
773
- // We set for_compaction to false and don't
774
- // OptimizeForCompactionTableRead here because this is a special case
775
- // after we finish the table building No matter whether
776
- // use_direct_io_for_flush_and_compaction is true, we will regard this
777
- // verification as user reads since the goal is to cache it here for
778
- // further user reads
779
- ReadOptions verify_table_read_options(Env::IOActivity::kCompaction);
780
- verify_table_read_options.rate_limiter_priority =
781
- GetRateLimiterPriority();
782
- InternalIterator* iter = cfd->table_cache()->NewIterator(
783
- verify_table_read_options, file_options_,
784
- cfd->internal_comparator(), files_output[file_idx]->meta,
785
- /*range_del_agg=*/nullptr,
786
- compact_->compaction->mutable_cf_options(),
787
- /*table_reader_ptr=*/nullptr,
788
- cfd->internal_stats()->GetFileReadHist(
789
- compact_->compaction->output_level()),
790
- TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
791
- /*skip_filters=*/false, compact_->compaction->output_level(),
792
- MaxFileSizeForL0MetaPin(compact_->compaction->mutable_cf_options()),
793
- /*smallest_compaction_key=*/nullptr,
794
- /*largest_compaction_key=*/nullptr,
795
- /*allow_unprepared_value=*/false);
796
- auto s = iter->status();
797
-
798
- if (s.ok() && paranoid_file_checks_) {
799
- OutputValidator validator(cfd->internal_comparator(),
800
- /*_enable_hash=*/true);
801
- for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
802
- s = validator.Add(iter->key(), iter->value());
803
- if (!s.ok()) {
804
- break;
805
- }
806
- }
807
- if (s.ok()) {
808
- s = iter->status();
809
- }
810
- if (s.ok() &&
811
- !validator.CompareValidator(files_output[file_idx]->validator)) {
812
- s = Status::Corruption("Paranoid checksums do not match");
794
+ }
795
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
796
+ std::atomic<size_t> next_file_idx(0);
797
+ auto verify_table = [&](Status& output_status) {
798
+ while (true) {
799
+ size_t file_idx = next_file_idx.fetch_add(1);
800
+ if (file_idx >= files_output.size()) {
801
+ break;
802
+ }
803
+ // Verify that the table is usable
804
+ // We set for_compaction to false and don't
805
+ // OptimizeForCompactionTableRead here because this is a special case
806
+ // after we finish the table building No matter whether
807
+ // use_direct_io_for_flush_and_compaction is true, we will regard this
808
+ // verification as user reads since the goal is to cache it here for
809
+ // further user reads
810
+ ReadOptions verify_table_read_options(Env::IOActivity::kCompaction);
811
+ verify_table_read_options.rate_limiter_priority =
812
+ GetRateLimiterPriority();
813
+ InternalIterator* iter = cfd->table_cache()->NewIterator(
814
+ verify_table_read_options, file_options_, cfd->internal_comparator(),
815
+ files_output[file_idx]->meta,
816
+ /*range_del_agg=*/nullptr, compact_->compaction->mutable_cf_options(),
817
+ /*table_reader_ptr=*/nullptr,
818
+ cfd->internal_stats()->GetFileReadHist(
819
+ compact_->compaction->output_level()),
820
+ TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
821
+ /*skip_filters=*/false, compact_->compaction->output_level(),
822
+ MaxFileSizeForL0MetaPin(compact_->compaction->mutable_cf_options()),
823
+ /*smallest_compaction_key=*/nullptr,
824
+ /*largest_compaction_key=*/nullptr,
825
+ /*allow_unprepared_value=*/false);
826
+ auto s = iter->status();
827
+
828
+ if (s.ok() && paranoid_file_checks_) {
829
+ OutputValidator validator(cfd->internal_comparator(),
830
+ /*_enable_hash=*/true);
831
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
832
+ s = validator.Add(iter->key(), iter->value());
833
+ if (!s.ok()) {
834
+ break;
813
835
  }
814
836
  }
815
-
816
- delete iter;
817
-
818
- if (!s.ok()) {
819
- output_status = s;
820
- break;
837
+ if (s.ok()) {
838
+ s = iter->status();
839
+ }
840
+ if (s.ok() &&
841
+ !validator.CompareValidator(files_output[file_idx]->validator)) {
842
+ s = Status::Corruption("Paranoid checksums do not match");
821
843
  }
822
844
  }
823
- };
824
- for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
825
- thread_pool.emplace_back(
826
- verify_table, std::ref(compact_->sub_compact_states[i].status));
827
- }
828
- verify_table(compact_->sub_compact_states[0].status);
829
- for (auto& thread : thread_pool) {
830
- thread.join();
831
- }
832
845
 
833
- for (const auto& state : compact_->sub_compact_states) {
834
- if (!state.status.ok()) {
835
- status = state.status;
846
+ delete iter;
847
+
848
+ if (!s.ok()) {
849
+ output_status = s;
836
850
  break;
837
851
  }
838
852
  }
853
+ };
854
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
855
+ thread_pool.emplace_back(verify_table,
856
+ std::ref(compact_->sub_compact_states[i].status));
857
+ }
858
+ verify_table(compact_->sub_compact_states[0].status);
859
+ for (auto& thread : thread_pool) {
860
+ thread.join();
839
861
  }
840
862
 
841
- ReleaseSubcompactionResources();
842
- TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources");
863
+ for (const auto& state : compact_->sub_compact_states) {
864
+ if (!state.status.ok()) {
865
+ status = state.status;
866
+ break;
867
+ }
868
+ }
843
869
 
870
+ return status;
871
+ }
872
+
873
+ void CompactionJob::SetOutputTableProperties() {
844
874
  for (const auto& state : compact_->sub_compact_states) {
845
875
  for (const auto& output : state.GetOutputs()) {
846
876
  auto fn =
@@ -850,7 +880,9 @@ Status CompactionJob::Run() {
850
880
  output.table_properties);
851
881
  }
852
882
  }
883
+ }
853
884
 
885
+ void CompactionJob::AggregateSubcompactionOutputAndJobStats() {
854
886
  // Before the compaction starts, is_remote_compaction was set to true if
855
887
  // compaction_service is set. We now know whether each sub_compaction was
856
888
  // done remotely or not. Reset is_remote_compaction back to false and allow
@@ -859,68 +891,88 @@ Status CompactionJob::Run() {
859
891
 
860
892
  // Finish up all bookkeeping to unify the subcompaction results.
861
893
  compact_->AggregateCompactionStats(internal_stats_, *job_stats_);
894
+ }
862
895
 
863
- uint64_t num_input_range_del = 0;
864
- bool ok = BuildStatsFromInputTableProperties(&num_input_range_del);
865
- // (Sub)compactions returned ok, do sanity check on the number of input
866
- // keys.
867
- if (status.ok() && ok) {
868
- if (job_stats_->has_num_input_records) {
869
- status = VerifyInputRecordCount(num_input_range_del);
870
- if (!status.ok()) {
871
- ROCKS_LOG_WARN(
872
- db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
873
- compact_->compaction->column_family_data()->GetName().c_str(),
874
- job_context_->job_id, status.ToString().c_str());
875
- }
896
+ Status CompactionJob::VerifyCompactionRecordCounts(
897
+ bool stats_built_from_input_table_prop, uint64_t num_input_range_del) {
898
+ Status status;
899
+ if (stats_built_from_input_table_prop &&
900
+ job_stats_->has_accurate_num_input_records) {
901
+ status = VerifyInputRecordCount(num_input_range_del);
902
+ if (!status.ok()) {
903
+ return status;
876
904
  }
877
- UpdateCompactionJobInputStats(internal_stats_, num_input_range_del);
878
905
  }
879
- UpdateCompactionJobOutputStats(internal_stats_);
880
906
 
881
- // Verify number of output records
882
- // Only verify on table with format collects table properties
883
907
  const auto& mutable_cf_options = compact_->compaction->mutable_cf_options();
884
- if (status.ok() &&
885
- (mutable_cf_options.table_factory->IsInstanceOf(
908
+ if ((mutable_cf_options.table_factory->IsInstanceOf(
886
909
  TableFactory::kBlockBasedTableName()) ||
887
910
  mutable_cf_options.table_factory->IsInstanceOf(
888
- TableFactory::kPlainTableName())) &&
889
- db_options_.compaction_verify_record_count) {
890
- uint64_t total_output_num = 0;
891
- for (const auto& state : compact_->sub_compact_states) {
892
- for (const auto& output : state.GetOutputs()) {
893
- total_output_num += output.table_properties->num_entries -
894
- output.table_properties->num_range_deletions;
895
- }
896
- }
897
-
898
- uint64_t expected = internal_stats_.output_level_stats.num_output_records;
899
- if (internal_stats_.has_proximal_level_output) {
900
- expected += internal_stats_.proximal_level_stats.num_output_records;
901
- }
902
- if (expected != total_output_num) {
903
- char scratch[2345];
904
- compact_->compaction->Summary(scratch, sizeof(scratch));
905
- std::string msg =
906
- "Number of keys in compaction output SST files does not match "
907
- "number of keys added. Expected " +
908
- std::to_string(expected) + " but there are " +
909
- std::to_string(total_output_num) +
910
- " in output SST files. Compaction summary: " + scratch;
911
- ROCKS_LOG_WARN(
912
- db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
913
- compact_->compaction->column_family_data()->GetName().c_str(),
914
- job_context_->job_id, msg.c_str());
915
- status = Status::Corruption(msg);
911
+ TableFactory::kPlainTableName()))) {
912
+ status = VerifyOutputRecordCount();
913
+ if (!status.ok()) {
914
+ return status;
916
915
  }
917
916
  }
917
+ return status;
918
+ }
918
919
 
920
+ void CompactionJob::FinalizeCompactionRun(
921
+ const Status& input_status, bool stats_built_from_input_table_prop,
922
+ uint64_t num_input_range_del) {
923
+ if (stats_built_from_input_table_prop) {
924
+ UpdateCompactionJobInputStatsFromInternalStats(internal_stats_,
925
+ num_input_range_del);
926
+ }
927
+ UpdateCompactionJobOutputStatsFromInternalStats(internal_stats_);
919
928
  RecordCompactionIOStats();
929
+
920
930
  LogFlush(db_options_.info_log);
921
931
  TEST_SYNC_POINT("CompactionJob::Run():End");
922
- compact_->status = status;
923
- TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet", &status);
932
+ compact_->status = input_status;
933
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet",
934
+ const_cast<Status*>(&input_status));
935
+ }
936
+
937
+ Status CompactionJob::Run() {
938
+ InitializeCompactionRun();
939
+
940
+ const uint64_t start_micros = db_options_.clock->NowMicros();
941
+
942
+ RunSubcompactions();
943
+
944
+ UpdateTimingStats(start_micros);
945
+
946
+ TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
947
+
948
+ Status status = CollectSubcompactionErrors();
949
+
950
+ if (status.ok()) {
951
+ status = SyncOutputDirectories();
952
+ }
953
+
954
+ if (status.ok()) {
955
+ status = VerifyOutputFiles();
956
+ }
957
+
958
+ if (status.ok()) {
959
+ SetOutputTableProperties();
960
+ }
961
+
962
+ AggregateSubcompactionOutputAndJobStats();
963
+
964
+ uint64_t num_input_range_del = 0;
965
+ bool stats_built_from_input_table_prop =
966
+ UpdateInternalStatsFromInputFiles(&num_input_range_del);
967
+
968
+ if (status.ok()) {
969
+ status = VerifyCompactionRecordCounts(stats_built_from_input_table_prop,
970
+ num_input_range_del);
971
+ }
972
+
973
+ FinalizeCompactionRun(status, stats_built_from_input_table_prop,
974
+ num_input_range_del);
975
+
924
976
  return status;
925
977
  }
926
978
 
@@ -1144,58 +1196,62 @@ void CompactionJob::NotifyOnSubcompactionCompleted(
1144
1196
  }
1145
1197
  }
1146
1198
 
1147
- void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1148
- assert(sub_compact);
1149
- assert(sub_compact->compaction);
1199
+ bool CompactionJob::ShouldUseLocalCompaction(SubcompactionState* sub_compact) {
1150
1200
  if (db_options_.compaction_service) {
1151
1201
  CompactionServiceJobStatus comp_status =
1152
1202
  ProcessKeyValueCompactionWithCompactionService(sub_compact);
1153
1203
  if (comp_status != CompactionServiceJobStatus::kUseLocal) {
1154
- return;
1204
+ return false;
1155
1205
  }
1156
1206
  // fallback to local compaction
1157
1207
  assert(comp_status == CompactionServiceJobStatus::kUseLocal);
1158
1208
  sub_compact->compaction_job_stats.is_remote_compaction = false;
1159
1209
  }
1210
+ return true;
1211
+ }
1160
1212
 
1161
- uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
1213
+ CompactionJob::CompactionIOStatsSnapshot CompactionJob::InitializeIOStats() {
1214
+ CompactionIOStatsSnapshot io_stats;
1162
1215
 
1163
- ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
1216
+ if (measure_io_stats_) {
1217
+ io_stats.prev_perf_level = GetPerfLevel();
1218
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
1219
+ io_stats.prev_write_nanos = IOSTATS(write_nanos);
1220
+ io_stats.prev_fsync_nanos = IOSTATS(fsync_nanos);
1221
+ io_stats.prev_range_sync_nanos = IOSTATS(range_sync_nanos);
1222
+ io_stats.prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
1223
+ io_stats.prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
1224
+ io_stats.prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
1225
+ }
1226
+
1227
+ return io_stats;
1228
+ }
1229
+
1230
+ Status CompactionJob::SetupAndValidateCompactionFilter(
1231
+ SubcompactionState* sub_compact,
1232
+ const CompactionFilter* configured_compaction_filter,
1233
+ const CompactionFilter*& compaction_filter,
1234
+ std::unique_ptr<CompactionFilter>& compaction_filter_from_factory) {
1235
+ compaction_filter = configured_compaction_filter;
1164
1236
 
1165
- // Create compaction filter and fail the compaction if
1166
- // IgnoreSnapshots() = false because it is not supported anymore
1167
- const CompactionFilter* compaction_filter = cfd->ioptions().compaction_filter;
1168
- std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
1169
1237
  if (compaction_filter == nullptr) {
1170
1238
  compaction_filter_from_factory =
1171
1239
  sub_compact->compaction->CreateCompactionFilter();
1172
1240
  compaction_filter = compaction_filter_from_factory.get();
1173
1241
  }
1242
+
1174
1243
  if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
1175
- sub_compact->status = Status::NotSupported(
1244
+ return Status::NotSupported(
1176
1245
  "CompactionFilter::IgnoreSnapshots() = false is not supported "
1177
1246
  "anymore.");
1178
- return;
1179
1247
  }
1180
1248
 
1181
- NotifyOnSubcompactionBegin(sub_compact);
1182
-
1183
- // This is assigned after creation of SubcompactionState to simplify that
1184
- // creation across both CompactionJob and CompactionServiceCompactionJob
1185
- sub_compact->AssignRangeDelAggregator(
1186
- std::make_unique<CompactionRangeDelAggregator>(
1187
- &cfd->internal_comparator(), job_context_->snapshot_seqs,
1188
- &full_history_ts_low_, &trim_ts_));
1189
-
1190
- // TODO: since we already use C++17, should use
1191
- // std::optional<const Slice> instead.
1192
- const std::optional<Slice> start = sub_compact->start;
1193
- const std::optional<Slice> end = sub_compact->end;
1194
-
1195
- std::optional<Slice> start_without_ts;
1196
- std::optional<Slice> end_without_ts;
1249
+ return Status::OK();
1250
+ }
1197
1251
 
1198
- ReadOptions read_options;
1252
+ void CompactionJob::InitializeReadOptions(
1253
+ ColumnFamilyData* cfd, ReadOptions& read_options,
1254
+ SubcompactionKeyBoundaries& boundaries) {
1199
1255
  read_options.verify_checksums = true;
1200
1256
  read_options.fill_cache = false;
1201
1257
  read_options.rate_limiter_priority = GetRateLimiterPriority();
@@ -1206,223 +1262,207 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1206
1262
  // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
1207
1263
  read_options.total_order_seek = true;
1208
1264
 
1209
- const WriteOptions write_options(Env::IOPriority::IO_LOW,
1210
- Env::IOActivity::kCompaction);
1211
-
1212
1265
  // Remove the timestamps from boundaries because boundaries created in
1213
1266
  // GenSubcompactionBoundaries doesn't strip away the timestamp.
1214
- size_t ts_sz = cfd->user_comparator()->timestamp_size();
1215
- if (start.has_value()) {
1216
- read_options.iterate_lower_bound = &(*start);
1267
+ const size_t ts_sz = cfd->user_comparator()->timestamp_size();
1268
+
1269
+ if (boundaries.start.has_value()) {
1270
+ read_options.iterate_lower_bound = &(*boundaries.start);
1217
1271
  if (ts_sz > 0) {
1218
- start_without_ts = StripTimestampFromUserKey(*start, ts_sz);
1219
- read_options.iterate_lower_bound = &(*start_without_ts);
1272
+ boundaries.start_without_ts =
1273
+ StripTimestampFromUserKey(*boundaries.start, ts_sz);
1274
+ read_options.iterate_lower_bound = &(*boundaries.start_without_ts);
1220
1275
  }
1221
1276
  }
1222
- if (end.has_value()) {
1223
- read_options.iterate_upper_bound = &(*end);
1277
+ if (boundaries.end.has_value()) {
1278
+ read_options.iterate_upper_bound = &(*boundaries.end);
1224
1279
  if (ts_sz > 0) {
1225
- end_without_ts = StripTimestampFromUserKey(*end, ts_sz);
1226
- read_options.iterate_upper_bound = &(*end_without_ts);
1280
+ boundaries.end_without_ts =
1281
+ StripTimestampFromUserKey(*boundaries.end, ts_sz);
1282
+ read_options.iterate_upper_bound = &(*boundaries.end_without_ts);
1227
1283
  }
1228
1284
  }
1285
+ }
1286
+
1287
+ InternalIterator* CompactionJob::CreateInputIterator(
1288
+ SubcompactionState* sub_compact, ColumnFamilyData* cfd,
1289
+ SubcompactionInternalIterators& iterators,
1290
+ SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options) {
1291
+ // This is assigned after creation of SubcompactionState to simplify that
1292
+ // creation across both CompactionJob and CompactionServiceCompactionJob
1293
+ sub_compact->AssignRangeDelAggregator(
1294
+ std::make_unique<CompactionRangeDelAggregator>(
1295
+ &cfd->internal_comparator(), job_context_->snapshot_seqs,
1296
+ &full_history_ts_low_, &trim_ts_));
1297
+
1298
+ InitializeReadOptions(cfd, read_options, boundaries);
1229
1299
 
1230
1300
  // Although the v2 aggregator is what the level iterator(s) know about,
1231
1301
  // the AddTombstones calls will be propagated down to the v1 aggregator.
1232
- std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
1233
- read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
1234
- file_options_for_read_, start, end));
1235
- InternalIterator* input = raw_input.get();
1236
-
1237
- IterKey start_ikey;
1238
- IterKey end_ikey;
1239
- Slice start_slice;
1240
- Slice end_slice;
1241
- Slice start_user_key{};
1242
- Slice end_user_key{};
1243
-
1244
- static constexpr char kMaxTs[] =
1245
- "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
1246
- Slice ts_slice;
1247
- std::string max_ts;
1302
+ iterators.raw_input =
1303
+ std::unique_ptr<InternalIterator>(versions_->MakeInputIterator(
1304
+ read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
1305
+ file_options_for_read_, boundaries.start, boundaries.end));
1306
+ InternalIterator* input = iterators.raw_input.get();
1307
+
1308
+ const size_t ts_sz = cfd->user_comparator()->timestamp_size();
1248
1309
  if (ts_sz > 0) {
1249
- if (ts_sz <= strlen(kMaxTs)) {
1250
- ts_slice = Slice(kMaxTs, ts_sz);
1310
+ if (ts_sz <= strlen(boundaries.kMaxTs)) {
1311
+ boundaries.ts_slice = Slice(boundaries.kMaxTs, ts_sz);
1251
1312
  } else {
1252
- max_ts = std::string(ts_sz, '\xff');
1253
- ts_slice = Slice(max_ts);
1313
+ boundaries.max_ts = std::string(ts_sz, '\xff');
1314
+ boundaries.ts_slice = Slice(boundaries.max_ts);
1254
1315
  }
1255
1316
  }
1256
1317
 
1257
- if (start.has_value()) {
1258
- start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
1318
+ if (boundaries.start.has_value()) {
1319
+ boundaries.start_ikey.SetInternalKey(*boundaries.start, kMaxSequenceNumber,
1320
+ kValueTypeForSeek);
1259
1321
  if (ts_sz > 0) {
1260
- start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
1261
- &ts_slice);
1322
+ boundaries.start_ikey.UpdateInternalKey(
1323
+ kMaxSequenceNumber, kValueTypeForSeek, &boundaries.ts_slice);
1262
1324
  }
1263
- start_slice = start_ikey.GetInternalKey();
1264
- start_user_key = start_ikey.GetUserKey();
1325
+ boundaries.start_internal_key = boundaries.start_ikey.GetInternalKey();
1326
+ boundaries.start_user_key = boundaries.start_ikey.GetUserKey();
1265
1327
  }
1266
- if (end.has_value()) {
1267
- end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek);
1328
+ if (boundaries.end.has_value()) {
1329
+ boundaries.end_ikey.SetInternalKey(*boundaries.end, kMaxSequenceNumber,
1330
+ kValueTypeForSeek);
1268
1331
  if (ts_sz > 0) {
1269
- end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
1270
- &ts_slice);
1332
+ boundaries.end_ikey.UpdateInternalKey(
1333
+ kMaxSequenceNumber, kValueTypeForSeek, &boundaries.ts_slice);
1271
1334
  }
1272
- end_slice = end_ikey.GetInternalKey();
1273
- end_user_key = end_ikey.GetUserKey();
1335
+ boundaries.end_internal_key = boundaries.end_ikey.GetInternalKey();
1336
+ boundaries.end_user_key = boundaries.end_ikey.GetUserKey();
1274
1337
  }
1275
1338
 
1276
- std::unique_ptr<InternalIterator> clip;
1277
- if (start.has_value() || end.has_value()) {
1278
- clip = std::make_unique<ClippingIterator>(
1279
- raw_input.get(), start.has_value() ? &start_slice : nullptr,
1280
- end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
1281
- input = clip.get();
1339
+ if (boundaries.start.has_value() || boundaries.end.has_value()) {
1340
+ iterators.clip = std::make_unique<ClippingIterator>(
1341
+ iterators.raw_input.get(),
1342
+ boundaries.start.has_value() ? &boundaries.start_internal_key : nullptr,
1343
+ boundaries.end.has_value() ? &boundaries.end_internal_key : nullptr,
1344
+ &cfd->internal_comparator());
1345
+ input = iterators.clip.get();
1282
1346
  }
1283
1347
 
1284
- std::unique_ptr<InternalIterator> blob_counter;
1285
-
1286
1348
  if (sub_compact->compaction->DoesInputReferenceBlobFiles()) {
1287
1349
  BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter();
1288
- blob_counter = std::make_unique<BlobCountingIterator>(input, meter);
1289
- input = blob_counter.get();
1350
+ iterators.blob_counter =
1351
+ std::make_unique<BlobCountingIterator>(input, meter);
1352
+ input = iterators.blob_counter.get();
1290
1353
  }
1291
1354
 
1292
- std::unique_ptr<InternalIterator> trim_history_iter;
1293
1355
  if (ts_sz > 0 && !trim_ts_.empty()) {
1294
- trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
1356
+ iterators.trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
1295
1357
  input, cfd->user_comparator(), trim_ts_);
1296
- input = trim_history_iter.get();
1358
+ input = iterators.trim_history_iter.get();
1297
1359
  }
1298
1360
 
1299
- input->SeekToFirst();
1300
-
1301
- AutoThreadOperationStageUpdater stage_updater(
1302
- ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
1303
-
1304
- // I/O measurement variables
1305
- PerfLevel prev_perf_level = PerfLevel::kEnableTime;
1306
- const uint64_t kRecordStatsEvery = 1000;
1307
- uint64_t prev_write_nanos = 0;
1308
- uint64_t prev_fsync_nanos = 0;
1309
- uint64_t prev_range_sync_nanos = 0;
1310
- uint64_t prev_prepare_write_nanos = 0;
1311
- uint64_t prev_cpu_write_nanos = 0;
1312
- uint64_t prev_cpu_read_nanos = 0;
1313
- if (measure_io_stats_) {
1314
- prev_perf_level = GetPerfLevel();
1315
- SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
1316
- prev_write_nanos = IOSTATS(write_nanos);
1317
- prev_fsync_nanos = IOSTATS(fsync_nanos);
1318
- prev_range_sync_nanos = IOSTATS(range_sync_nanos);
1319
- prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
1320
- prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
1321
- prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
1322
- }
1323
-
1324
- MergeHelper merge(
1325
- env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(),
1326
- compaction_filter, db_options_.info_log.get(),
1327
- false /* internal key corruption is expected */,
1328
- job_context_->GetLatestSnapshotSequence(), job_context_->snapshot_checker,
1329
- compact_->compaction->level(), db_options_.stats);
1361
+ return input;
1362
+ }
1330
1363
 
1364
+ void CompactionJob::CreateBlobFileBuilder(SubcompactionState* sub_compact,
1365
+ ColumnFamilyData* cfd,
1366
+ BlobFileResources& blob_resources,
1367
+ const WriteOptions& write_options) {
1331
1368
  const auto& mutable_cf_options =
1332
1369
  sub_compact->compaction->mutable_cf_options();
1333
1370
 
1334
- std::vector<std::string> blob_file_paths;
1335
-
1336
1371
  // TODO: BlobDB to support output_to_proximal_level compaction, which needs
1337
1372
  // 2 builders, so may need to move to `CompactionOutputs`
1338
- std::unique_ptr<BlobFileBuilder> blob_file_builder(
1339
- (mutable_cf_options.enable_blob_files &&
1340
- sub_compact->compaction->output_level() >=
1341
- mutable_cf_options.blob_file_starting_level)
1342
- ? new BlobFileBuilder(
1343
- versions_, fs_.get(),
1344
- &sub_compact->compaction->immutable_options(),
1345
- &mutable_cf_options, &file_options_, &write_options, db_id_,
1346
- db_session_id_, job_id_, cfd->GetID(), cfd->GetName(),
1347
- write_hint_, io_tracer_, blob_callback_,
1348
- BlobFileCreationReason::kCompaction, &blob_file_paths,
1349
- sub_compact->Current().GetBlobFileAdditionsPtr())
1350
- : nullptr);
1373
+ if (mutable_cf_options.enable_blob_files &&
1374
+ sub_compact->compaction->output_level() >=
1375
+ mutable_cf_options.blob_file_starting_level) {
1376
+ blob_resources.blob_file_builder = std::make_unique<BlobFileBuilder>(
1377
+ versions_, fs_.get(), &sub_compact->compaction->immutable_options(),
1378
+ &mutable_cf_options, &file_options_, &write_options, db_id_,
1379
+ db_session_id_, job_id_, cfd->GetID(), cfd->GetName(), write_hint_,
1380
+ io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction,
1381
+ &blob_resources.blob_file_paths,
1382
+ sub_compact->Current().GetBlobFileAdditionsPtr());
1383
+ } else {
1384
+ blob_resources.blob_file_builder = nullptr;
1385
+ }
1386
+ }
1351
1387
 
1352
- TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
1353
- TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:1",
1354
- static_cast<void*>(const_cast<std::atomic<bool>*>(
1355
- &manual_compaction_canceled_)));
1388
+ std::unique_ptr<CompactionIterator> CompactionJob::CreateCompactionIterator(
1389
+ SubcompactionState* sub_compact, ColumnFamilyData* cfd,
1390
+ InternalIterator* input, const CompactionFilter* compaction_filter,
1391
+ MergeHelper& merge, BlobFileResources& blob_resources,
1392
+ const WriteOptions& write_options) {
1393
+ CreateBlobFileBuilder(sub_compact, cfd, blob_resources, write_options);
1356
1394
 
1357
1395
  const std::string* const full_history_ts_low =
1358
1396
  full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
1359
- const SequenceNumber job_snapshot_seq =
1360
- job_context_ ? job_context_->GetJobSnapshotSequence()
1361
- : kMaxSequenceNumber;
1397
+ assert(job_context_);
1362
1398
 
1363
- auto c_iter = std::make_unique<CompactionIterator>(
1399
+ return std::make_unique<CompactionIterator>(
1364
1400
  input, cfd->user_comparator(), &merge, versions_->LastSequence(),
1365
1401
  &(job_context_->snapshot_seqs), earliest_snapshot_,
1366
- job_context_->earliest_write_conflict_snapshot, job_snapshot_seq,
1367
- job_context_->snapshot_checker, env_,
1368
- ShouldReportDetailedTime(env_, stats_),
1369
- /*expect_valid_internal_key=*/true, sub_compact->RangeDelAgg(),
1370
- blob_file_builder.get(), db_options_.allow_data_in_errors,
1402
+ job_context_->earliest_write_conflict_snapshot,
1403
+ job_context_->GetJobSnapshotSequence(), job_context_->snapshot_checker,
1404
+ env_, ShouldReportDetailedTime(env_, stats_), sub_compact->RangeDelAgg(),
1405
+ blob_resources.blob_file_builder.get(), db_options_.allow_data_in_errors,
1371
1406
  db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
1372
- sub_compact->compaction
1373
- ->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
1407
+ sub_compact->compaction->DoesInputReferenceBlobFiles(),
1374
1408
  sub_compact->compaction, compaction_filter, shutting_down_,
1375
1409
  db_options_.info_log, full_history_ts_low, preserve_seqno_after_);
1376
- c_iter->SeekToFirst();
1377
-
1378
- const auto& c_iter_stats = c_iter->iter_stats();
1410
+ }
1379
1411
 
1380
- // define the open and close functions for the compaction files, which will be
1381
- // used open/close output files when needed.
1412
+ std::pair<CompactionFileOpenFunc, CompactionFileCloseFunc>
1413
+ CompactionJob::CreateFileHandlers(SubcompactionState* sub_compact,
1414
+ SubcompactionKeyBoundaries& boundaries) {
1382
1415
  const CompactionFileOpenFunc open_file_func =
1383
1416
  [this, sub_compact](CompactionOutputs& outputs) {
1384
1417
  return this->OpenCompactionOutputFile(sub_compact, outputs);
1385
1418
  };
1386
1419
 
1420
+ const Slice* start_user_key =
1421
+ sub_compact->start.has_value() ? &boundaries.start_user_key : nullptr;
1422
+ const Slice* end_user_key =
1423
+ sub_compact->end.has_value() ? &boundaries.end_user_key : nullptr;
1424
+
1387
1425
  const CompactionFileCloseFunc close_file_func =
1388
1426
  [this, sub_compact, start_user_key, end_user_key](
1389
1427
  CompactionOutputs& outputs, const Status& status,
1390
1428
  const Slice& next_table_min_key) {
1391
- return this->FinishCompactionOutputFile(
1392
- status, sub_compact, outputs, next_table_min_key,
1393
- sub_compact->start.has_value() ? &start_user_key : nullptr,
1394
- sub_compact->end.has_value() ? &end_user_key : nullptr);
1429
+ return this->FinishCompactionOutputFile(status, sub_compact, outputs,
1430
+ next_table_min_key,
1431
+ start_user_key, end_user_key);
1395
1432
  };
1396
1433
 
1434
+ return {open_file_func, close_file_func};
1435
+ }
1436
+
1437
+ Status CompactionJob::ProcessKeyValue(
1438
+ SubcompactionState* sub_compact, ColumnFamilyData* cfd,
1439
+ CompactionIterator* c_iter, const CompactionFileOpenFunc& open_file_func,
1440
+ const CompactionFileCloseFunc& close_file_func, uint64_t& prev_cpu_micros) {
1397
1441
  Status status;
1442
+ const uint64_t kRecordStatsEvery = 1000;
1443
+ [[maybe_unused]] const std::optional<const Slice> end = sub_compact->end;
1444
+
1398
1445
  TEST_SYNC_POINT_CALLBACK(
1399
1446
  "CompactionJob::ProcessKeyValueCompaction()::Processing",
1400
1447
  static_cast<void*>(const_cast<Compaction*>(sub_compact->compaction)));
1401
- uint64_t last_cpu_micros = prev_cpu_micros;
1402
- while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
1403
- // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
1404
- // returns true.
1448
+
1449
+ while (status.ok() && !cfd->IsDropped() && c_iter->Valid() &&
1450
+ c_iter->status().ok()) {
1405
1451
  assert(!end.has_value() ||
1406
1452
  cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0);
1407
1453
 
1408
- if (c_iter_stats.num_input_records % kRecordStatsEvery ==
1454
+ if (c_iter->iter_stats().num_input_records % kRecordStatsEvery ==
1409
1455
  kRecordStatsEvery - 1) {
1410
- RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
1411
- c_iter->ResetRecordCounts();
1412
- RecordCompactionIOStats();
1413
-
1414
- uint64_t cur_cpu_micros = db_options_.clock->CPUMicros();
1415
- assert(cur_cpu_micros >= last_cpu_micros);
1416
- RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME,
1417
- cur_cpu_micros - last_cpu_micros);
1418
- last_cpu_micros = cur_cpu_micros;
1456
+ UpdateSubcompactionJobStatsIncrementally(
1457
+ c_iter, &sub_compact->compaction_job_stats,
1458
+ db_options_.clock->CPUMicros(), prev_cpu_micros);
1419
1459
  }
1420
1460
 
1421
1461
  const auto& ikey = c_iter->ikey();
1422
1462
  bool use_proximal_output = ikey.sequence > proximal_after_seqno_;
1463
+
1423
1464
  #ifndef NDEBUG
1424
1465
  if (sub_compact->compaction->SupportsPerKeyPlacement()) {
1425
- // Could be overridden by unittest
1426
1466
  PerKeyPlacementContext context(sub_compact->compaction->output_level(),
1427
1467
  ikey.user_key, c_iter->value(),
1428
1468
  ikey.sequence, use_proximal_output);
@@ -1461,9 +1501,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1461
1501
  static_cast<void*>(const_cast<std::atomic<bool>*>(
1462
1502
  &manual_compaction_canceled_)));
1463
1503
  c_iter->Next();
1464
- if (c_iter->status().IsManualCompactionPaused()) {
1465
- break;
1466
- }
1467
1504
 
1468
1505
  #ifndef NDEBUG
1469
1506
  bool stop = false;
@@ -1475,13 +1512,33 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1475
1512
  #endif // NDEBUG
1476
1513
  }
1477
1514
 
1478
- // This number may not be accurate when CompactionIterator was created
1479
- // with `must_count_input_entries=false`.
1515
+ return status;
1516
+ }
1517
+
1518
+ void CompactionJob::UpdateSubcompactionJobStatsIncrementally(
1519
+ CompactionIterator* c_iter, CompactionJobStats* compaction_job_stats,
1520
+ uint64_t cur_cpu_micros, uint64_t& prev_cpu_micros) {
1521
+ RecordDroppedKeys(c_iter->iter_stats(), compaction_job_stats);
1522
+ c_iter->ResetRecordCounts();
1523
+ RecordCompactionIOStats();
1524
+
1525
+ assert(cur_cpu_micros >= prev_cpu_micros);
1526
+ RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME,
1527
+ cur_cpu_micros - prev_cpu_micros);
1528
+ prev_cpu_micros = cur_cpu_micros;
1529
+ }
1530
+
1531
+ void CompactionJob::FinalizeSubcompactionJobStats(
1532
+ SubcompactionState* sub_compact, CompactionIterator* c_iter,
1533
+ uint64_t start_cpu_micros, uint64_t prev_cpu_micros,
1534
+ const CompactionIOStatsSnapshot& io_stats) {
1535
+ const CompactionIterationStats& c_iter_stats = c_iter->iter_stats();
1536
+
1480
1537
  assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() ||
1481
1538
  c_iter->HasNumInputEntryScanned());
1482
- sub_compact->compaction_job_stats.has_num_input_records =
1539
+ sub_compact->compaction_job_stats.has_accurate_num_input_records &=
1483
1540
  c_iter->HasNumInputEntryScanned();
1484
- sub_compact->compaction_job_stats.num_input_records =
1541
+ sub_compact->compaction_job_stats.num_input_records +=
1485
1542
  c_iter->NumInputEntryScanned();
1486
1543
  sub_compact->compaction_job_stats.num_blobs_read =
1487
1544
  c_iter_stats.num_blobs_read;
@@ -1512,84 +1569,188 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1512
1569
  c_iter_stats.total_blob_bytes_relocated);
1513
1570
  }
1514
1571
 
1515
- RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
1516
- RecordCompactionIOStats();
1572
+ uint64_t cur_cpu_micros = db_options_.clock->CPUMicros();
1573
+
1574
+ // Record final compaction statistics including dropped keys, I/O stats,
1575
+ // and CPU time delta from the last periodic measurement
1576
+ UpdateSubcompactionJobStatsIncrementally(c_iter,
1577
+ &sub_compact->compaction_job_stats,
1578
+ cur_cpu_micros, prev_cpu_micros);
1579
+
1580
+ // Finalize timing and I/O statistics
1581
+ sub_compact->compaction_job_stats.cpu_micros =
1582
+ cur_cpu_micros - start_cpu_micros + sub_compact->GetWorkerCPUMicros();
1517
1583
 
1584
+ if (measure_io_stats_) {
1585
+ sub_compact->compaction_job_stats.file_write_nanos +=
1586
+ IOSTATS(write_nanos) - io_stats.prev_write_nanos;
1587
+ sub_compact->compaction_job_stats.file_fsync_nanos +=
1588
+ IOSTATS(fsync_nanos) - io_stats.prev_fsync_nanos;
1589
+ sub_compact->compaction_job_stats.file_range_sync_nanos +=
1590
+ IOSTATS(range_sync_nanos) - io_stats.prev_range_sync_nanos;
1591
+ sub_compact->compaction_job_stats.file_prepare_write_nanos +=
1592
+ IOSTATS(prepare_write_nanos) - io_stats.prev_prepare_write_nanos;
1593
+ sub_compact->compaction_job_stats.cpu_micros -=
1594
+ (IOSTATS(cpu_write_nanos) - io_stats.prev_cpu_write_nanos +
1595
+ IOSTATS(cpu_read_nanos) - io_stats.prev_cpu_read_nanos) /
1596
+ 1000;
1597
+ if (io_stats.prev_perf_level !=
1598
+ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
1599
+ SetPerfLevel(io_stats.prev_perf_level);
1600
+ }
1601
+ }
1602
+ }
1603
+
1604
+ Status CompactionJob::FinalizeProcessKeyValueStatus(
1605
+ ColumnFamilyData* cfd, InternalIterator* input_iter,
1606
+ CompactionIterator* c_iter, Status status) {
1518
1607
  if (status.ok() && cfd->IsDropped()) {
1519
1608
  status =
1520
1609
  Status::ColumnFamilyDropped("Column family dropped during compaction");
1521
1610
  }
1522
- if ((status.ok() || status.IsColumnFamilyDropped()) &&
1523
- shutting_down_->load(std::memory_order_relaxed)) {
1611
+ if (status.ok() && shutting_down_->load(std::memory_order_relaxed)) {
1524
1612
  status = Status::ShutdownInProgress("Database shutdown");
1525
1613
  }
1526
- if ((status.ok() || status.IsColumnFamilyDropped()) &&
1614
+ if (status.ok() &&
1527
1615
  (manual_compaction_canceled_.load(std::memory_order_relaxed))) {
1528
1616
  status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
1529
1617
  }
1530
1618
  if (status.ok()) {
1531
- status = input->status();
1619
+ status = input_iter->status();
1532
1620
  }
1533
1621
  if (status.ok()) {
1534
1622
  status = c_iter->status();
1535
1623
  }
1536
1624
 
1625
+ return status;
1626
+ }
1627
+
1628
+ Status CompactionJob::CleanupCompactionFiles(
1629
+ SubcompactionState* sub_compact, Status status,
1630
+ const CompactionFileOpenFunc& open_file_func,
1631
+ const CompactionFileCloseFunc& close_file_func) {
1537
1632
  // Call FinishCompactionOutputFile() even if status is not ok: it needs to
1538
1633
  // close the output files. Open file function is also passed, in case there's
1539
1634
  // only range-dels, no file was opened, to save the range-dels, it need to
1540
1635
  // create a new output file.
1541
- status = sub_compact->CloseCompactionFiles(status, open_file_func,
1542
- close_file_func);
1636
+ return sub_compact->CloseCompactionFiles(status, open_file_func,
1637
+ close_file_func);
1638
+ }
1543
1639
 
1640
+ Status CompactionJob::FinalizeBlobFiles(SubcompactionState* sub_compact,
1641
+ BlobFileBuilder* blob_file_builder,
1642
+ Status status) {
1544
1643
  if (blob_file_builder) {
1545
1644
  if (status.ok()) {
1546
1645
  status = blob_file_builder->Finish();
1547
1646
  } else {
1548
1647
  blob_file_builder->Abandon(status);
1549
1648
  }
1550
- blob_file_builder.reset();
1551
1649
  sub_compact->Current().UpdateBlobStats();
1552
1650
  }
1553
1651
 
1554
- uint64_t cur_cpu_micros = db_options_.clock->CPUMicros();
1555
- sub_compact->compaction_job_stats.cpu_micros =
1556
- cur_cpu_micros - prev_cpu_micros;
1557
- RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME,
1558
- cur_cpu_micros - last_cpu_micros);
1652
+ return status;
1653
+ }
1559
1654
 
1560
- if (measure_io_stats_) {
1561
- sub_compact->compaction_job_stats.file_write_nanos +=
1562
- IOSTATS(write_nanos) - prev_write_nanos;
1563
- sub_compact->compaction_job_stats.file_fsync_nanos +=
1564
- IOSTATS(fsync_nanos) - prev_fsync_nanos;
1565
- sub_compact->compaction_job_stats.file_range_sync_nanos +=
1566
- IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
1567
- sub_compact->compaction_job_stats.file_prepare_write_nanos +=
1568
- IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
1569
- sub_compact->compaction_job_stats.cpu_micros -=
1570
- (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
1571
- IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
1572
- 1000;
1573
- if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
1574
- SetPerfLevel(prev_perf_level);
1575
- }
1655
+ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1656
+ assert(sub_compact);
1657
+ assert(sub_compact->compaction);
1658
+
1659
+ if (!ShouldUseLocalCompaction(sub_compact)) {
1660
+ return;
1576
1661
  }
1662
+
1663
+ AutoThreadOperationStageUpdater stage_updater(
1664
+ ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
1665
+
1666
+ const uint64_t start_cpu_micros = db_options_.clock->CPUMicros();
1667
+ uint64_t prev_cpu_micros = start_cpu_micros;
1668
+ const CompactionIOStatsSnapshot io_stats = InitializeIOStats();
1669
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
1670
+ const CompactionFilter* compaction_filter;
1671
+ std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
1672
+ Status filter_status = SetupAndValidateCompactionFilter(
1673
+ sub_compact, cfd->ioptions().compaction_filter, compaction_filter,
1674
+ compaction_filter_from_factory);
1675
+ if (!filter_status.ok()) {
1676
+ sub_compact->status = filter_status;
1677
+ return;
1678
+ }
1679
+
1680
+ NotifyOnSubcompactionBegin(sub_compact);
1681
+
1682
+ SubcompactionKeyBoundaries boundaries(sub_compact->start, sub_compact->end);
1683
+ SubcompactionInternalIterators iterators;
1684
+ ReadOptions read_options;
1685
+ const WriteOptions write_options(Env::IOPriority::IO_LOW,
1686
+ Env::IOActivity::kCompaction);
1687
+ MergeHelper merge(
1688
+ env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(),
1689
+ compaction_filter, db_options_.info_log.get(),
1690
+ false /* internal key corruption is expected */,
1691
+ job_context_->GetLatestSnapshotSequence(), job_context_->snapshot_checker,
1692
+ compact_->compaction->level(), db_options_.stats);
1693
+ BlobFileResources blob_resources;
1694
+
1695
+ InternalIterator* input_iter = CreateInputIterator(
1696
+ sub_compact, cfd, iterators, boundaries, read_options);
1697
+ assert(input_iter);
1698
+ input_iter->SeekToFirst();
1699
+
1700
+ auto c_iter =
1701
+ CreateCompactionIterator(sub_compact, cfd, input_iter, compaction_filter,
1702
+ merge, blob_resources, write_options);
1703
+ assert(c_iter);
1704
+ c_iter->SeekToFirst();
1705
+
1706
+ TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
1707
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:1",
1708
+ static_cast<void*>(const_cast<std::atomic<bool>*>(
1709
+ &manual_compaction_canceled_)));
1710
+
1711
+ auto [open_file_func, close_file_func] =
1712
+ CreateFileHandlers(sub_compact, boundaries);
1713
+
1714
+ Status status =
1715
+ ProcessKeyValue(sub_compact, cfd, c_iter.get(), open_file_func,
1716
+ close_file_func, prev_cpu_micros);
1717
+
1718
+ status = FinalizeProcessKeyValueStatus(cfd, input_iter, c_iter.get(), status);
1719
+
1720
+ FinalizeSubcompaction(sub_compact, status, open_file_func, close_file_func,
1721
+ blob_resources.blob_file_builder.get(), c_iter.get(),
1722
+ input_iter, start_cpu_micros, prev_cpu_micros,
1723
+ io_stats);
1724
+
1725
+ NotifyOnSubcompactionCompleted(sub_compact);
1726
+ }
1727
+
1728
+ void CompactionJob::FinalizeSubcompaction(
1729
+ SubcompactionState* sub_compact, Status status,
1730
+ const CompactionFileOpenFunc& open_file_func,
1731
+ const CompactionFileCloseFunc& close_file_func,
1732
+ BlobFileBuilder* blob_file_builder, CompactionIterator* c_iter,
1733
+ [[maybe_unused]] InternalIterator* input_iter, uint64_t start_cpu_micros,
1734
+ uint64_t prev_cpu_micros, const CompactionIOStatsSnapshot& io_stats) {
1735
+ status = CleanupCompactionFiles(sub_compact, status, open_file_func,
1736
+ close_file_func);
1737
+ status = FinalizeBlobFiles(sub_compact, blob_file_builder, status);
1738
+
1739
+ FinalizeSubcompactionJobStats(sub_compact, c_iter, start_cpu_micros,
1740
+ prev_cpu_micros, io_stats);
1741
+
1577
1742
  #ifdef ROCKSDB_ASSERT_STATUS_CHECKED
1578
1743
  if (!status.ok()) {
1579
1744
  if (c_iter) {
1580
1745
  c_iter->status().PermitUncheckedError();
1581
1746
  }
1582
- if (input) {
1583
- input->status().PermitUncheckedError();
1747
+ if (input_iter) {
1748
+ input_iter->status().PermitUncheckedError();
1584
1749
  }
1585
1750
  }
1586
1751
  #endif // ROCKSDB_ASSERT_STATUS_CHECKED
1587
1752
 
1588
- blob_counter.reset();
1589
- clip.reset();
1590
- raw_input.reset();
1591
1753
  sub_compact->status = status;
1592
- NotifyOnSubcompactionCompleted(sub_compact);
1593
1754
  }
1594
1755
 
1595
1756
  uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const {
@@ -2106,7 +2267,7 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
2106
2267
  }
2107
2268
  } // namespace
2108
2269
 
2109
- bool CompactionJob::BuildStatsFromInputTableProperties(
2270
+ bool CompactionJob::UpdateInternalStatsFromInputFiles(
2110
2271
  uint64_t* num_input_range_del) {
2111
2272
  assert(compact_);
2112
2273
 
@@ -2189,7 +2350,7 @@ bool CompactionJob::BuildStatsFromInputTableProperties(
2189
2350
  return !has_error;
2190
2351
  }
2191
2352
 
2192
- void CompactionJob::UpdateCompactionJobInputStats(
2353
+ void CompactionJob::UpdateCompactionJobInputStatsFromInternalStats(
2193
2354
  const InternalStats::CompactionStatsFull& internal_stats,
2194
2355
  uint64_t num_input_range_del) const {
2195
2356
  assert(job_stats_);
@@ -2242,7 +2403,7 @@ void CompactionJob::UpdateCompactionJobInputStats(
2242
2403
  }
2243
2404
  }
2244
2405
 
2245
- void CompactionJob::UpdateCompactionJobOutputStats(
2406
+ void CompactionJob::UpdateCompactionJobOutputStatsFromInternalStats(
2246
2407
  const InternalStats::CompactionStatsFull& internal_stats) const {
2247
2408
  assert(job_stats_);
2248
2409
  job_stats_->elapsed_micros = internal_stats.output_level_stats.micros;
@@ -2378,6 +2539,11 @@ Status CompactionJob::VerifyInputRecordCount(
2378
2539
  "number of keys processed. Expected " +
2379
2540
  std::to_string(expected) + " but processed " +
2380
2541
  std::to_string(actual) + ". Compaction summary: " + scratch;
2542
+ ROCKS_LOG_WARN(
2543
+ db_options_.info_log,
2544
+ "[%s] [JOB %d] VerifyInputRecordCount() Status: %s",
2545
+ compact_->compaction->column_family_data()->GetName().c_str(),
2546
+ job_context_->job_id, msg.c_str());
2381
2547
  if (db_options_.compaction_verify_record_count) {
2382
2548
  return Status::Corruption(msg);
2383
2549
  }
@@ -2386,4 +2552,38 @@ Status CompactionJob::VerifyInputRecordCount(
2386
2552
  return Status::OK();
2387
2553
  }
2388
2554
 
2555
+ Status CompactionJob::VerifyOutputRecordCount() const {
2556
+ uint64_t total_output_num = 0;
2557
+ for (const auto& state : compact_->sub_compact_states) {
2558
+ for (const auto& output : state.GetOutputs()) {
2559
+ total_output_num += output.table_properties->num_entries -
2560
+ output.table_properties->num_range_deletions;
2561
+ }
2562
+ }
2563
+
2564
+ uint64_t expected = internal_stats_.output_level_stats.num_output_records;
2565
+ if (internal_stats_.has_proximal_level_output) {
2566
+ expected += internal_stats_.proximal_level_stats.num_output_records;
2567
+ }
2568
+ if (expected != total_output_num) {
2569
+ char scratch[2345];
2570
+ compact_->compaction->Summary(scratch, sizeof(scratch));
2571
+ std::string msg =
2572
+ "Number of keys in compaction output SST files does not match "
2573
+ "number of keys added. Expected " +
2574
+ std::to_string(expected) + " but there are " +
2575
+ std::to_string(total_output_num) +
2576
+ " in output SST files. Compaction summary: " + scratch;
2577
+ ROCKS_LOG_WARN(
2578
+ db_options_.info_log,
2579
+ "[%s] [JOB %d] VerifyOutputRecordCount() status: %s",
2580
+ compact_->compaction->column_family_data()->GetName().c_str(),
2581
+ job_context_->job_id, msg.c_str());
2582
+ if (db_options_.compaction_verify_record_count) {
2583
+ return Status::Corruption(msg);
2584
+ }
2585
+ }
2586
+ return Status::OK();
2587
+ }
2588
+
2389
2589
  } // namespace ROCKSDB_NAMESPACE