@nxtedition/rocksdb 8.2.8 → 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359) hide show
  1. package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -1
  2. package/deps/rocksdb/rocksdb/Makefile +22 -19
  3. package/deps/rocksdb/rocksdb/TARGETS +8 -0
  4. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +157 -61
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +43 -92
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +632 -455
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +244 -149
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +41 -13
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +11 -1
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +216 -17
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +7 -5
  12. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +279 -199
  13. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +2 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +159 -8
  15. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +28 -2
  16. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +1 -1
  17. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +8 -0
  18. package/deps/rocksdb/rocksdb/crash_test.mk +14 -0
  19. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +1 -1
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -1
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +2 -2
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +1 -1
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +18 -21
  25. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +1 -2
  26. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +1 -1
  27. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +2 -3
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +1 -1
  29. package/deps/rocksdb/rocksdb/db/builder.cc +32 -7
  30. package/deps/rocksdb/rocksdb/db/c.cc +169 -6
  31. package/deps/rocksdb/rocksdb/db/c_test.c +104 -6
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +98 -47
  33. package/deps/rocksdb/rocksdb/db/column_family.h +25 -2
  34. package/deps/rocksdb/rocksdb/db/column_family_test.cc +213 -2
  35. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -1
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +93 -23
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +33 -9
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -6
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -6
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +2 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +107 -43
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -4
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -0
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +4 -2
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +25 -17
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -4
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -11
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +29 -4
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +24 -31
  50. package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +3 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +19 -19
  52. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +2 -1
  53. package/deps/rocksdb/rocksdb/db/convenience.cc +20 -3
  54. package/deps/rocksdb/rocksdb/db/convenience_impl.h +15 -0
  55. package/deps/rocksdb/rocksdb/db/corruption_test.cc +17 -0
  56. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +1 -0
  57. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +17 -3
  58. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +5 -0
  59. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +15 -15
  60. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +666 -44
  61. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +2 -29
  62. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +274 -1
  63. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +40 -19
  64. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +6 -5
  65. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +250 -116
  66. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +51 -23
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +354 -96
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +6 -3
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +2 -1
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +5 -0
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +50 -21
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +26 -13
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +13 -5
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +61 -21
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +8 -87
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +7 -1
  77. package/deps/rocksdb/rocksdb/db/db_iter.cc +2 -2
  78. package/deps/rocksdb/rocksdb/db/db_iter.h +1 -0
  79. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +4 -11
  80. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +6 -6
  81. package/deps/rocksdb/rocksdb/db/db_options_test.cc +39 -29
  82. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +26 -36
  83. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +106 -0
  84. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +12 -3
  85. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +1 -1
  86. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +1 -0
  87. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +279 -166
  88. package/deps/rocksdb/rocksdb/db/db_test.cc +48 -21
  89. package/deps/rocksdb/rocksdb/db/db_test2.cc +81 -12
  90. package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
  91. package/deps/rocksdb/rocksdb/db/db_test_util.h +40 -0
  92. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +13 -1
  93. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +233 -0
  94. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +143 -0
  95. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +6 -6
  96. package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -2
  97. package/deps/rocksdb/rocksdb/db/dbformat.cc +36 -0
  98. package/deps/rocksdb/rocksdb/db/dbformat.h +169 -20
  99. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +129 -0
  100. package/deps/rocksdb/rocksdb/db/error_handler.cc +16 -0
  101. package/deps/rocksdb/rocksdb/db/error_handler.h +6 -3
  102. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +4 -4
  103. package/deps/rocksdb/rocksdb/db/event_helpers.cc +4 -0
  104. package/deps/rocksdb/rocksdb/db/experimental.cc +2 -1
  105. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +4 -4
  106. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +17 -8
  107. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +86 -4
  108. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +1 -1
  109. package/deps/rocksdb/rocksdb/db/file_indexer.cc +2 -4
  110. package/deps/rocksdb/rocksdb/db/flush_job.cc +101 -11
  111. package/deps/rocksdb/rocksdb/db/flush_job.h +24 -1
  112. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +88 -11
  113. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +2 -3
  114. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +159 -91
  115. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +19 -10
  116. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +143 -0
  117. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -1
  118. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  119. package/deps/rocksdb/rocksdb/db/listener_test.cc +2 -1
  120. package/deps/rocksdb/rocksdb/db/log_reader.h +3 -2
  121. package/deps/rocksdb/rocksdb/db/log_test.cc +17 -21
  122. package/deps/rocksdb/rocksdb/db/log_writer.cc +1 -1
  123. package/deps/rocksdb/rocksdb/db/log_writer.h +3 -2
  124. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +4 -3
  125. package/deps/rocksdb/rocksdb/db/memtable.cc +52 -13
  126. package/deps/rocksdb/rocksdb/db/memtable.h +45 -1
  127. package/deps/rocksdb/rocksdb/db/memtable_list.cc +44 -10
  128. package/deps/rocksdb/rocksdb/db/memtable_list.h +32 -1
  129. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +90 -4
  130. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +2 -2
  131. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +1 -0
  132. package/deps/rocksdb/rocksdb/db/repair.cc +21 -4
  133. package/deps/rocksdb/rocksdb/db/repair_test.cc +143 -2
  134. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -4
  135. package/deps/rocksdb/rocksdb/db/table_cache.cc +44 -35
  136. package/deps/rocksdb/rocksdb/db/table_cache.h +6 -6
  137. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
  138. package/deps/rocksdb/rocksdb/db/version_builder.cc +0 -1
  139. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +236 -204
  140. package/deps/rocksdb/rocksdb/db/version_edit.cc +66 -4
  141. package/deps/rocksdb/rocksdb/db/version_edit.h +48 -6
  142. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +80 -8
  143. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +12 -0
  144. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +86 -17
  145. package/deps/rocksdb/rocksdb/db/version_set.cc +136 -41
  146. package/deps/rocksdb/rocksdb/db/version_set.h +28 -7
  147. package/deps/rocksdb/rocksdb/db/version_set_test.cc +25 -15
  148. package/deps/rocksdb/rocksdb/db/write_batch.cc +11 -0
  149. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +3 -0
  150. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +16 -0
  151. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -3
  152. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +2 -0
  153. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +42 -0
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +32 -3
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +7 -0
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +247 -120
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +9 -4
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +13 -6
  159. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +2 -0
  160. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +15 -27
  161. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +264 -69
  162. package/deps/rocksdb/rocksdb/env/env.cc +1 -2
  163. package/deps/rocksdb/rocksdb/env/env_encryption.cc +11 -165
  164. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +0 -17
  165. package/deps/rocksdb/rocksdb/env/env_posix.cc +6 -2
  166. package/deps/rocksdb/rocksdb/env/env_test.cc +86 -2
  167. package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
  168. package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +78 -0
  169. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +34 -0
  170. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -0
  171. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +15 -4
  172. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +52 -43
  173. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +34 -18
  174. package/deps/rocksdb/rocksdb/file/file_util.cc +10 -5
  175. package/deps/rocksdb/rocksdb/file/file_util.h +13 -1
  176. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +724 -79
  177. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +64 -33
  178. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +3 -16
  179. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +23 -12
  180. package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +3 -0
  181. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +2 -1
  182. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +153 -88
  183. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +70 -2
  184. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +50 -11
  185. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -0
  186. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +16 -2
  187. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +1 -1
  188. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +55 -8
  189. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +32 -4
  190. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +9 -109
  191. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +90 -13
  192. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +3 -0
  193. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +85 -17
  194. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +13 -1
  195. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +2 -1
  196. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +5 -1
  197. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +21 -2
  198. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +7 -1
  199. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +6 -0
  200. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +5 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +33 -2
  202. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +14 -0
  203. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +33 -2
  204. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +0 -3
  205. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  206. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +3 -0
  207. package/deps/rocksdb/rocksdb/memory/arena_test.cc +18 -11
  208. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +2 -1
  209. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +69 -34
  210. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +16 -1
  211. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +10 -0
  212. package/deps/rocksdb/rocksdb/options/cf_options.cc +19 -0
  213. package/deps/rocksdb/rocksdb/options/cf_options.h +10 -2
  214. package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -1
  215. package/deps/rocksdb/rocksdb/options/db_options.cc +7 -0
  216. package/deps/rocksdb/rocksdb/options/db_options.h +1 -0
  217. package/deps/rocksdb/rocksdb/options/options.cc +15 -1
  218. package/deps/rocksdb/rocksdb/options/options_helper.cc +6 -0
  219. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -3
  220. package/deps/rocksdb/rocksdb/options/options_test.cc +8 -0
  221. package/deps/rocksdb/rocksdb/port/mmap.h +20 -0
  222. package/deps/rocksdb/rocksdb/port/stack_trace.cc +27 -12
  223. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -1
  224. package/deps/rocksdb/rocksdb/src.mk +3 -0
  225. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  226. package/deps/rocksdb/rocksdb/table/block_based/block.cc +48 -22
  227. package/deps/rocksdb/rocksdb/table/block_based/block.h +60 -12
  228. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +115 -42
  229. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +6 -5
  230. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +60 -2
  231. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +2 -0
  232. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +62 -44
  233. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +36 -14
  234. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -15
  235. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +219 -51
  236. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +41 -8
  237. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -1
  238. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +50 -21
  239. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +11 -4
  240. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +195 -55
  241. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
  242. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +31 -16
  243. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +97 -58
  244. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
  245. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +6 -0
  246. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +27 -12
  247. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +3 -1
  248. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +114 -70
  249. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +1 -2
  250. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +9 -6
  251. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +15 -3
  252. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +6 -3
  253. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +11 -11
  254. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -0
  255. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +1 -0
  256. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +6 -2
  257. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +1 -2
  258. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +2 -3
  259. package/deps/rocksdb/rocksdb/table/format.cc +175 -33
  260. package/deps/rocksdb/rocksdb/table/format.h +63 -10
  261. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +10 -2
  262. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +12 -4
  263. package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -0
  264. package/deps/rocksdb/rocksdb/table/mock_table.cc +8 -3
  265. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +10 -5
  266. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +10 -1
  267. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +1 -2
  268. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +3 -3
  269. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +12 -3
  270. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +26 -1
  271. package/deps/rocksdb/rocksdb/table/table_builder.h +6 -2
  272. package/deps/rocksdb/rocksdb/table/table_properties.cc +6 -0
  273. package/deps/rocksdb/rocksdb/table/table_test.cc +52 -22
  274. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +19 -7
  275. package/deps/rocksdb/rocksdb/test_util/sync_point.h +3 -1
  276. package/deps/rocksdb/rocksdb/test_util/testutil.cc +29 -0
  277. package/deps/rocksdb/rocksdb/test_util/testutil.h +19 -0
  278. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +65 -26
  279. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +8 -5
  280. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -0
  281. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +1 -0
  282. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +0 -1
  283. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +4 -0
  284. package/deps/rocksdb/rocksdb/unreleased_history/README.txt +73 -0
  285. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +27 -0
  286. package/deps/rocksdb/rocksdb/unreleased_history/behavior_changes/.gitkeep +0 -0
  287. package/deps/rocksdb/rocksdb/unreleased_history/bug_fixes/.gitkeep +0 -0
  288. package/deps/rocksdb/rocksdb/unreleased_history/new_features/.gitkeep +0 -0
  289. package/deps/rocksdb/rocksdb/unreleased_history/performance_improvements/.gitkeep +0 -0
  290. package/deps/rocksdb/rocksdb/unreleased_history/public_api_changes/.gitkeep +0 -0
  291. package/deps/rocksdb/rocksdb/unreleased_history/release.sh +104 -0
  292. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +5 -0
  293. package/deps/rocksdb/rocksdb/util/bloom_impl.h +3 -3
  294. package/deps/rocksdb/rocksdb/util/cast_util.h +14 -0
  295. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -0
  296. package/deps/rocksdb/rocksdb/util/comparator.cc +29 -7
  297. package/deps/rocksdb/rocksdb/util/compression.cc +4 -4
  298. package/deps/rocksdb/rocksdb/util/compression.h +110 -32
  299. package/deps/rocksdb/rocksdb/util/core_local.h +2 -1
  300. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +4 -4
  301. package/deps/rocksdb/rocksdb/util/filelock_test.cc +3 -0
  302. package/deps/rocksdb/rocksdb/util/hash.h +7 -3
  303. package/deps/rocksdb/rocksdb/util/hash_test.cc +44 -0
  304. package/deps/rocksdb/rocksdb/util/math.h +58 -6
  305. package/deps/rocksdb/rocksdb/util/math128.h +29 -7
  306. package/deps/rocksdb/rocksdb/util/mutexlock.h +35 -27
  307. package/deps/rocksdb/rocksdb/util/single_thread_executor.h +1 -0
  308. package/deps/rocksdb/rocksdb/util/stop_watch.h +1 -1
  309. package/deps/rocksdb/rocksdb/util/thread_operation.h +8 -1
  310. package/deps/rocksdb/rocksdb/util/udt_util.cc +343 -0
  311. package/deps/rocksdb/rocksdb/util/udt_util.h +173 -1
  312. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +447 -0
  313. package/deps/rocksdb/rocksdb/util/write_batch_util.cc +25 -0
  314. package/deps/rocksdb/rocksdb/util/write_batch_util.h +80 -0
  315. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -4
  316. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +69 -25
  317. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +7 -6
  318. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +1 -1
  319. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +2 -3
  320. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +6 -11
  321. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -2
  322. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +4 -5
  323. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +1 -1
  324. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +2 -2
  325. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +2 -1
  326. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +3 -3
  327. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +1 -2
  328. package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +2 -3
  329. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +2 -2
  330. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
  331. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +23 -8
  332. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +9 -6
  333. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +37 -12
  334. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +231 -33
  335. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +0 -1
  336. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +76 -20
  337. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +18 -9
  338. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +40 -23
  339. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +13 -12
  340. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +7 -0
  341. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -1
  342. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +41 -11
  343. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +6 -3
  344. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +71 -24
  345. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +19 -4
  346. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +60 -107
  347. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +39 -11
  348. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +6 -3
  349. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +14 -8
  350. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.h +1 -1
  351. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +10 -5
  352. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  353. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +1 -1
  354. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +2 -1
  355. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +6 -6
  356. package/deps/rocksdb/rocksdb.gyp +2 -0
  357. package/package.json +1 -1
  358. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  359. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -21,6 +21,7 @@
21
21
  #include "monitoring/thread_status_util.h"
22
22
  #include "test_util/sync_point.h"
23
23
  #include "util/cast_util.h"
24
+ #include "util/coding.h"
24
25
  #include "util/concurrent_task_limiter_impl.h"
25
26
 
26
27
  namespace ROCKSDB_NAMESPACE {
@@ -76,6 +77,40 @@ bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
76
77
  return false;
77
78
  }
78
79
 
80
+ bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT(
81
+ const FlushRequest& flush_req) {
82
+ mutex_.AssertHeld();
83
+ assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1);
84
+ ColumnFamilyData* cfd = flush_req.cfd_to_max_mem_id_to_persist.begin()->first;
85
+ uint64_t max_memtable_id =
86
+ flush_req.cfd_to_max_mem_id_to_persist.begin()->second;
87
+ if (cfd->IsDropped() ||
88
+ !cfd->ShouldPostponeFlushToRetainUDT(max_memtable_id)) {
89
+ return false;
90
+ }
91
+ // Check if holding on the flush will cause entering write stall mode.
92
+ // Write stall entered because of the accumulation of write buffers can be
93
+ // alleviated if we continue with the flush instead of postponing it.
94
+ const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
95
+
96
+ // Taking the status of the active Memtable into consideration so that we are
97
+ // not just checking if DB is currently already in write stall mode.
98
+ int mem_to_flush = cfd->mem()->ApproximateMemoryUsageFast() >=
99
+ cfd->mem()->write_buffer_size() / 2
100
+ ? 1
101
+ : 0;
102
+ WriteStallCondition write_stall =
103
+ ColumnFamilyData::GetWriteStallConditionAndCause(
104
+ cfd->imm()->NumNotFlushed() + mem_to_flush, /*num_l0_files=*/0,
105
+ /*num_compaction_needed_bytes=*/0, mutable_cf_options,
106
+ *cfd->ioptions())
107
+ .first;
108
+ if (write_stall != WriteStallCondition::kNormal) {
109
+ return false;
110
+ }
111
+ return true;
112
+ }
113
+
79
114
  IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
80
115
  VersionEdit* synced_wals) {
81
116
  TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
@@ -248,6 +283,24 @@ Status DBImpl::FlushMemTableToOutputFile(
248
283
  // If the log sync failed, we do not need to pick memtable. Otherwise,
249
284
  // num_flush_not_started_ needs to be rollback.
250
285
  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
286
+ // Exit a flush due to bg error should not set bg error again.
287
+ bool skip_set_bg_error = false;
288
+ if (s.ok() && !error_handler_.GetBGError().ok() &&
289
+ error_handler_.IsBGWorkStopped() &&
290
+ flush_reason != FlushReason::kErrorRecovery &&
291
+ flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
292
+ // Error recovery in progress, should not pick memtable which excludes
293
+ // them from being picked up by recovery flush.
294
+ // This ensures that when bg error is set, no new flush can pick
295
+ // memtables.
296
+ skip_set_bg_error = true;
297
+ s = error_handler_.GetBGError();
298
+ assert(!s.ok());
299
+ ROCKS_LOG_BUFFER(log_buffer,
300
+ "[JOB %d] Skip flush due to background error %s",
301
+ job_context->job_id, s.ToString().c_str());
302
+ }
303
+
251
304
  if (s.ok()) {
252
305
  flush_job.PickMemTable();
253
306
  need_cancel = true;
@@ -268,7 +321,8 @@ Status DBImpl::FlushMemTableToOutputFile(
268
321
  // is unlocked by the current thread.
269
322
  if (s.ok()) {
270
323
  s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
271
- &switched_to_mempurge);
324
+ &switched_to_mempurge, &skip_set_bg_error,
325
+ &error_handler_);
272
326
  need_cancel = false;
273
327
  }
274
328
 
@@ -309,7 +363,8 @@ Status DBImpl::FlushMemTableToOutputFile(
309
363
  }
310
364
  }
311
365
 
312
- if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
366
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
367
+ !skip_set_bg_error) {
313
368
  if (log_io_s.ok()) {
314
369
  // Error while writing to MANIFEST.
315
370
  // In fact, versions_->io_status() can also be the result of renaming
@@ -521,6 +576,21 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
521
576
  pick_status.push_back(false);
522
577
  }
523
578
 
579
+ bool flush_for_recovery =
580
+ bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery ||
581
+ bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecoveryRetryFlush;
582
+ bool skip_set_bg_error = false;
583
+
584
+ if (s.ok() && !error_handler_.GetBGError().ok() &&
585
+ error_handler_.IsBGWorkStopped() && !flush_for_recovery) {
586
+ s = error_handler_.GetBGError();
587
+ skip_set_bg_error = true;
588
+ assert(!s.ok());
589
+ ROCKS_LOG_BUFFER(log_buffer,
590
+ "[JOB %d] Skip flush due to background error %s",
591
+ job_context->job_id, s.ToString().c_str());
592
+ }
593
+
524
594
  if (s.ok()) {
525
595
  for (int i = 0; i != num_cfs; ++i) {
526
596
  jobs[i]->PickMemTable();
@@ -585,7 +655,10 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
585
655
  }
586
656
  }
587
657
  }
588
- } else {
658
+ } else if (!skip_set_bg_error) {
659
+ // When `skip_set_bg_error` is true, no memtable is picked so
660
+ // there is no need to call Cancel() or RollbackMemtableFlush().
661
+ //
589
662
  // Need to undo atomic flush if something went wrong, i.e. s is not OK and
590
663
  // it is not because of CF drop.
591
664
  // Have to cancel the flush jobs that have NOT executed because we need to
@@ -598,8 +671,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
598
671
  for (int i = 0; i != num_cfs; ++i) {
599
672
  if (exec_status[i].second.ok() && exec_status[i].first) {
600
673
  auto& mems = jobs[i]->GetMemTables();
601
- cfds[i]->imm()->RollbackMemtableFlush(mems,
602
- file_meta[i].fd.GetNumber());
674
+ cfds[i]->imm()->RollbackMemtableFlush(
675
+ mems, /*rollback_succeeding_memtables=*/false);
603
676
  }
604
677
  }
605
678
  }
@@ -641,10 +714,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
641
714
  };
642
715
 
643
716
  bool resuming_from_bg_err =
644
- error_handler_.IsDBStopped() ||
645
- (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery ||
646
- bg_flush_args[0].flush_reason_ ==
647
- FlushReason::kErrorRecoveryRetryFlush);
717
+ error_handler_.IsDBStopped() || flush_for_recovery;
648
718
  while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) {
649
719
  std::pair<Status, bool> res = wait_to_install_func();
650
720
 
@@ -655,15 +725,27 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
655
725
  s = res.first;
656
726
  break;
657
727
  } else if (!res.second) {
728
+ // we are the oldest immutable memtable
729
+ break;
730
+ }
731
+ // We are not the oldest immutable memtable
732
+ TEST_SYNC_POINT_CALLBACK(
733
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitCV", &res);
734
+ //
735
+ // If bg work is stopped, recovery thread first calls
736
+ // WaitForBackgroundWork() before proceeding to flush for recovery. This
737
+ // flush can block WaitForBackgroundWork() while waiting for recovery
738
+ // flush to install result. To avoid this deadlock, we should abort here
739
+ // if there is background error.
740
+ if (!flush_for_recovery && error_handler_.IsBGWorkStopped() &&
741
+ !error_handler_.GetBGError().ok()) {
742
+ s = error_handler_.GetBGError();
743
+ assert(!s.ok());
658
744
  break;
659
745
  }
660
746
  atomic_flush_install_cv_.Wait();
661
747
 
662
- resuming_from_bg_err =
663
- error_handler_.IsDBStopped() ||
664
- (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery ||
665
- bg_flush_args[0].flush_reason_ ==
666
- FlushReason::kErrorRecoveryRetryFlush);
748
+ resuming_from_bg_err = error_handler_.IsDBStopped() || flush_for_recovery;
667
749
  }
668
750
 
669
751
  if (!resuming_from_bg_err) {
@@ -679,6 +761,17 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
679
761
  // installation.
680
762
  s = error_handler_.GetRecoveryError();
681
763
  }
764
+ // Since we are not installing these memtables, need to rollback
765
+ // to allow future flush job to pick up these memtables.
766
+ if (!s.ok()) {
767
+ for (int i = 0; i != num_cfs; ++i) {
768
+ assert(exec_status[i].first);
769
+ assert(exec_status[i].second.ok());
770
+ auto& mems = jobs[i]->GetMemTables();
771
+ cfds[i]->imm()->RollbackMemtableFlush(
772
+ mems, /*rollback_succeeding_memtables=*/false);
773
+ }
774
+ }
682
775
  }
683
776
 
684
777
  if (s.ok()) {
@@ -782,7 +875,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
782
875
 
783
876
  // Need to undo atomic flush if something went wrong, i.e. s is not OK and
784
877
  // it is not because of CF drop.
785
- if (!s.ok() && !s.IsColumnFamilyDropped()) {
878
+ if (!s.ok() && !s.IsColumnFamilyDropped() && !skip_set_bg_error) {
786
879
  if (log_io_s.ok()) {
787
880
  // Error while writing to MANIFEST.
788
881
  // In fact, versions_->io_status() can also be the result of renaming
@@ -852,8 +945,8 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
852
945
  }
853
946
  }
854
947
  mutex_.Lock();
855
- // no need to signal bg_cv_ as it will be signaled at the end of the
856
- // flush process.
948
+ // no need to signal bg_cv_ as it will be signaled at the end of the
949
+ // flush process.
857
950
  }
858
951
 
859
952
  void DBImpl::NotifyOnFlushCompleted(
@@ -1066,7 +1159,6 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1066
1159
  std::numeric_limits<uint64_t>::max(), trim_ts);
1067
1160
  } else {
1068
1161
  int first_overlapped_level = kInvalidLevel;
1069
- int max_overlapped_level = kInvalidLevel;
1070
1162
  {
1071
1163
  SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
1072
1164
  Version* current_version = super_version->current;
@@ -1142,10 +1234,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1142
1234
  begin, end);
1143
1235
  }
1144
1236
  if (overlap) {
1145
- if (first_overlapped_level == kInvalidLevel) {
1146
- first_overlapped_level = level;
1147
- }
1148
- max_overlapped_level = level;
1237
+ first_overlapped_level = level;
1238
+ break;
1149
1239
  }
1150
1240
  }
1151
1241
  CleanupSuperVersion(super_version);
@@ -1159,7 +1249,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1159
1249
  end, exclusive, true /* disallow_trivial_move */,
1160
1250
  std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
1161
1251
  trim_ts);
1162
- final_output_level = max_overlapped_level;
1252
+ final_output_level = first_overlapped_level;
1163
1253
  } else {
1164
1254
  assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel);
1165
1255
  uint64_t next_file_number = versions_->current_next_file_number();
@@ -1171,7 +1261,29 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1171
1261
  int level = first_overlapped_level;
1172
1262
  final_output_level = level;
1173
1263
  int output_level = 0, base_level = 0;
1174
- while (level < max_overlapped_level || level == 0) {
1264
+ for (;;) {
1265
+ // Always allow L0 -> L1 compaction
1266
+ if (level > 0) {
1267
+ if (cfd->ioptions()->level_compaction_dynamic_level_bytes) {
1268
+ assert(final_output_level < cfd->ioptions()->num_levels);
1269
+ if (final_output_level + 1 == cfd->ioptions()->num_levels) {
1270
+ break;
1271
+ }
1272
+ } else {
1273
+ // TODO(cbi): there is still a race condition here where
1274
+ // if a background compaction compacts some file beyond
1275
+ // current()->storage_info()->num_non_empty_levels() right after
1276
+ // the check here.This should happen very infrequently and should
1277
+ // not happen once a user populates the last level of the LSM.
1278
+ InstrumentedMutexLock l(&mutex_);
1279
+ // num_non_empty_levels may be lower after a compaction, so
1280
+ // we check for >= here.
1281
+ if (final_output_level + 1 >=
1282
+ cfd->current()->storage_info()->num_non_empty_levels()) {
1283
+ break;
1284
+ }
1285
+ }
1286
+ }
1175
1287
  output_level = level + 1;
1176
1288
  if (cfd->ioptions()->level_compaction_dynamic_level_bytes &&
1177
1289
  level == 0) {
@@ -1203,17 +1315,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1203
1315
  if (s.ok()) {
1204
1316
  assert(final_output_level > 0);
1205
1317
  // bottommost level intra-level compaction
1206
- // TODO(cbi): this preserves earlier behavior where if
1207
- // max_overlapped_level = 0 and bottommost_level_compaction is
1208
- // kIfHaveCompactionFilter, we only do a L0 -> LBase compaction
1209
- // and do not do intra-LBase compaction even when user configures
1210
- // compaction filter. We may want to still do a LBase -> LBase
1211
- // compaction in case there is some file in LBase that did not go
1212
- // through L0 -> LBase compaction, and hence did not go through
1213
- // compaction filter.
1214
1318
  if ((options.bottommost_level_compaction ==
1215
1319
  BottommostLevelCompaction::kIfHaveCompactionFilter &&
1216
- max_overlapped_level != 0 &&
1217
1320
  (cfd->ioptions()->compaction_filter != nullptr ||
1218
1321
  cfd->ioptions()->compaction_filter_factory != nullptr)) ||
1219
1322
  options.bottommost_level_compaction ==
@@ -1221,10 +1324,11 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1221
1324
  options.bottommost_level_compaction ==
1222
1325
  BottommostLevelCompaction::kForce) {
1223
1326
  // Use `next_file_number` as `max_file_num_to_ignore` to avoid
1224
- // rewriting newly compacted files when it is kForceOptimized.
1327
+ // rewriting newly compacted files when it is kForceOptimized
1328
+ // or kIfHaveCompactionFilter with compaction filter set.
1225
1329
  s = RunManualCompaction(
1226
1330
  cfd, final_output_level, final_output_level, options, begin,
1227
- end, exclusive, !trim_ts.empty() /* disallow_trivial_move */,
1331
+ end, exclusive, true /* disallow_trivial_move */,
1228
1332
  next_file_number /* max_file_num_to_ignore */, trim_ts);
1229
1333
  }
1230
1334
  }
@@ -1375,6 +1479,14 @@ Status DBImpl::CompactFilesImpl(
1375
1479
  }
1376
1480
  }
1377
1481
 
1482
+ if (cfd->ioptions()->allow_ingest_behind &&
1483
+ output_level >= cfd->ioptions()->num_levels - 1) {
1484
+ return Status::InvalidArgument(
1485
+ "Exceed the maximum output level defined by "
1486
+ "the current compaction algorithm with ingest_behind --- " +
1487
+ std::to_string(cfd->ioptions()->num_levels - 1));
1488
+ }
1489
+
1378
1490
  Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
1379
1491
  &input_set, cf_meta, output_level);
1380
1492
  TEST_SYNC_POINT("DBImpl::CompactFilesImpl::PostSanitizeCompactionInputFiles");
@@ -1492,7 +1604,7 @@ Status DBImpl::CompactFilesImpl(
1492
1604
 
1493
1605
  if (compaction_job_info != nullptr) {
1494
1606
  BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
1495
- job_context->job_id, version, compaction_job_info);
1607
+ job_context->job_id, compaction_job_info);
1496
1608
  }
1497
1609
 
1498
1610
  if (status.ok()) {
@@ -1589,21 +1701,18 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
1589
1701
  }
1590
1702
 
1591
1703
  c->SetNotifyOnCompactionCompleted();
1592
- Version* current = cfd->current();
1593
- current->Ref();
1594
1704
  // release lock while notifying events
1595
1705
  mutex_.Unlock();
1596
1706
  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
1597
1707
  {
1598
1708
  CompactionJobInfo info{};
1599
- BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info);
1709
+ BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, &info);
1600
1710
  for (auto listener : immutable_db_options_.listeners) {
1601
1711
  listener->OnCompactionBegin(this, info);
1602
1712
  }
1603
1713
  info.status.PermitUncheckedError();
1604
1714
  }
1605
1715
  mutex_.Lock();
1606
- current->Unref();
1607
1716
  }
1608
1717
 
1609
1718
  void DBImpl::NotifyOnCompactionCompleted(
@@ -1621,21 +1730,17 @@ void DBImpl::NotifyOnCompactionCompleted(
1621
1730
  return;
1622
1731
  }
1623
1732
 
1624
- Version* current = cfd->current();
1625
- current->Ref();
1626
1733
  // release lock while notifying events
1627
1734
  mutex_.Unlock();
1628
1735
  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
1629
1736
  {
1630
1737
  CompactionJobInfo info{};
1631
- BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
1632
- &info);
1738
+ BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, &info);
1633
1739
  for (auto listener : immutable_db_options_.listeners) {
1634
1740
  listener->OnCompactionCompleted(this, info);
1635
1741
  }
1636
1742
  }
1637
1743
  mutex_.Lock();
1638
- current->Unref();
1639
1744
  // no need to signal bg_cv_ as it will be signaled at the end of the
1640
1745
  // flush process.
1641
1746
  }
@@ -1758,7 +1863,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
1758
1863
  f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
1759
1864
  f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
1760
1865
  f->file_checksum, f->file_checksum_func_name, f->unique_id,
1761
- f->compensated_range_deletion_size, f->tail_size);
1866
+ f->compensated_range_deletion_size, f->tail_size,
1867
+ f->user_defined_timestamps_persisted);
1762
1868
  }
1763
1869
  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
1764
1870
  "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@@ -1808,6 +1914,37 @@ int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
1808
1914
  ->mutable_cf_options.level0_stop_writes_trigger;
1809
1915
  }
1810
1916
 
1917
+ Status DBImpl::FlushAllColumnFamilies(const FlushOptions& flush_options,
1918
+ FlushReason flush_reason) {
1919
+ mutex_.AssertHeld();
1920
+ Status status;
1921
+ if (immutable_db_options_.atomic_flush) {
1922
+ mutex_.Unlock();
1923
+ status = AtomicFlushMemTables(flush_options, flush_reason);
1924
+ if (status.IsColumnFamilyDropped()) {
1925
+ status = Status::OK();
1926
+ }
1927
+ mutex_.Lock();
1928
+ } else {
1929
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
1930
+ if (cfd->IsDropped()) {
1931
+ continue;
1932
+ }
1933
+ mutex_.Unlock();
1934
+ status = FlushMemTable(cfd, flush_options, flush_reason);
1935
+ TEST_SYNC_POINT("DBImpl::FlushAllColumnFamilies:1");
1936
+ TEST_SYNC_POINT("DBImpl::FlushAllColumnFamilies:2");
1937
+ mutex_.Lock();
1938
+ if (!status.ok() && !status.IsColumnFamilyDropped()) {
1939
+ break;
1940
+ } else if (status.IsColumnFamilyDropped()) {
1941
+ status = Status::OK();
1942
+ }
1943
+ }
1944
+ }
1945
+ return status;
1946
+ }
1947
+
1811
1948
  Status DBImpl::Flush(const FlushOptions& flush_options,
1812
1949
  ColumnFamilyHandle* column_family) {
1813
1950
  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
@@ -2144,9 +2281,13 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
2144
2281
  WaitForPendingWrites();
2145
2282
 
2146
2283
  if (flush_reason != FlushReason::kErrorRecoveryRetryFlush &&
2284
+ flush_reason != FlushReason::kCatchUpAfterErrorRecovery &&
2147
2285
  (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) {
2148
2286
  // Note that, when flush reason is kErrorRecoveryRetryFlush, during the
2149
2287
  // auto retry resume, we want to avoid creating new small memtables.
2288
+ // If flush reason is kCatchUpAfterErrorRecovery, we try to flush any new
2289
+ // memtable that filled up during recovery, and we also want to avoid
2290
+ // switching memtable to create small memtables.
2150
2291
  // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl
2151
2292
  // will iterate through all the CFs and call FlushMemtable during auto
2152
2293
  // retry resume, it is possible that in some CFs,
@@ -2337,7 +2478,8 @@ Status DBImpl::AtomicFlushMemTables(
2337
2478
 
2338
2479
  for (auto cfd : cfds) {
2339
2480
  if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) ||
2340
- flush_reason == FlushReason::kErrorRecoveryRetryFlush) {
2481
+ flush_reason == FlushReason::kErrorRecoveryRetryFlush ||
2482
+ flush_reason == FlushReason::kCatchUpAfterErrorRecovery) {
2341
2483
  continue;
2342
2484
  }
2343
2485
  cfd->Ref();
@@ -2455,8 +2597,11 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
2455
2597
  // check whether one extra immutable memtable or an extra L0 file would
2456
2598
  // cause write stalling mode to be entered. It could still enter stall
2457
2599
  // mode due to pending compaction bytes, but that's less common
2600
+ // No extra immutable Memtable will be created if the current Memtable is
2601
+ // empty.
2602
+ int mem_to_flush = cfd->mem()->IsEmpty() ? 0 : 1;
2458
2603
  write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause(
2459
- cfd->imm()->NumNotFlushed() + 1,
2604
+ cfd->imm()->NumNotFlushed() + mem_to_flush,
2460
2605
  vstorage->l0_delay_trigger_count() + 1,
2461
2606
  vstorage->estimated_compaction_needed_bytes(),
2462
2607
  mutable_cf_options, *cfd->ioptions())
@@ -2602,6 +2747,11 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
2602
2747
  // There has been a hard error and this call is not part of the recovery
2603
2748
  // sequence. Bail out here so we don't get into an endless loop of
2604
2749
  // scheduling BG work which will again call this function
2750
+ //
2751
+ // Note that a non-recovery flush can still be scheduled if
2752
+ // error_handler_.IsRecoveryInProgress() returns true. We rely on
2753
+ // BackgroundCallFlush() to check flush reason and drop non-recovery
2754
+ // flushes.
2605
2755
  return;
2606
2756
  } else if (shutting_down_.load(std::memory_order_acquire)) {
2607
2757
  // DB is being deleted; no more background compactions
@@ -2612,6 +2762,9 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
2612
2762
  env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
2613
2763
  while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
2614
2764
  bg_flush_scheduled_ < bg_job_limits.max_flushes) {
2765
+ TEST_SYNC_POINT_CALLBACK(
2766
+ "DBImpl::MaybeScheduleFlushOrCompaction:BeforeSchedule",
2767
+ &unscheduled_flushes_);
2615
2768
  bg_flush_scheduled_++;
2616
2769
  FlushThreadArg* fta = new FlushThreadArg;
2617
2770
  fta->db_ = this;
@@ -2721,7 +2874,7 @@ ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
2721
2874
 
2722
2875
  DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
2723
2876
  assert(!flush_queue_.empty());
2724
- FlushRequest flush_req = flush_queue_.front();
2877
+ FlushRequest flush_req = std::move(flush_queue_.front());
2725
2878
  flush_queue_.pop_front();
2726
2879
  if (!immutable_db_options_.atomic_flush) {
2727
2880
  assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1);
@@ -2765,6 +2918,9 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue(
2765
2918
 
2766
2919
  void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
2767
2920
  mutex_.AssertHeld();
2921
+ if (reject_new_background_jobs_) {
2922
+ return;
2923
+ }
2768
2924
  if (flush_req.cfd_to_max_mem_id_to_persist.empty()) {
2769
2925
  return;
2770
2926
  }
@@ -2794,6 +2950,9 @@ void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
2794
2950
 
2795
2951
  void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
2796
2952
  mutex_.AssertHeld();
2953
+ if (reject_new_background_jobs_) {
2954
+ return;
2955
+ }
2797
2956
  if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
2798
2957
  AddToCompactionQueue(cfd);
2799
2958
  ++unscheduled_compactions_;
@@ -2803,6 +2962,9 @@ void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
2803
2962
  void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
2804
2963
  FileType type, uint64_t number, int job_id) {
2805
2964
  mutex_.AssertHeld();
2965
+ if (reject_new_background_jobs_) {
2966
+ return;
2967
+ }
2806
2968
  PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
2807
2969
  purge_files_.insert({{number, std::move(file_info)}});
2808
2970
  }
@@ -2891,6 +3053,7 @@ void DBImpl::UnscheduleFlushCallback(void* arg) {
2891
3053
 
2892
3054
  Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
2893
3055
  LogBuffer* log_buffer, FlushReason* reason,
3056
+ bool* flush_rescheduled_to_retain_udt,
2894
3057
  Env::Priority thread_pri) {
2895
3058
  mutex_.AssertHeld();
2896
3059
 
@@ -2916,14 +3079,61 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
2916
3079
  autovector<ColumnFamilyData*> column_families_not_to_flush;
2917
3080
  while (!flush_queue_.empty()) {
2918
3081
  // This cfd is already referenced
2919
- const FlushRequest& flush_req = PopFirstFromFlushQueue();
3082
+ FlushRequest flush_req = PopFirstFromFlushQueue();
2920
3083
  FlushReason flush_reason = flush_req.flush_reason;
3084
+ if (!error_handler_.GetBGError().ok() && error_handler_.IsBGWorkStopped() &&
3085
+ flush_reason != FlushReason::kErrorRecovery &&
3086
+ flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
3087
+ // Stop non-recovery flush when bg work is stopped
3088
+ // Note that we drop the flush request here.
3089
+ // Recovery thread should schedule further flushes after bg error
3090
+ // is cleared.
3091
+ status = error_handler_.GetBGError();
3092
+ assert(!status.ok());
3093
+ ROCKS_LOG_BUFFER(log_buffer,
3094
+ "[JOB %d] Abort flush due to background error %s",
3095
+ job_context->job_id, status.ToString().c_str());
3096
+ *reason = flush_reason;
3097
+ for (auto item : flush_req.cfd_to_max_mem_id_to_persist) {
3098
+ item.first->UnrefAndTryDelete();
3099
+ }
3100
+ return status;
3101
+ }
3102
+ if (!immutable_db_options_.atomic_flush &&
3103
+ ShouldRescheduleFlushRequestToRetainUDT(flush_req)) {
3104
+ assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1);
3105
+ ColumnFamilyData* cfd =
3106
+ flush_req.cfd_to_max_mem_id_to_persist.begin()->first;
3107
+ if (cfd->UnrefAndTryDelete()) {
3108
+ return Status::OK();
3109
+ }
3110
+ ROCKS_LOG_BUFFER(log_buffer,
3111
+ "FlushRequest for column family %s is re-scheduled to "
3112
+ "retain user-defined timestamps.",
3113
+ cfd->GetName().c_str());
3114
+ // Reschedule the `FlushRequest` as is without checking dropped column
3115
+ // family etc. The follow-up job will do the check anyways, so save the
3116
+ // duplication. Column family is deduplicated by `SchdulePendingFlush` and
3117
+ // `PopFirstFromFlushQueue` contains at flush request enqueueing and
3118
+ // dequeueing time.
3119
+ // This flush request is rescheduled right after it's popped from the
3120
+ // queue while the db mutex is held, so there should be no other
3121
+ // FlushRequest for the same column family with higher `max_memtable_id`
3122
+ // in the queue to block the reschedule from succeeding.
3123
+ #ifndef NDEBUG
3124
+ flush_req.reschedule_count += 1;
3125
+ #endif /* !NDEBUG */
3126
+ SchedulePendingFlush(flush_req);
3127
+ *reason = flush_reason;
3128
+ *flush_rescheduled_to_retain_udt = true;
3129
+ return Status::TryAgain();
3130
+ }
2921
3131
  superversion_contexts.clear();
2922
3132
  superversion_contexts.reserve(
2923
3133
  flush_req.cfd_to_max_mem_id_to_persist.size());
2924
3134
 
2925
- for (const auto& iter : flush_req.cfd_to_max_mem_id_to_persist) {
2926
- ColumnFamilyData* cfd = iter.first;
3135
+ for (const auto& [cfd, max_memtable_id] :
3136
+ flush_req.cfd_to_max_mem_id_to_persist) {
2927
3137
  if (cfd->GetMempurgeUsed()) {
2928
3138
  // If imm() contains silent memtables (e.g.: because
2929
3139
  // MemPurge was activated), requesting a flush will
@@ -2937,10 +3147,16 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
2937
3147
  continue;
2938
3148
  }
2939
3149
  superversion_contexts.emplace_back(SuperVersionContext(true));
2940
- bg_flush_args.emplace_back(cfd, iter.second,
3150
+ bg_flush_args.emplace_back(cfd, max_memtable_id,
2941
3151
  &(superversion_contexts.back()), flush_reason);
2942
3152
  }
2943
- if (!bg_flush_args.empty()) {
3153
+ // `MaybeScheduleFlushOrCompaction` schedules as many `BackgroundCallFlush`
3154
+ // jobs as the number of `FlushRequest` in the `flush_queue_`, a.k.a
3155
+ // `unscheduled_flushes_`. So it's sufficient to make each `BackgroundFlush`
3156
+ // handle one `FlushRequest` and each have a Status returned.
3157
+ if (!bg_flush_args.empty() || !column_families_not_to_flush.empty()) {
3158
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundFlush:CheckFlushRequest:cb",
3159
+ const_cast<int*>(&flush_req.reschedule_count));
2944
3160
  break;
2945
3161
  }
2946
3162
  }
@@ -3002,11 +3218,20 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
3002
3218
  pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
3003
3219
  CaptureCurrentFileNumberInPendingOutputs()));
3004
3220
  FlushReason reason;
3005
-
3006
- Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
3007
- &reason, thread_pri);
3008
- if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
3009
- reason != FlushReason::kErrorRecovery) {
3221
+ bool flush_rescheduled_to_retain_udt = false;
3222
+ Status s =
3223
+ BackgroundFlush(&made_progress, &job_context, &log_buffer, &reason,
3224
+ &flush_rescheduled_to_retain_udt, thread_pri);
3225
+ if (s.IsTryAgain() && flush_rescheduled_to_retain_udt) {
3226
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
3227
+ mutex_.Unlock();
3228
+ TEST_SYNC_POINT_CALLBACK("DBImpl::AfterRetainUDTReschedule:cb", nullptr);
3229
+ immutable_db_options_.clock->SleepForMicroseconds(
3230
+ 100000); // prevent hot loop
3231
+ mutex_.Lock();
3232
+ } else if (!s.ok() && !s.IsShutdownInProgress() &&
3233
+ !s.IsColumnFamilyDropped() &&
3234
+ reason != FlushReason::kErrorRecovery) {
3010
3235
  // Wait a little bit before retrying background flush in
3011
3236
  // case this is an environmental problem and we do not want to
3012
3237
  // chew up resources for failed flushes for the duration of
@@ -3016,9 +3241,9 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
3016
3241
  bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
3017
3242
  mutex_.Unlock();
3018
3243
  ROCKS_LOG_ERROR(immutable_db_options_.info_log,
3019
- "Waiting after background flush error: %s"
3244
+ "[JOB %d] Waiting after background flush error: %s"
3020
3245
  "Accumulated background error counts: %" PRIu64,
3021
- s.ToString().c_str(), error_cnt);
3246
+ job_context.job_id, s.ToString().c_str(), error_cnt);
3022
3247
  log_buffer.FlushBufferToLog();
3023
3248
  LogFlush(immutable_db_options_.info_log);
3024
3249
  immutable_db_options_.clock->SleepForMicroseconds(1000000);
@@ -3027,29 +3252,33 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
3027
3252
 
3028
3253
  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
3029
3254
  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
3030
-
3031
- // If flush failed, we want to delete all temporary files that we might have
3032
- // created. Thus, we force full scan in FindObsoleteFiles()
3033
- FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
3034
- !s.IsColumnFamilyDropped());
3035
- // delete unnecessary files if any, this is done outside the mutex
3036
- if (job_context.HaveSomethingToClean() ||
3037
- job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
3038
- mutex_.Unlock();
3039
- TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
3040
- // Have to flush the info logs before bg_flush_scheduled_--
3041
- // because if bg_flush_scheduled_ becomes 0 and the lock is
3042
- // released, the deconstructor of DB can kick in and destroy all the
3043
- // states of DB so info_log might not be available after that point.
3044
- // It also applies to access other states that DB owns.
3045
- log_buffer.FlushBufferToLog();
3046
- if (job_context.HaveSomethingToDelete()) {
3047
- PurgeObsoleteFiles(job_context);
3255
+ // There is no need to do these clean up if the flush job is rescheduled
3256
+ // to retain user-defined timestamps because the job doesn't get to the
3257
+ // stage of actually flushing the MemTables.
3258
+ if (!flush_rescheduled_to_retain_udt) {
3259
+ // If flush failed, we want to delete all temporary files that we might
3260
+ // have created. Thus, we force full scan in FindObsoleteFiles()
3261
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
3262
+ !s.IsColumnFamilyDropped());
3263
+ // delete unnecessary files if any, this is done outside the mutex
3264
+ if (job_context.HaveSomethingToClean() ||
3265
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
3266
+ mutex_.Unlock();
3267
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
3268
+ // Have to flush the info logs before bg_flush_scheduled_--
3269
+ // because if bg_flush_scheduled_ becomes 0 and the lock is
3270
+ // released, the deconstructor of DB can kick in and destroy all the
3271
+ // states of DB so info_log might not be available after that point.
3272
+ // It also applies to access other states that DB owns.
3273
+ log_buffer.FlushBufferToLog();
3274
+ if (job_context.HaveSomethingToDelete()) {
3275
+ PurgeObsoleteFiles(job_context);
3276
+ }
3277
+ job_context.Clean();
3278
+ mutex_.Lock();
3048
3279
  }
3049
- job_context.Clean();
3050
- mutex_.Lock();
3280
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
3051
3281
  }
3052
- TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
3053
3282
 
3054
3283
  assert(num_running_flushes_ > 0);
3055
3284
  num_running_flushes_--;
@@ -3457,7 +3686,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3457
3686
  f->oldest_blob_file_number, f->oldest_ancester_time,
3458
3687
  f->file_creation_time, f->epoch_number, f->file_checksum,
3459
3688
  f->file_checksum_func_name, f->unique_id,
3460
- f->compensated_range_deletion_size, f->tail_size);
3689
+ f->compensated_range_deletion_size, f->tail_size,
3690
+ f->user_defined_timestamps_persisted);
3461
3691
 
3462
3692
  ROCKS_LOG_BUFFER(
3463
3693
  log_buffer,
@@ -3781,7 +4011,7 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
3781
4011
  void DBImpl::BuildCompactionJobInfo(
3782
4012
  const ColumnFamilyData* cfd, Compaction* c, const Status& st,
3783
4013
  const CompactionJobStats& compaction_job_stats, const int job_id,
3784
- const Version* current, CompactionJobInfo* compaction_job_info) const {
4014
+ CompactionJobInfo* compaction_job_info) const {
3785
4015
  assert(compaction_job_info != nullptr);
3786
4016
  compaction_job_info->cf_id = cfd->GetID();
3787
4017
  compaction_job_info->cf_name = cfd->GetName();
@@ -3791,7 +4021,7 @@ void DBImpl::BuildCompactionJobInfo(
3791
4021
  compaction_job_info->base_input_level = c->start_level();
3792
4022
  compaction_job_info->output_level = c->output_level();
3793
4023
  compaction_job_info->stats = compaction_job_stats;
3794
- compaction_job_info->table_properties = c->GetOutputTableProperties();
4024
+ compaction_job_info->table_properties = c->GetTableProperties();
3795
4025
  compaction_job_info->compaction_reason = c->compaction_reason();
3796
4026
  compaction_job_info->compression = c->output_compression();
3797
4027
 
@@ -3805,15 +4035,9 @@ void DBImpl::BuildCompactionJobInfo(
3805
4035
  compaction_job_info->input_files.push_back(fn);
3806
4036
  compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
3807
4037
  static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
3808
- if (compaction_job_info->table_properties.count(fn) == 0) {
3809
- std::shared_ptr<const TableProperties> tp;
3810
- auto s = current->GetTableProperties(read_options, &tp, fmd, &fn);
3811
- if (s.ok()) {
3812
- compaction_job_info->table_properties[fn] = tp;
3813
- }
3814
- }
3815
4038
  }
3816
4039
  }
4040
+
3817
4041
  for (const auto& newf : c->edit()->GetNewFiles()) {
3818
4042
  const FileMetaData& meta = newf.second;
3819
4043
  const FileDescriptor& desc = meta.fd;
@@ -3957,20 +4181,54 @@ void DBImpl::GetSnapshotContext(
3957
4181
  *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
3958
4182
  }
3959
4183
 
3960
- Status DBImpl::WaitForCompact(bool abort_on_pause) {
4184
+ Status DBImpl::WaitForCompact(
4185
+ const WaitForCompactOptions& wait_for_compact_options) {
3961
4186
  InstrumentedMutexLock l(&mutex_);
4187
+ if (wait_for_compact_options.flush) {
4188
+ Status s = DBImpl::FlushAllColumnFamilies(FlushOptions(),
4189
+ FlushReason::kManualFlush);
4190
+ if (!s.ok()) {
4191
+ return s;
4192
+ }
4193
+ } else if (wait_for_compact_options.close_db &&
4194
+ has_unpersisted_data_.load(std::memory_order_relaxed) &&
4195
+ !mutable_db_options_.avoid_flush_during_shutdown) {
4196
+ Status s =
4197
+ DBImpl::FlushAllColumnFamilies(FlushOptions(), FlushReason::kShutDown);
4198
+ if (!s.ok()) {
4199
+ return s;
4200
+ }
4201
+ }
4202
+ TEST_SYNC_POINT("DBImpl::WaitForCompact:StartWaiting");
4203
+ const auto deadline = immutable_db_options_.clock->NowMicros() +
4204
+ wait_for_compact_options.timeout.count();
3962
4205
  for (;;) {
3963
4206
  if (shutting_down_.load(std::memory_order_acquire)) {
3964
4207
  return Status::ShutdownInProgress();
3965
4208
  }
3966
- if (bg_work_paused_ && abort_on_pause) {
4209
+ if (bg_work_paused_ && wait_for_compact_options.abort_on_pause) {
3967
4210
  return Status::Aborted();
3968
4211
  }
3969
4212
  if ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
3970
4213
  bg_flush_scheduled_ || unscheduled_compactions_ ||
3971
- unscheduled_flushes_) &&
4214
+ unscheduled_flushes_ || error_handler_.IsRecoveryInProgress()) &&
3972
4215
  (error_handler_.GetBGError().ok())) {
3973
- bg_cv_.Wait();
4216
+ if (wait_for_compact_options.timeout.count()) {
4217
+ if (bg_cv_.TimedWait(deadline)) {
4218
+ return Status::TimedOut();
4219
+ }
4220
+ } else {
4221
+ bg_cv_.Wait();
4222
+ }
4223
+ } else if (wait_for_compact_options.close_db) {
4224
+ reject_new_background_jobs_ = true;
4225
+ mutex_.Unlock();
4226
+ Status s = Close();
4227
+ mutex_.Lock();
4228
+ if (!s.ok()) {
4229
+ reject_new_background_jobs_ = false;
4230
+ }
4231
+ return s;
3974
4232
  } else {
3975
4233
  return error_handler_.GetBGError();
3976
4234
  }