@nxtedition/rocksdb 8.2.8 → 9.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (483)
  1. package/binding.cc +0 -21
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +20 -10
  3. package/deps/rocksdb/rocksdb/Makefile +37 -25
  4. package/deps/rocksdb/rocksdb/README.md +29 -0
  5. package/deps/rocksdb/rocksdb/TARGETS +25 -2
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +35 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +229 -74
  8. package/deps/rocksdb/rocksdb/cache/cache_helpers.cc +2 -1
  9. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +4 -3
  10. package/deps/rocksdb/rocksdb/cache/cache_test.cc +58 -95
  11. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +4 -2
  12. package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -3
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +2683 -496
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.h +580 -159
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +145 -42
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +20 -1
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +391 -17
  18. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +7 -5
  19. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +309 -212
  20. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +0 -32
  21. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +439 -12
  22. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +44 -2
  23. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +11 -1
  24. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +16 -3
  25. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache.cc +119 -0
  26. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache.h +155 -0
  27. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +711 -0
  28. package/deps/rocksdb/rocksdb/cache/typed_cache.h +17 -11
  29. package/deps/rocksdb/rocksdb/crash_test.mk +14 -0
  30. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +28 -12
  31. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +1 -0
  32. package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +2 -1
  33. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +1 -1
  34. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -1
  35. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +2 -2
  36. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +1 -1
  37. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +20 -22
  38. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +1 -2
  39. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +1 -1
  40. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +2 -3
  41. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +1 -1
  42. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +8 -0
  43. package/deps/rocksdb/rocksdb/db/blob/db_blob_index_test.cc +7 -3
  44. package/deps/rocksdb/rocksdb/db/builder.cc +35 -10
  45. package/deps/rocksdb/rocksdb/db/c.cc +233 -6
  46. package/deps/rocksdb/rocksdb/db/c_test.c +140 -6
  47. package/deps/rocksdb/rocksdb/db/column_family.cc +110 -51
  48. package/deps/rocksdb/rocksdb/db/column_family.h +34 -2
  49. package/deps/rocksdb/rocksdb/db/column_family_test.cc +314 -7
  50. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +106 -23
  52. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +47 -9
  53. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +10 -11
  54. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -6
  55. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +2 -2
  56. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +148 -60
  57. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +22 -7
  58. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -0
  59. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -4
  60. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +33 -23
  61. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +14 -5
  62. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -11
  63. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +3 -0
  64. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +90 -4
  65. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +170 -95
  66. package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +3 -1
  67. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +32 -58
  68. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +3 -1
  69. package/deps/rocksdb/rocksdb/db/convenience.cc +20 -3
  70. package/deps/rocksdb/rocksdb/db/convenience_impl.h +15 -0
  71. package/deps/rocksdb/rocksdb/db/corruption_test.cc +17 -0
  72. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +1 -0
  73. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +46 -10
  74. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +13 -3
  75. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +74 -15
  76. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +27 -3
  77. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +850 -44
  78. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +2 -29
  79. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +275 -1
  80. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +52 -19
  81. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +6 -5
  82. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +733 -320
  83. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +155 -66
  84. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +516 -155
  85. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +8 -4
  86. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +2 -1
  87. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +17 -4
  88. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +100 -35
  89. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +95 -50
  90. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +13 -9
  91. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +136 -79
  92. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +6 -95
  93. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +31 -22
  94. package/deps/rocksdb/rocksdb/db/db_info_dumper.cc +6 -0
  95. package/deps/rocksdb/rocksdb/db/db_iter.cc +85 -57
  96. package/deps/rocksdb/rocksdb/db/db_iter.h +11 -2
  97. package/deps/rocksdb/rocksdb/db/db_iter_test.cc +29 -0
  98. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +276 -21
  99. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +35 -0
  100. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +4 -11
  101. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +193 -7
  102. package/deps/rocksdb/rocksdb/db/db_options_test.cc +294 -26
  103. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +26 -36
  104. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +364 -0
  105. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +13 -3
  106. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +52 -0
  107. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +74 -1
  108. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +22 -4
  109. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +1 -1
  110. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +1 -0
  111. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +282 -167
  112. package/deps/rocksdb/rocksdb/db/db_test.cc +180 -49
  113. package/deps/rocksdb/rocksdb/db/db_test2.cc +84 -12
  114. package/deps/rocksdb/rocksdb/db/db_test_util.cc +25 -12
  115. package/deps/rocksdb/rocksdb/db/db_test_util.h +45 -2
  116. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +14 -1
  117. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +245 -0
  118. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +480 -1
  119. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +6 -6
  120. package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -2
  121. package/deps/rocksdb/rocksdb/db/dbformat.cc +36 -0
  122. package/deps/rocksdb/rocksdb/db/dbformat.h +169 -20
  123. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +129 -0
  124. package/deps/rocksdb/rocksdb/db/deletefile_test.cc +2 -0
  125. package/deps/rocksdb/rocksdb/db/error_handler.cc +67 -34
  126. package/deps/rocksdb/rocksdb/db/error_handler.h +13 -9
  127. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +4 -4
  128. package/deps/rocksdb/rocksdb/db/event_helpers.cc +4 -0
  129. package/deps/rocksdb/rocksdb/db/experimental.cc +2 -1
  130. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +4 -4
  131. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +17 -8
  132. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +144 -4
  133. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +1 -1
  134. package/deps/rocksdb/rocksdb/db/file_indexer.cc +2 -4
  135. package/deps/rocksdb/rocksdb/db/flush_job.cc +105 -17
  136. package/deps/rocksdb/rocksdb/db/flush_job.h +27 -4
  137. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +90 -12
  138. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +2 -3
  139. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +159 -91
  140. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +19 -10
  141. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +143 -0
  142. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -1
  143. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  144. package/deps/rocksdb/rocksdb/db/listener_test.cc +2 -1
  145. package/deps/rocksdb/rocksdb/db/log_reader.h +3 -2
  146. package/deps/rocksdb/rocksdb/db/log_test.cc +17 -21
  147. package/deps/rocksdb/rocksdb/db/log_writer.cc +1 -1
  148. package/deps/rocksdb/rocksdb/db/log_writer.h +3 -2
  149. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -3
  150. package/deps/rocksdb/rocksdb/db/memtable.cc +70 -83
  151. package/deps/rocksdb/rocksdb/db/memtable.h +45 -1
  152. package/deps/rocksdb/rocksdb/db/memtable_list.cc +45 -11
  153. package/deps/rocksdb/rocksdb/db/memtable_list.h +43 -2
  154. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +91 -5
  155. package/deps/rocksdb/rocksdb/db/merge_helper.cc +330 -115
  156. package/deps/rocksdb/rocksdb/db/merge_helper.h +100 -12
  157. package/deps/rocksdb/rocksdb/db/merge_operator.cc +82 -0
  158. package/deps/rocksdb/rocksdb/db/merge_test.cc +267 -0
  159. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +5 -2
  160. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +4 -4
  161. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +3 -0
  162. package/deps/rocksdb/rocksdb/db/prefix_test.cc +1 -0
  163. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +4 -0
  164. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +4 -0
  165. package/deps/rocksdb/rocksdb/db/repair.cc +25 -7
  166. package/deps/rocksdb/rocksdb/db/repair_test.cc +143 -2
  167. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +459 -74
  168. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +105 -69
  169. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +83 -46
  170. package/deps/rocksdb/rocksdb/db/table_cache.cc +76 -54
  171. package/deps/rocksdb/rocksdb/db/table_cache.h +18 -12
  172. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
  173. package/deps/rocksdb/rocksdb/db/version_builder.cc +0 -1
  174. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +236 -204
  175. package/deps/rocksdb/rocksdb/db/version_edit.cc +66 -4
  176. package/deps/rocksdb/rocksdb/db/version_edit.h +58 -10
  177. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +80 -8
  178. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +12 -0
  179. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +86 -17
  180. package/deps/rocksdb/rocksdb/db/version_set.cc +207 -110
  181. package/deps/rocksdb/rocksdb/db/version_set.h +36 -15
  182. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -5
  183. package/deps/rocksdb/rocksdb/db/version_set_test.cc +47 -26
  184. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +525 -0
  185. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +6 -22
  186. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -20
  187. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +0 -29
  188. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +46 -0
  189. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +40 -0
  190. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper_test.cc +39 -0
  191. package/deps/rocksdb/rocksdb/db/write_batch.cc +55 -20
  192. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +3 -0
  193. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +16 -0
  194. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -0
  195. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +4 -4
  196. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +4 -7
  197. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +88 -10
  198. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +37 -13
  199. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +110 -58
  200. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +42 -0
  201. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +68 -17
  202. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +34 -0
  203. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +8 -1
  204. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +429 -237
  205. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +13 -6
  206. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +21 -14
  207. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_wide_merge_operator.cc +51 -0
  208. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_wide_merge_operator.h +27 -0
  209. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +3 -6
  210. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +2 -0
  211. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +29 -38
  212. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +302 -101
  213. package/deps/rocksdb/rocksdb/env/env.cc +6 -2
  214. package/deps/rocksdb/rocksdb/env/env_encryption.cc +11 -165
  215. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +0 -17
  216. package/deps/rocksdb/rocksdb/env/env_posix.cc +6 -2
  217. package/deps/rocksdb/rocksdb/env/env_test.cc +86 -2
  218. package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
  219. package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +79 -0
  220. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +34 -0
  221. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -0
  222. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +15 -4
  223. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +100 -70
  224. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +64 -18
  225. package/deps/rocksdb/rocksdb/file/file_util.cc +10 -5
  226. package/deps/rocksdb/rocksdb/file/file_util.h +13 -1
  227. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +1225 -97
  228. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +72 -33
  229. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +3 -16
  230. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +23 -12
  231. package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +3 -0
  232. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +40 -14
  233. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +163 -91
  234. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +112 -2
  235. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +108 -16
  236. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +11 -0
  237. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -0
  238. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +42 -2
  239. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +1 -1
  240. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +92 -12
  241. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +34 -4
  242. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +9 -109
  243. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +91 -13
  244. package/deps/rocksdb/rocksdb/include/rocksdb/filter_policy.h +8 -3
  245. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +10 -4
  246. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +7 -0
  247. package/deps/rocksdb/rocksdb/include/rocksdb/memory_allocator.h +1 -1
  248. package/deps/rocksdb/rocksdb/include/rocksdb/merge_operator.h +55 -4
  249. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +130 -22
  250. package/deps/rocksdb/rocksdb/include/rocksdb/port_defs.h +4 -0
  251. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +9 -0
  252. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +92 -9
  253. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +2 -1
  254. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +5 -1
  255. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +37 -2
  256. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +35 -0
  257. package/deps/rocksdb/rocksdb/include/rocksdb/system_clock.h +15 -0
  258. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +7 -1
  259. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +20 -3
  260. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +7 -0
  261. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +7 -0
  262. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +6 -1
  263. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +33 -2
  264. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +2 -1
  265. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +14 -0
  266. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +42 -2
  267. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +0 -3
  268. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  269. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +53 -2
  270. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +3 -2
  271. package/deps/rocksdb/rocksdb/memory/arena_test.cc +18 -11
  272. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +4 -3
  273. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h +1 -1
  274. package/deps/rocksdb/rocksdb/microbench/README.md +60 -0
  275. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +69 -34
  276. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +1 -1
  277. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +22 -1
  278. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +18 -7
  279. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +14 -0
  280. package/deps/rocksdb/rocksdb/options/cf_options.cc +19 -0
  281. package/deps/rocksdb/rocksdb/options/cf_options.h +10 -2
  282. package/deps/rocksdb/rocksdb/options/customizable_test.cc +6 -1
  283. package/deps/rocksdb/rocksdb/options/db_options.cc +54 -2
  284. package/deps/rocksdb/rocksdb/options/db_options.h +4 -0
  285. package/deps/rocksdb/rocksdb/options/options.cc +15 -1
  286. package/deps/rocksdb/rocksdb/options/options_helper.cc +18 -0
  287. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +14 -4
  288. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -1
  289. package/deps/rocksdb/rocksdb/plugin/README.md +43 -0
  290. package/deps/rocksdb/rocksdb/port/README +10 -0
  291. package/deps/rocksdb/rocksdb/port/mmap.h +20 -0
  292. package/deps/rocksdb/rocksdb/port/port_example.h +1 -1
  293. package/deps/rocksdb/rocksdb/port/port_posix.cc +1 -1
  294. package/deps/rocksdb/rocksdb/port/port_posix.h +7 -4
  295. package/deps/rocksdb/rocksdb/port/stack_trace.cc +32 -12
  296. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -1
  297. package/deps/rocksdb/rocksdb/port/win/port_win.h +5 -2
  298. package/deps/rocksdb/rocksdb/src.mk +10 -1
  299. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  300. package/deps/rocksdb/rocksdb/table/block_based/block.cc +48 -22
  301. package/deps/rocksdb/rocksdb/table/block_based/block.h +60 -12
  302. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +116 -43
  303. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +9 -6
  304. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +321 -49
  305. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +98 -4
  306. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +233 -98
  307. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +58 -23
  308. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +12 -8
  309. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +52 -24
  310. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +219 -51
  311. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +41 -8
  312. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -1
  313. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +3 -1
  314. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +26 -7
  315. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +50 -18
  316. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +20 -8
  317. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +232 -71
  318. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +6 -6
  319. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +44 -26
  320. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +2 -1
  321. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
  322. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +31 -16
  323. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +97 -58
  324. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +2 -2
  325. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +6 -0
  326. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +36 -19
  327. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +3 -1
  328. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +114 -70
  329. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -3
  330. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +11 -7
  331. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +15 -3
  332. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +6 -3
  333. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
  334. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +14 -13
  335. package/deps/rocksdb/rocksdb/table/block_fetcher.h +4 -0
  336. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +9 -2
  337. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +1 -0
  338. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +6 -2
  339. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +1 -2
  340. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +2 -3
  341. package/deps/rocksdb/rocksdb/table/format.cc +175 -33
  342. package/deps/rocksdb/rocksdb/table/format.h +63 -10
  343. package/deps/rocksdb/rocksdb/table/get_context.cc +52 -89
  344. package/deps/rocksdb/rocksdb/table/get_context.h +12 -3
  345. package/deps/rocksdb/rocksdb/table/internal_iterator.h +11 -0
  346. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +29 -1
  347. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +22 -2
  348. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +12 -4
  349. package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -0
  350. package/deps/rocksdb/rocksdb/table/mock_table.cc +8 -3
  351. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +10 -5
  352. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +10 -1
  353. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +1 -2
  354. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +3 -3
  355. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +45 -9
  356. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +1 -0
  357. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +24 -1
  358. package/deps/rocksdb/rocksdb/table/table_builder.h +6 -2
  359. package/deps/rocksdb/rocksdb/table/table_properties.cc +6 -0
  360. package/deps/rocksdb/rocksdb/table/table_reader.h +6 -0
  361. package/deps/rocksdb/rocksdb/table/table_test.cc +52 -22
  362. package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +31 -0
  363. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +2 -1
  364. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +19 -7
  365. package/deps/rocksdb/rocksdb/test_util/sync_point.h +3 -1
  366. package/deps/rocksdb/rocksdb/test_util/testutil.cc +29 -0
  367. package/deps/rocksdb/rocksdb/test_util/testutil.h +19 -0
  368. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +3 -3
  369. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +87 -65
  370. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +221 -33
  371. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +36 -0
  372. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -1
  373. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +1 -0
  374. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +33 -11
  375. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +4 -0
  376. package/deps/rocksdb/rocksdb/unreleased_history/README.txt +73 -0
  377. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +27 -0
  378. package/deps/rocksdb/rocksdb/unreleased_history/behavior_changes/.gitkeep +0 -0
  379. package/deps/rocksdb/rocksdb/unreleased_history/bug_fixes/.gitkeep +0 -0
  380. package/deps/rocksdb/rocksdb/unreleased_history/new_features/.gitkeep +0 -0
  381. package/deps/rocksdb/rocksdb/unreleased_history/performance_improvements/.gitkeep +0 -0
  382. package/deps/rocksdb/rocksdb/unreleased_history/public_api_changes/.gitkeep +0 -0
  383. package/deps/rocksdb/rocksdb/unreleased_history/release.sh +104 -0
  384. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +5 -0
  385. package/deps/rocksdb/rocksdb/util/bloom_impl.h +3 -3
  386. package/deps/rocksdb/rocksdb/util/bloom_test.cc +32 -11
  387. package/deps/rocksdb/rocksdb/util/cast_util.h +24 -0
  388. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -0
  389. package/deps/rocksdb/rocksdb/util/comparator.cc +55 -8
  390. package/deps/rocksdb/rocksdb/util/compression.cc +4 -4
  391. package/deps/rocksdb/rocksdb/util/compression.h +119 -35
  392. package/deps/rocksdb/rocksdb/util/core_local.h +2 -1
  393. package/deps/rocksdb/rocksdb/util/crc32c.cc +7 -1
  394. package/deps/rocksdb/rocksdb/util/distributed_mutex.h +1 -1
  395. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +4 -4
  396. package/deps/rocksdb/rocksdb/util/filelock_test.cc +3 -0
  397. package/deps/rocksdb/rocksdb/util/hash.h +7 -3
  398. package/deps/rocksdb/rocksdb/util/hash_test.cc +44 -0
  399. package/deps/rocksdb/rocksdb/util/math.h +58 -6
  400. package/deps/rocksdb/rocksdb/util/math128.h +29 -7
  401. package/deps/rocksdb/rocksdb/util/mutexlock.h +35 -27
  402. package/deps/rocksdb/rocksdb/util/overload.h +23 -0
  403. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +53 -18
  404. package/deps/rocksdb/rocksdb/util/rate_limiter_impl.h +6 -1
  405. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +90 -19
  406. package/deps/rocksdb/rocksdb/util/single_thread_executor.h +1 -0
  407. package/deps/rocksdb/rocksdb/util/slice_test.cc +30 -0
  408. package/deps/rocksdb/rocksdb/util/status.cc +1 -0
  409. package/deps/rocksdb/rocksdb/util/stop_watch.h +1 -1
  410. package/deps/rocksdb/rocksdb/util/string_util.cc +39 -0
  411. package/deps/rocksdb/rocksdb/util/string_util.h +10 -0
  412. package/deps/rocksdb/rocksdb/util/thread_operation.h +10 -1
  413. package/deps/rocksdb/rocksdb/util/udt_util.cc +385 -0
  414. package/deps/rocksdb/rocksdb/util/udt_util.h +192 -1
  415. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +461 -0
  416. package/deps/rocksdb/rocksdb/util/write_batch_util.cc +25 -0
  417. package/deps/rocksdb/rocksdb/util/write_batch_util.h +80 -0
  418. package/deps/rocksdb/rocksdb/util/xxhash.h +0 -3
  419. package/deps/rocksdb/rocksdb/util/xxph3.h +0 -4
  420. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -4
  421. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +71 -26
  422. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +7 -6
  423. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +1 -1
  424. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +2 -3
  425. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +6 -11
  426. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -2
  427. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +4 -5
  428. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.h +1 -0
  429. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +20 -16
  430. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +11 -7
  431. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +2 -2
  432. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +7 -1
  433. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +3 -0
  434. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +12 -3
  435. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +1 -2
  436. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +7 -4
  437. package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +2 -3
  438. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +2 -2
  439. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
  440. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README +13 -0
  441. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +23 -8
  442. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +9 -6
  443. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +37 -12
  444. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +272 -33
  445. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +15 -9
  446. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +4 -1
  447. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +76 -20
  448. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +18 -9
  449. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +195 -23
  450. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +19 -12
  451. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +88 -1
  452. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -1
  453. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +43 -17
  454. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +6 -3
  455. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +73 -24
  456. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +19 -4
  457. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +60 -107
  458. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +41 -12
  459. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +6 -3
  460. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +15 -8
  461. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.h +1 -1
  462. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +10 -5
  463. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  464. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +1 -1
  465. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +59 -28
  466. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +127 -120
  467. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +129 -59
  468. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +111 -14
  469. package/deps/rocksdb/rocksdb.gyp +6 -2
  470. package/index.js +0 -8
  471. package/package.json +1 -1
  472. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  473. package/prebuilds/linux-x64/node.napi.node +0 -0
  474. package/deps/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake +0 -7
  475. package/deps/rocksdb/rocksdb/cmake/modules/FindJeMalloc.cmake +0 -29
  476. package/deps/rocksdb/rocksdb/cmake/modules/FindNUMA.cmake +0 -29
  477. package/deps/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake +0 -29
  478. package/deps/rocksdb/rocksdb/cmake/modules/FindTBB.cmake +0 -33
  479. package/deps/rocksdb/rocksdb/cmake/modules/Findgflags.cmake +0 -29
  480. package/deps/rocksdb/rocksdb/cmake/modules/Findlz4.cmake +0 -29
  481. package/deps/rocksdb/rocksdb/cmake/modules/Finduring.cmake +0 -26
  482. package/deps/rocksdb/rocksdb/cmake/modules/Findzstd.cmake +0 -29
  483. package/deps/rocksdb/rocksdb/cmake/modules/ReadVersion.cmake +0 -10
@@ -21,7 +21,9 @@
21
21
  #include "monitoring/thread_status_util.h"
22
22
  #include "test_util/sync_point.h"
23
23
  #include "util/cast_util.h"
24
+ #include "util/coding.h"
24
25
  #include "util/concurrent_task_limiter_impl.h"
26
+ #include "util/udt_util.h"
25
27
 
26
28
  namespace ROCKSDB_NAMESPACE {
27
29
 
@@ -76,8 +78,43 @@ bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
76
78
  return false;
77
79
  }
78
80
 
81
+ bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT(
82
+ const FlushRequest& flush_req) {
83
+ mutex_.AssertHeld();
84
+ assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1);
85
+ ColumnFamilyData* cfd = flush_req.cfd_to_max_mem_id_to_persist.begin()->first;
86
+ uint64_t max_memtable_id =
87
+ flush_req.cfd_to_max_mem_id_to_persist.begin()->second;
88
+ if (cfd->IsDropped() ||
89
+ !cfd->ShouldPostponeFlushToRetainUDT(max_memtable_id)) {
90
+ return false;
91
+ }
92
+ // Check if holding on the flush will cause entering write stall mode.
93
+ // Write stall entered because of the accumulation of write buffers can be
94
+ // alleviated if we continue with the flush instead of postponing it.
95
+ const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
96
+
97
+ // Taking the status of the active Memtable into consideration so that we are
98
+ // not just checking if DB is currently already in write stall mode.
99
+ int mem_to_flush = cfd->mem()->ApproximateMemoryUsageFast() >=
100
+ cfd->mem()->write_buffer_size() / 2
101
+ ? 1
102
+ : 0;
103
+ WriteStallCondition write_stall =
104
+ ColumnFamilyData::GetWriteStallConditionAndCause(
105
+ cfd->imm()->NumNotFlushed() + mem_to_flush, /*num_l0_files=*/0,
106
+ /*num_compaction_needed_bytes=*/0, mutable_cf_options,
107
+ *cfd->ioptions())
108
+ .first;
109
+ if (write_stall != WriteStallCondition::kNormal) {
110
+ return false;
111
+ }
112
+ return true;
113
+ }
114
+
79
115
  IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
80
- VersionEdit* synced_wals) {
116
+ VersionEdit* synced_wals,
117
+ bool error_recovery_in_prog) {
81
118
  TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
82
119
  InstrumentedMutexLock l(&log_write_mutex_);
83
120
  autovector<log::Writer*, 1> logs_to_sync;
@@ -103,7 +140,7 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
103
140
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
104
141
  "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
105
142
  log->get_log_number());
106
- if (error_handler_.IsRecoveryInProgress()) {
143
+ if (error_recovery_in_prog) {
107
144
  log->file()->reset_seen_error();
108
145
  }
109
146
  io_s = log->file()->Sync(immutable_db_options_.use_fsync);
@@ -112,7 +149,7 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
112
149
  }
113
150
 
114
151
  if (immutable_db_options_.recycle_log_file_num > 0) {
115
- if (error_handler_.IsRecoveryInProgress()) {
152
+ if (error_recovery_in_prog) {
116
153
  log->file()->reset_seen_error();
117
154
  }
118
155
  io_s = log->Close();
@@ -186,9 +223,10 @@ Status DBImpl::FlushMemTableToOutputFile(
186
223
  // `snapshot_seqs` has already been computed before this function starts.
187
224
  // Recording the max memtable ID ensures that the flush job does not flush
188
225
  // a memtable without knowing such snapshot(s).
189
- uint64_t max_memtable_id = needs_to_sync_closed_wals
190
- ? cfd->imm()->GetLatestMemTableID()
191
- : std::numeric_limits<uint64_t>::max();
226
+ uint64_t max_memtable_id =
227
+ needs_to_sync_closed_wals
228
+ ? cfd->imm()->GetLatestMemTableID(false /* for_atomic_flush */)
229
+ : std::numeric_limits<uint64_t>::max();
192
230
 
193
231
  // If needs_to_sync_closed_wals is false, then the flush job will pick ALL
194
232
  // existing memtables of the column family when PickMemTable() is called
@@ -197,7 +235,7 @@ Status DBImpl::FlushMemTableToOutputFile(
197
235
  // releases and re-acquires the db mutex. In the meantime, the application
198
236
  // can still insert into the memtables and increase the db's sequence number.
199
237
  // The application can take a snapshot, hoping that the latest visible state
200
- // to this snapshto is preserved. This is hard to guarantee since db mutex
238
+ // to this snapshot is preserved. This is hard to guarantee since db mutex
201
239
  // not held. This newly-created snapshot is not included in `snapshot_seqs`
202
240
  // and the flush job is unaware of its presence. Consequently, the flush job
203
241
  // may drop certain keys when generating the L0, causing incorrect data to be
@@ -214,7 +252,7 @@ Status DBImpl::FlushMemTableToOutputFile(
214
252
  GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
215
253
  &event_logger_, mutable_cf_options.report_bg_io_stats,
216
254
  true /* sync_output_directory */, true /* write_manifest */, thread_pri,
217
- io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
255
+ io_tracer_, seqno_to_time_mapping_, db_id_, db_session_id_,
218
256
  cfd->GetFullHistoryTsLow(), &blob_callback_);
219
257
  FileMetaData file_meta;
220
258
 
@@ -225,8 +263,10 @@ Status DBImpl::FlushMemTableToOutputFile(
225
263
  // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
226
264
  // times.
227
265
  VersionEdit synced_wals;
266
+ bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress();
228
267
  mutex_.Unlock();
229
- log_io_s = SyncClosedLogs(job_context, &synced_wals);
268
+ log_io_s =
269
+ SyncClosedLogs(job_context, &synced_wals, error_recovery_in_prog);
230
270
  mutex_.Lock();
231
271
  if (log_io_s.ok() && synced_wals.IsWalAddition()) {
232
272
  const ReadOptions read_options(Env::IOActivity::kFlush);
@@ -248,6 +288,24 @@ Status DBImpl::FlushMemTableToOutputFile(
248
288
  // If the log sync failed, we do not need to pick memtable. Otherwise,
249
289
  // num_flush_not_started_ needs to be rollback.
250
290
  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
291
+ // Exit a flush due to bg error should not set bg error again.
292
+ bool skip_set_bg_error = false;
293
+ if (s.ok() && !error_handler_.GetBGError().ok() &&
294
+ error_handler_.IsBGWorkStopped() &&
295
+ flush_reason != FlushReason::kErrorRecovery &&
296
+ flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
297
+ // Error recovery in progress, should not pick memtable which excludes
298
+ // them from being picked up by recovery flush.
299
+ // This ensures that when bg error is set, no new flush can pick
300
+ // memtables.
301
+ skip_set_bg_error = true;
302
+ s = error_handler_.GetBGError();
303
+ assert(!s.ok());
304
+ ROCKS_LOG_BUFFER(log_buffer,
305
+ "[JOB %d] Skip flush due to background error %s",
306
+ job_context->job_id, s.ToString().c_str());
307
+ }
308
+
251
309
  if (s.ok()) {
252
310
  flush_job.PickMemTable();
253
311
  need_cancel = true;
@@ -268,7 +326,8 @@ Status DBImpl::FlushMemTableToOutputFile(
268
326
  // is unlocked by the current thread.
269
327
  if (s.ok()) {
270
328
  s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
271
- &switched_to_mempurge);
329
+ &switched_to_mempurge, &skip_set_bg_error,
330
+ &error_handler_);
272
331
  need_cancel = false;
273
332
  }
274
333
 
@@ -309,7 +368,8 @@ Status DBImpl::FlushMemTableToOutputFile(
309
368
  }
310
369
  }
311
370
 
312
- if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
371
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
372
+ !skip_set_bg_error) {
313
373
  if (log_io_s.ok()) {
314
374
  // Error while writing to MANIFEST.
315
375
  // In fact, versions_->io_status() can also be the result of renaming
@@ -466,7 +526,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
466
526
  GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
467
527
  &event_logger_, mutable_cf_options.report_bg_io_stats,
468
528
  false /* sync_output_directory */, false /* write_manifest */,
469
- thread_pri, io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
529
+ thread_pri, io_tracer_, seqno_to_time_mapping_, db_id_, db_session_id_,
470
530
  cfd->GetFullHistoryTsLow(), &blob_callback_));
471
531
  }
472
532
 
@@ -490,8 +550,10 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
490
550
  // TODO (yanqin) investigate whether we should sync the closed logs for
491
551
  // single column family case.
492
552
  VersionEdit synced_wals;
553
+ bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress();
493
554
  mutex_.Unlock();
494
- log_io_s = SyncClosedLogs(job_context, &synced_wals);
555
+ log_io_s =
556
+ SyncClosedLogs(job_context, &synced_wals, error_recovery_in_prog);
495
557
  mutex_.Lock();
496
558
  if (log_io_s.ok() && synced_wals.IsWalAddition()) {
497
559
  const ReadOptions read_options(Env::IOActivity::kFlush);
@@ -521,6 +583,21 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
521
583
  pick_status.push_back(false);
522
584
  }
523
585
 
586
+ bool flush_for_recovery =
587
+ bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery ||
588
+ bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecoveryRetryFlush;
589
+ bool skip_set_bg_error = false;
590
+
591
+ if (s.ok() && !error_handler_.GetBGError().ok() &&
592
+ error_handler_.IsBGWorkStopped() && !flush_for_recovery) {
593
+ s = error_handler_.GetBGError();
594
+ skip_set_bg_error = true;
595
+ assert(!s.ok());
596
+ ROCKS_LOG_BUFFER(log_buffer,
597
+ "[JOB %d] Skip flush due to background error %s",
598
+ job_context->job_id, s.ToString().c_str());
599
+ }
600
+
524
601
  if (s.ok()) {
525
602
  for (int i = 0; i != num_cfs; ++i) {
526
603
  jobs[i]->PickMemTable();
@@ -585,7 +662,10 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
585
662
  }
586
663
  }
587
664
  }
588
- } else {
665
+ } else if (!skip_set_bg_error) {
666
+ // When `skip_set_bg_error` is true, no memtable is picked so
667
+ // there is no need to call Cancel() or RollbackMemtableFlush().
668
+ //
589
669
  // Need to undo atomic flush if something went wrong, i.e. s is not OK and
590
670
  // it is not because of CF drop.
591
671
  // Have to cancel the flush jobs that have NOT executed because we need to
@@ -598,8 +678,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
598
678
  for (int i = 0; i != num_cfs; ++i) {
599
679
  if (exec_status[i].second.ok() && exec_status[i].first) {
600
680
  auto& mems = jobs[i]->GetMemTables();
601
- cfds[i]->imm()->RollbackMemtableFlush(mems,
602
- file_meta[i].fd.GetNumber());
681
+ cfds[i]->imm()->RollbackMemtableFlush(
682
+ mems, /*rollback_succeeding_memtables=*/false);
603
683
  }
604
684
  }
605
685
  }
@@ -641,10 +721,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
641
721
  };
642
722
 
643
723
  bool resuming_from_bg_err =
644
- error_handler_.IsDBStopped() ||
645
- (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery ||
646
- bg_flush_args[0].flush_reason_ ==
647
- FlushReason::kErrorRecoveryRetryFlush);
724
+ error_handler_.IsDBStopped() || flush_for_recovery;
648
725
  while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) {
649
726
  std::pair<Status, bool> res = wait_to_install_func();
650
727
 
@@ -655,15 +732,27 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
655
732
  s = res.first;
656
733
  break;
657
734
  } else if (!res.second) {
735
+ // we are the oldest immutable memtable
736
+ break;
737
+ }
738
+ // We are not the oldest immutable memtable
739
+ TEST_SYNC_POINT_CALLBACK(
740
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitCV", &res);
741
+ //
742
+ // If bg work is stopped, recovery thread first calls
743
+ // WaitForBackgroundWork() before proceeding to flush for recovery. This
744
+ // flush can block WaitForBackgroundWork() while waiting for recovery
745
+ // flush to install result. To avoid this deadlock, we should abort here
746
+ // if there is background error.
747
+ if (!flush_for_recovery && error_handler_.IsBGWorkStopped() &&
748
+ !error_handler_.GetBGError().ok()) {
749
+ s = error_handler_.GetBGError();
750
+ assert(!s.ok());
658
751
  break;
659
752
  }
660
753
  atomic_flush_install_cv_.Wait();
661
754
 
662
- resuming_from_bg_err =
663
- error_handler_.IsDBStopped() ||
664
- (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery ||
665
- bg_flush_args[0].flush_reason_ ==
666
- FlushReason::kErrorRecoveryRetryFlush);
755
+ resuming_from_bg_err = error_handler_.IsDBStopped() || flush_for_recovery;
667
756
  }
668
757
 
669
758
  if (!resuming_from_bg_err) {
@@ -679,6 +768,17 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
679
768
  // installation.
680
769
  s = error_handler_.GetRecoveryError();
681
770
  }
771
+ // Since we are not installing these memtables, need to rollback
772
+ // to allow future flush job to pick up these memtables.
773
+ if (!s.ok()) {
774
+ for (int i = 0; i != num_cfs; ++i) {
775
+ assert(exec_status[i].first);
776
+ assert(exec_status[i].second.ok());
777
+ auto& mems = jobs[i]->GetMemTables();
778
+ cfds[i]->imm()->RollbackMemtableFlush(
779
+ mems, /*rollback_succeeding_memtables=*/false);
780
+ }
781
+ }
682
782
  }
683
783
 
684
784
  if (s.ok()) {
@@ -782,7 +882,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
782
882
 
783
883
  // Need to undo atomic flush if something went wrong, i.e. s is not OK and
784
884
  // it is not because of CF drop.
785
- if (!s.ok() && !s.IsColumnFamilyDropped()) {
885
+ if (!s.ok() && !s.IsColumnFamilyDropped() && !skip_set_bg_error) {
786
886
  if (log_io_s.ok()) {
787
887
  // Error while writing to MANIFEST.
788
888
  // In fact, versions_->io_status() can also be the result of renaming
@@ -852,8 +952,8 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
852
952
  }
853
953
  }
854
954
  mutex_.Lock();
855
- // no need to signal bg_cv_ as it will be signaled at the end of the
856
- // flush process.
955
+ // no need to signal bg_cv_ as it will be signaled at the end of the
956
+ // flush process.
857
957
  }
858
958
 
859
959
  void DBImpl::NotifyOnFlushCompleted(
@@ -912,26 +1012,14 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options,
912
1012
  end_without_ts, "" /*trim_ts*/);
913
1013
  }
914
1014
 
915
- std::string begin_str;
916
- std::string end_str;
1015
+ std::string begin_str, end_str;
1016
+ auto [begin, end] =
1017
+ MaybeAddTimestampsToRange(begin_without_ts, end_without_ts, ts_sz,
1018
+ &begin_str, &end_str, false /*exclusive_end*/);
917
1019
 
918
- // CompactRange compact all keys: [begin, end] inclusively. Add maximum
919
- // timestamp to include all `begin` keys, and add minimal timestamp to include
920
- // all `end` keys.
921
- if (begin_without_ts != nullptr) {
922
- AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz);
923
- }
924
- if (end_without_ts != nullptr) {
925
- AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz);
926
- }
927
- Slice begin(begin_str);
928
- Slice end(end_str);
929
-
930
- Slice* begin_with_ts = begin_without_ts ? &begin : nullptr;
931
- Slice* end_with_ts = end_without_ts ? &end : nullptr;
932
-
933
- return CompactRangeInternal(options, column_family, begin_with_ts,
934
- end_with_ts, "" /*trim_ts*/);
1020
+ return CompactRangeInternal(
1021
+ options, column_family, begin.has_value() ? &begin.value() : nullptr,
1022
+ end.has_value() ? &end.value() : nullptr, "" /*trim_ts*/);
935
1023
  }
936
1024
 
937
1025
  Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
@@ -1066,7 +1154,6 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1066
1154
  std::numeric_limits<uint64_t>::max(), trim_ts);
1067
1155
  } else {
1068
1156
  int first_overlapped_level = kInvalidLevel;
1069
- int max_overlapped_level = kInvalidLevel;
1070
1157
  {
1071
1158
  SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
1072
1159
  Version* current_version = super_version->current;
@@ -1142,10 +1229,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1142
1229
  begin, end);
1143
1230
  }
1144
1231
  if (overlap) {
1145
- if (first_overlapped_level == kInvalidLevel) {
1146
- first_overlapped_level = level;
1147
- }
1148
- max_overlapped_level = level;
1232
+ first_overlapped_level = level;
1233
+ break;
1149
1234
  }
1150
1235
  }
1151
1236
  CleanupSuperVersion(super_version);
@@ -1159,7 +1244,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1159
1244
  end, exclusive, true /* disallow_trivial_move */,
1160
1245
  std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
1161
1246
  trim_ts);
1162
- final_output_level = max_overlapped_level;
1247
+ final_output_level = first_overlapped_level;
1163
1248
  } else {
1164
1249
  assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel);
1165
1250
  uint64_t next_file_number = versions_->current_next_file_number();
@@ -1171,7 +1256,29 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1171
1256
  int level = first_overlapped_level;
1172
1257
  final_output_level = level;
1173
1258
  int output_level = 0, base_level = 0;
1174
- while (level < max_overlapped_level || level == 0) {
1259
+ for (;;) {
1260
+ // Always allow L0 -> L1 compaction
1261
+ if (level > 0) {
1262
+ if (cfd->ioptions()->level_compaction_dynamic_level_bytes) {
1263
+ assert(final_output_level < cfd->ioptions()->num_levels);
1264
+ if (final_output_level + 1 == cfd->ioptions()->num_levels) {
1265
+ break;
1266
+ }
1267
+ } else {
1268
+ // TODO(cbi): there is still a race condition here where
1269
+ // if a background compaction compacts some file beyond
1270
+ // current()->storage_info()->num_non_empty_levels() right after
1271
+ // the check here.This should happen very infrequently and should
1272
+ // not happen once a user populates the last level of the LSM.
1273
+ InstrumentedMutexLock l(&mutex_);
1274
+ // num_non_empty_levels may be lower after a compaction, so
1275
+ // we check for >= here.
1276
+ if (final_output_level + 1 >=
1277
+ cfd->current()->storage_info()->num_non_empty_levels()) {
1278
+ break;
1279
+ }
1280
+ }
1281
+ }
1175
1282
  output_level = level + 1;
1176
1283
  if (cfd->ioptions()->level_compaction_dynamic_level_bytes &&
1177
1284
  level == 0) {
@@ -1203,17 +1310,8 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1203
1310
  if (s.ok()) {
1204
1311
  assert(final_output_level > 0);
1205
1312
  // bottommost level intra-level compaction
1206
- // TODO(cbi): this preserves earlier behavior where if
1207
- // max_overlapped_level = 0 and bottommost_level_compaction is
1208
- // kIfHaveCompactionFilter, we only do a L0 -> LBase compaction
1209
- // and do not do intra-LBase compaction even when user configures
1210
- // compaction filter. We may want to still do a LBase -> LBase
1211
- // compaction in case there is some file in LBase that did not go
1212
- // through L0 -> LBase compaction, and hence did not go through
1213
- // compaction filter.
1214
1313
  if ((options.bottommost_level_compaction ==
1215
1314
  BottommostLevelCompaction::kIfHaveCompactionFilter &&
1216
- max_overlapped_level != 0 &&
1217
1315
  (cfd->ioptions()->compaction_filter != nullptr ||
1218
1316
  cfd->ioptions()->compaction_filter_factory != nullptr)) ||
1219
1317
  options.bottommost_level_compaction ==
@@ -1221,10 +1319,11 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
1221
1319
  options.bottommost_level_compaction ==
1222
1320
  BottommostLevelCompaction::kForce) {
1223
1321
  // Use `next_file_number` as `max_file_num_to_ignore` to avoid
1224
- // rewriting newly compacted files when it is kForceOptimized.
1322
+ // rewriting newly compacted files when it is kForceOptimized
1323
+ // or kIfHaveCompactionFilter with compaction filter set.
1225
1324
  s = RunManualCompaction(
1226
1325
  cfd, final_output_level, final_output_level, options, begin,
1227
- end, exclusive, !trim_ts.empty() /* disallow_trivial_move */,
1326
+ end, exclusive, true /* disallow_trivial_move */,
1228
1327
  next_file_number /* max_file_num_to_ignore */, trim_ts);
1229
1328
  }
1230
1329
  }
@@ -1375,6 +1474,14 @@ Status DBImpl::CompactFilesImpl(
1375
1474
  }
1376
1475
  }
1377
1476
 
1477
+ if (cfd->ioptions()->allow_ingest_behind &&
1478
+ output_level >= cfd->ioptions()->num_levels - 1) {
1479
+ return Status::InvalidArgument(
1480
+ "Exceed the maximum output level defined by "
1481
+ "the current compaction algorithm with ingest_behind --- " +
1482
+ std::to_string(cfd->ioptions()->num_levels - 1));
1483
+ }
1484
+
1378
1485
  Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
1379
1486
  &input_set, cf_meta, output_level);
1380
1487
  TEST_SYNC_POINT("DBImpl::CompactFilesImpl::PostSanitizeCompactionInputFiles");
@@ -1419,7 +1526,8 @@ Status DBImpl::CompactFilesImpl(
1419
1526
  // without releasing the lock, so we're guaranteed a compaction can be formed.
1420
1527
  assert(c != nullptr);
1421
1528
 
1422
- c->SetInputVersion(version);
1529
+ c->FinalizeInputInfo(version);
1530
+
1423
1531
  // deletion compaction currently not allowed in CompactFiles.
1424
1532
  assert(!c->deletion_compaction());
1425
1533
 
@@ -1469,7 +1577,12 @@ Status DBImpl::CompactFilesImpl(
1469
1577
  TEST_SYNC_POINT("CompactFilesImpl:3");
1470
1578
  mutex_.Lock();
1471
1579
 
1472
- Status status = compaction_job.Install(*c->mutable_cf_options());
1580
+ bool compaction_released = false;
1581
+ Status status =
1582
+ compaction_job.Install(*c->mutable_cf_options(), &compaction_released);
1583
+ if (!compaction_released) {
1584
+ c->ReleaseCompactionFiles(s);
1585
+ }
1473
1586
  if (status.ok()) {
1474
1587
  assert(compaction_job.io_status().ok());
1475
1588
  InstallSuperVersionAndScheduleWork(c->column_family_data(),
@@ -1480,7 +1593,6 @@ Status DBImpl::CompactFilesImpl(
1480
1593
  // not check compaction_job.io_status() explicitly if we're not calling
1481
1594
  // SetBGError
1482
1595
  compaction_job.io_status().PermitUncheckedError();
1483
- c->ReleaseCompactionFiles(s);
1484
1596
  // Need to make sure SstFileManager does its bookkeeping
1485
1597
  auto sfm = static_cast<SstFileManagerImpl*>(
1486
1598
  immutable_db_options_.sst_file_manager.get());
@@ -1492,7 +1604,7 @@ Status DBImpl::CompactFilesImpl(
1492
1604
 
1493
1605
  if (compaction_job_info != nullptr) {
1494
1606
  BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
1495
- job_context->job_id, version, compaction_job_info);
1607
+ job_context->job_id, compaction_job_info);
1496
1608
  }
1497
1609
 
1498
1610
  if (status.ok()) {
@@ -1589,21 +1701,18 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
1589
1701
  }
1590
1702
 
1591
1703
  c->SetNotifyOnCompactionCompleted();
1592
- Version* current = cfd->current();
1593
- current->Ref();
1594
1704
  // release lock while notifying events
1595
1705
  mutex_.Unlock();
1596
1706
  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
1597
1707
  {
1598
1708
  CompactionJobInfo info{};
1599
- BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info);
1709
+ BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, &info);
1600
1710
  for (auto listener : immutable_db_options_.listeners) {
1601
1711
  listener->OnCompactionBegin(this, info);
1602
1712
  }
1603
1713
  info.status.PermitUncheckedError();
1604
1714
  }
1605
1715
  mutex_.Lock();
1606
- current->Unref();
1607
1716
  }
1608
1717
 
1609
1718
  void DBImpl::NotifyOnCompactionCompleted(
@@ -1621,21 +1730,17 @@ void DBImpl::NotifyOnCompactionCompleted(
1621
1730
  return;
1622
1731
  }
1623
1732
 
1624
- Version* current = cfd->current();
1625
- current->Ref();
1626
1733
  // release lock while notifying events
1627
1734
  mutex_.Unlock();
1628
1735
  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
1629
1736
  {
1630
1737
  CompactionJobInfo info{};
1631
- BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
1632
- &info);
1738
+ BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, &info);
1633
1739
  for (auto listener : immutable_db_options_.listeners) {
1634
1740
  listener->OnCompactionCompleted(this, info);
1635
1741
  }
1636
1742
  }
1637
1743
  mutex_.Lock();
1638
- current->Unref();
1639
1744
  // no need to signal bg_cv_ as it will be signaled at the end of the
1640
1745
  // flush process.
1641
1746
  }
@@ -1758,7 +1863,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
1758
1863
  f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
1759
1864
  f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
1760
1865
  f->file_checksum, f->file_checksum_func_name, f->unique_id,
1761
- f->compensated_range_deletion_size, f->tail_size);
1866
+ f->compensated_range_deletion_size, f->tail_size,
1867
+ f->user_defined_timestamps_persisted);
1762
1868
  }
1763
1869
  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
1764
1870
  "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@@ -1808,6 +1914,37 @@ int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
1808
1914
  ->mutable_cf_options.level0_stop_writes_trigger;
1809
1915
  }
1810
1916
 
1917
+ Status DBImpl::FlushAllColumnFamilies(const FlushOptions& flush_options,
1918
+ FlushReason flush_reason) {
1919
+ mutex_.AssertHeld();
1920
+ Status status;
1921
+ if (immutable_db_options_.atomic_flush) {
1922
+ mutex_.Unlock();
1923
+ status = AtomicFlushMemTables(flush_options, flush_reason);
1924
+ if (status.IsColumnFamilyDropped()) {
1925
+ status = Status::OK();
1926
+ }
1927
+ mutex_.Lock();
1928
+ } else {
1929
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
1930
+ if (cfd->IsDropped()) {
1931
+ continue;
1932
+ }
1933
+ mutex_.Unlock();
1934
+ status = FlushMemTable(cfd, flush_options, flush_reason);
1935
+ TEST_SYNC_POINT("DBImpl::FlushAllColumnFamilies:1");
1936
+ TEST_SYNC_POINT("DBImpl::FlushAllColumnFamilies:2");
1937
+ mutex_.Lock();
1938
+ if (!status.ok() && !status.IsColumnFamilyDropped()) {
1939
+ break;
1940
+ } else if (status.IsColumnFamilyDropped()) {
1941
+ status = Status::OK();
1942
+ }
1943
+ }
1944
+ }
1945
+ return status;
1946
+ }
1947
+
1811
1948
  Status DBImpl::Flush(const FlushOptions& flush_options,
1812
1949
  ColumnFamilyHandle* column_family) {
1813
1950
  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
@@ -2099,7 +2236,8 @@ void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
2099
2236
  // cfd may be null, see DBImpl::ScheduleFlushes
2100
2237
  continue;
2101
2238
  }
2102
- uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID();
2239
+ uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID(
2240
+ immutable_db_options_.atomic_flush /* for_atomic_flush */);
2103
2241
  req->cfd_to_max_mem_id_to_persist.emplace(cfd, max_memtable_id);
2104
2242
  }
2105
2243
  }
@@ -2143,15 +2281,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
2143
2281
  }
2144
2282
  WaitForPendingWrites();
2145
2283
 
2146
- if (flush_reason != FlushReason::kErrorRecoveryRetryFlush &&
2147
- (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) {
2148
- // Note that, when flush reason is kErrorRecoveryRetryFlush, during the
2149
- // auto retry resume, we want to avoid creating new small memtables.
2150
- // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl
2151
- // will iterate through all the CFs and call FlushMemtable during auto
2152
- // retry resume, it is possible that in some CFs,
2153
- // cfd->imm()->NumNotFlushed() = 0. In this case, so no flush request will
2154
- // be created and scheduled, status::OK() will be returned.
2284
+ if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
2155
2285
  s = SwitchMemtable(cfd, &context);
2156
2286
  }
2157
2287
  const uint64_t flush_memtable_id = std::numeric_limits<uint64_t>::max();
@@ -2160,10 +2290,10 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
2160
2290
  !cached_recoverable_state_empty_.load()) {
2161
2291
  FlushRequest req{flush_reason, {{cfd, flush_memtable_id}}};
2162
2292
  flush_reqs.emplace_back(std::move(req));
2163
- memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID());
2293
+ memtable_ids_to_wait.emplace_back(
2294
+ cfd->imm()->GetLatestMemTableID(false /* for_atomic_flush */));
2164
2295
  }
2165
- if (immutable_db_options_.persist_stats_to_disk &&
2166
- flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
2296
+ if (immutable_db_options_.persist_stats_to_disk) {
2167
2297
  ColumnFamilyData* cfd_stats =
2168
2298
  versions_->GetColumnFamilySet()->GetColumnFamily(
2169
2299
  kPersistentStatsColumnFamilyName);
@@ -2189,7 +2319,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
2189
2319
  FlushRequest req{flush_reason, {{cfd_stats, flush_memtable_id}}};
2190
2320
  flush_reqs.emplace_back(std::move(req));
2191
2321
  memtable_ids_to_wait.emplace_back(
2192
- cfd_stats->imm()->GetLatestMemTableID());
2322
+ cfd_stats->imm()->GetLatestMemTableID(
2323
+ false /* for_atomic_flush */));
2193
2324
  }
2194
2325
  }
2195
2326
  }
@@ -2240,8 +2371,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
2240
2371
  }
2241
2372
  s = WaitForFlushMemTables(
2242
2373
  cfds, flush_memtable_ids,
2243
- (flush_reason == FlushReason::kErrorRecovery ||
2244
- flush_reason == FlushReason::kErrorRecoveryRetryFlush));
2374
+ flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */);
2245
2375
  InstrumentedMutexLock lock_guard(&mutex_);
2246
2376
  for (auto* tmp_cfd : cfds) {
2247
2377
  tmp_cfd->UnrefAndTryDelete();
@@ -2336,8 +2466,7 @@ Status DBImpl::AtomicFlushMemTables(
2336
2466
  }
2337
2467
 
2338
2468
  for (auto cfd : cfds) {
2339
- if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) ||
2340
- flush_reason == FlushReason::kErrorRecoveryRetryFlush) {
2469
+ if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
2341
2470
  continue;
2342
2471
  }
2343
2472
  cfd->Ref();
@@ -2382,8 +2511,7 @@ Status DBImpl::AtomicFlushMemTables(
2382
2511
  }
2383
2512
  s = WaitForFlushMemTables(
2384
2513
  cfds, flush_memtable_ids,
2385
- (flush_reason == FlushReason::kErrorRecovery ||
2386
- flush_reason == FlushReason::kErrorRecoveryRetryFlush));
2514
+ flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */);
2387
2515
  InstrumentedMutexLock lock_guard(&mutex_);
2388
2516
  for (auto* cfd : cfds) {
2389
2517
  cfd->UnrefAndTryDelete();
@@ -2392,6 +2520,68 @@ Status DBImpl::AtomicFlushMemTables(
2392
2520
  return s;
2393
2521
  }
2394
2522
 
2523
+ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
2524
+ bool wait) {
2525
+ mutex_.AssertHeld();
2526
+ assert(flush_reason == FlushReason::kErrorRecoveryRetryFlush ||
2527
+ flush_reason == FlushReason::kCatchUpAfterErrorRecovery);
2528
+
2529
+ // Collect referenced CFDs.
2530
+ autovector<ColumnFamilyData*> cfds;
2531
+ for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
2532
+ if (!cfd->IsDropped() && cfd->initialized() &&
2533
+ cfd->imm()->NumNotFlushed() != 0) {
2534
+ cfd->Ref();
2535
+ cfd->imm()->FlushRequested();
2536
+ cfds.push_back(cfd);
2537
+ }
2538
+ }
2539
+
2540
+ // Submit flush requests for all immutable memtables needing flush.
2541
+ // `flush_memtable_ids` will be populated such that all immutable
2542
+ // memtables eligible for flush are waited on before this function
2543
+ // returns.
2544
+ autovector<uint64_t> flush_memtable_ids;
2545
+ if (immutable_db_options_.atomic_flush) {
2546
+ FlushRequest flush_req;
2547
+ GenerateFlushRequest(cfds, flush_reason, &flush_req);
2548
+ SchedulePendingFlush(flush_req);
2549
+ for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) {
2550
+ flush_memtable_ids.push_back(iter.second);
2551
+ }
2552
+ } else {
2553
+ for (auto cfd : cfds) {
2554
+ flush_memtable_ids.push_back(
2555
+ cfd->imm()->GetLatestMemTableID(false /* for_atomic_flush */));
2556
+ // Impose no bound on the highest memtable ID flushed. There is no
2557
+ // reason to do so outside of atomic flush.
2558
+ FlushRequest flush_req{
2559
+ flush_reason,
2560
+ {{cfd,
2561
+ std::numeric_limits<uint64_t>::max() /* max_mem_id_to_persist */}}};
2562
+ SchedulePendingFlush(flush_req);
2563
+ }
2564
+ }
2565
+ MaybeScheduleFlushOrCompaction();
2566
+
2567
+ Status s;
2568
+ if (wait) {
2569
+ mutex_.Unlock();
2570
+ autovector<const uint64_t*> flush_memtable_id_ptrs;
2571
+ for (auto& flush_memtable_id : flush_memtable_ids) {
2572
+ flush_memtable_id_ptrs.push_back(&flush_memtable_id);
2573
+ }
2574
+ s = WaitForFlushMemTables(cfds, flush_memtable_id_ptrs,
2575
+ true /* resuming_from_bg_err */);
2576
+ mutex_.Lock();
2577
+ }
2578
+
2579
+ for (auto* cfd : cfds) {
2580
+ cfd->UnrefAndTryDelete();
2581
+ }
2582
+ return s;
2583
+ }
2584
+
2395
2585
  // Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
2396
2586
  // cause write stall, for example if one memtable is being flushed already.
2397
2587
  // This method tries to avoid write stall (similar to CompactRange() behavior)
@@ -2455,8 +2645,11 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
2455
2645
  // check whether one extra immutable memtable or an extra L0 file would
2456
2646
  // cause write stalling mode to be entered. It could still enter stall
2457
2647
  // mode due to pending compaction bytes, but that's less common
2648
+ // No extra immutable Memtable will be created if the current Memtable is
2649
+ // empty.
2650
+ int mem_to_flush = cfd->mem()->IsEmpty() ? 0 : 1;
2458
2651
  write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause(
2459
- cfd->imm()->NumNotFlushed() + 1,
2652
+ cfd->imm()->NumNotFlushed() + mem_to_flush,
2460
2653
  vstorage->l0_delay_trigger_count() + 1,
2461
2654
  vstorage->estimated_compaction_needed_bytes(),
2462
2655
  mutable_cf_options, *cfd->ioptions())
@@ -2602,6 +2795,11 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
2602
2795
  // There has been a hard error and this call is not part of the recovery
2603
2796
  // sequence. Bail out here so we don't get into an endless loop of
2604
2797
  // scheduling BG work which will again call this function
2798
+ //
2799
+ // Note that a non-recovery flush can still be scheduled if
2800
+ // error_handler_.IsRecoveryInProgress() returns true. We rely on
2801
+ // BackgroundCallFlush() to check flush reason and drop non-recovery
2802
+ // flushes.
2605
2803
  return;
2606
2804
  } else if (shutting_down_.load(std::memory_order_acquire)) {
2607
2805
  // DB is being deleted; no more background compactions
@@ -2612,6 +2810,9 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
2612
2810
  env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
2613
2811
  while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
2614
2812
  bg_flush_scheduled_ < bg_job_limits.max_flushes) {
2813
+ TEST_SYNC_POINT_CALLBACK(
2814
+ "DBImpl::MaybeScheduleFlushOrCompaction:BeforeSchedule",
2815
+ &unscheduled_flushes_);
2615
2816
  bg_flush_scheduled_++;
2616
2817
  FlushThreadArg* fta = new FlushThreadArg;
2617
2818
  fta->db_ = this;
@@ -2721,7 +2922,7 @@ ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
2721
2922
 
2722
2923
  DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
2723
2924
  assert(!flush_queue_.empty());
2724
- FlushRequest flush_req = flush_queue_.front();
2925
+ FlushRequest flush_req = std::move(flush_queue_.front());
2725
2926
  flush_queue_.pop_front();
2726
2927
  if (!immutable_db_options_.atomic_flush) {
2727
2928
  assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1);
@@ -2765,6 +2966,9 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue(
2765
2966
 
2766
2967
  void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
2767
2968
  mutex_.AssertHeld();
2969
+ if (reject_new_background_jobs_) {
2970
+ return;
2971
+ }
2768
2972
  if (flush_req.cfd_to_max_mem_id_to_persist.empty()) {
2769
2973
  return;
2770
2974
  }
@@ -2794,6 +2998,9 @@ void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
2794
2998
 
2795
2999
  void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
2796
3000
  mutex_.AssertHeld();
3001
+ if (reject_new_background_jobs_) {
3002
+ return;
3003
+ }
2797
3004
  if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
2798
3005
  AddToCompactionQueue(cfd);
2799
3006
  ++unscheduled_compactions_;
@@ -2803,6 +3010,9 @@ void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
2803
3010
  void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
2804
3011
  FileType type, uint64_t number, int job_id) {
2805
3012
  mutex_.AssertHeld();
3013
+ if (reject_new_background_jobs_) {
3014
+ return;
3015
+ }
2806
3016
  PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
2807
3017
  purge_files_.insert({{number, std::move(file_info)}});
2808
3018
  }
@@ -2891,6 +3101,7 @@ void DBImpl::UnscheduleFlushCallback(void* arg) {
2891
3101
 
2892
3102
  Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
2893
3103
  LogBuffer* log_buffer, FlushReason* reason,
3104
+ bool* flush_rescheduled_to_retain_udt,
2894
3105
  Env::Priority thread_pri) {
2895
3106
  mutex_.AssertHeld();
2896
3107
 
@@ -2916,14 +3127,61 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
2916
3127
  autovector<ColumnFamilyData*> column_families_not_to_flush;
2917
3128
  while (!flush_queue_.empty()) {
2918
3129
  // This cfd is already referenced
2919
- const FlushRequest& flush_req = PopFirstFromFlushQueue();
3130
+ FlushRequest flush_req = PopFirstFromFlushQueue();
2920
3131
  FlushReason flush_reason = flush_req.flush_reason;
3132
+ if (!error_handler_.GetBGError().ok() && error_handler_.IsBGWorkStopped() &&
3133
+ flush_reason != FlushReason::kErrorRecovery &&
3134
+ flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
3135
+ // Stop non-recovery flush when bg work is stopped
3136
+ // Note that we drop the flush request here.
3137
+ // Recovery thread should schedule further flushes after bg error
3138
+ // is cleared.
3139
+ status = error_handler_.GetBGError();
3140
+ assert(!status.ok());
3141
+ ROCKS_LOG_BUFFER(log_buffer,
3142
+ "[JOB %d] Abort flush due to background error %s",
3143
+ job_context->job_id, status.ToString().c_str());
3144
+ *reason = flush_reason;
3145
+ for (auto item : flush_req.cfd_to_max_mem_id_to_persist) {
3146
+ item.first->UnrefAndTryDelete();
3147
+ }
3148
+ return status;
3149
+ }
3150
+ if (!immutable_db_options_.atomic_flush &&
3151
+ ShouldRescheduleFlushRequestToRetainUDT(flush_req)) {
3152
+ assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1);
3153
+ ColumnFamilyData* cfd =
3154
+ flush_req.cfd_to_max_mem_id_to_persist.begin()->first;
3155
+ if (cfd->UnrefAndTryDelete()) {
3156
+ return Status::OK();
3157
+ }
3158
+ ROCKS_LOG_BUFFER(log_buffer,
3159
+ "FlushRequest for column family %s is re-scheduled to "
3160
+ "retain user-defined timestamps.",
3161
+ cfd->GetName().c_str());
3162
+ // Reschedule the `FlushRequest` as is without checking dropped column
3163
+ // family etc. The follow-up job will do the check anyways, so save the
3164
+ // duplication. Column family is deduplicated by `SchdulePendingFlush` and
3165
+ // `PopFirstFromFlushQueue` contains at flush request enqueueing and
3166
+ // dequeueing time.
3167
+ // This flush request is rescheduled right after it's popped from the
3168
+ // queue while the db mutex is held, so there should be no other
3169
+ // FlushRequest for the same column family with higher `max_memtable_id`
3170
+ // in the queue to block the reschedule from succeeding.
3171
+ #ifndef NDEBUG
3172
+ flush_req.reschedule_count += 1;
3173
+ #endif /* !NDEBUG */
3174
+ SchedulePendingFlush(flush_req);
3175
+ *reason = flush_reason;
3176
+ *flush_rescheduled_to_retain_udt = true;
3177
+ return Status::TryAgain();
3178
+ }
2921
3179
  superversion_contexts.clear();
2922
3180
  superversion_contexts.reserve(
2923
3181
  flush_req.cfd_to_max_mem_id_to_persist.size());
2924
3182
 
2925
- for (const auto& iter : flush_req.cfd_to_max_mem_id_to_persist) {
2926
- ColumnFamilyData* cfd = iter.first;
3183
+ for (const auto& [cfd, max_memtable_id] :
3184
+ flush_req.cfd_to_max_mem_id_to_persist) {
2927
3185
  if (cfd->GetMempurgeUsed()) {
2928
3186
  // If imm() contains silent memtables (e.g.: because
2929
3187
  // MemPurge was activated), requesting a flush will
@@ -2937,10 +3195,16 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
2937
3195
  continue;
2938
3196
  }
2939
3197
  superversion_contexts.emplace_back(SuperVersionContext(true));
2940
- bg_flush_args.emplace_back(cfd, iter.second,
3198
+ bg_flush_args.emplace_back(cfd, max_memtable_id,
2941
3199
  &(superversion_contexts.back()), flush_reason);
2942
3200
  }
2943
- if (!bg_flush_args.empty()) {
3201
+ // `MaybeScheduleFlushOrCompaction` schedules as many `BackgroundCallFlush`
3202
+ // jobs as the number of `FlushRequest` in the `flush_queue_`, a.k.a
3203
+ // `unscheduled_flushes_`. So it's sufficient to make each `BackgroundFlush`
3204
+ // handle one `FlushRequest` and each have a Status returned.
3205
+ if (!bg_flush_args.empty() || !column_families_not_to_flush.empty()) {
3206
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundFlush:CheckFlushRequest:cb",
3207
+ const_cast<int*>(&flush_req.reschedule_count));
2944
3208
  break;
2945
3209
  }
2946
3210
  }
@@ -3002,11 +3266,20 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
3002
3266
  pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
3003
3267
  CaptureCurrentFileNumberInPendingOutputs()));
3004
3268
  FlushReason reason;
3005
-
3006
- Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
3007
- &reason, thread_pri);
3008
- if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
3009
- reason != FlushReason::kErrorRecovery) {
3269
+ bool flush_rescheduled_to_retain_udt = false;
3270
+ Status s =
3271
+ BackgroundFlush(&made_progress, &job_context, &log_buffer, &reason,
3272
+ &flush_rescheduled_to_retain_udt, thread_pri);
3273
+ if (s.IsTryAgain() && flush_rescheduled_to_retain_udt) {
3274
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
3275
+ mutex_.Unlock();
3276
+ TEST_SYNC_POINT_CALLBACK("DBImpl::AfterRetainUDTReschedule:cb", nullptr);
3277
+ immutable_db_options_.clock->SleepForMicroseconds(
3278
+ 100000); // prevent hot loop
3279
+ mutex_.Lock();
3280
+ } else if (!s.ok() && !s.IsShutdownInProgress() &&
3281
+ !s.IsColumnFamilyDropped() &&
3282
+ reason != FlushReason::kErrorRecovery) {
3010
3283
  // Wait a little bit before retrying background flush in
3011
3284
  // case this is an environmental problem and we do not want to
3012
3285
  // chew up resources for failed flushes for the duration of
@@ -3016,9 +3289,9 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
3016
3289
  bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
3017
3290
  mutex_.Unlock();
3018
3291
  ROCKS_LOG_ERROR(immutable_db_options_.info_log,
3019
- "Waiting after background flush error: %s"
3292
+ "[JOB %d] Waiting after background flush error: %s"
3020
3293
  "Accumulated background error counts: %" PRIu64,
3021
- s.ToString().c_str(), error_cnt);
3294
+ job_context.job_id, s.ToString().c_str(), error_cnt);
3022
3295
  log_buffer.FlushBufferToLog();
3023
3296
  LogFlush(immutable_db_options_.info_log);
3024
3297
  immutable_db_options_.clock->SleepForMicroseconds(1000000);
@@ -3027,29 +3300,33 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
3027
3300
 
3028
3301
  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
3029
3302
  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
3030
-
3031
- // If flush failed, we want to delete all temporary files that we might have
3032
- // created. Thus, we force full scan in FindObsoleteFiles()
3033
- FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
3034
- !s.IsColumnFamilyDropped());
3035
- // delete unnecessary files if any, this is done outside the mutex
3036
- if (job_context.HaveSomethingToClean() ||
3037
- job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
3038
- mutex_.Unlock();
3039
- TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
3040
- // Have to flush the info logs before bg_flush_scheduled_--
3041
- // because if bg_flush_scheduled_ becomes 0 and the lock is
3042
- // released, the deconstructor of DB can kick in and destroy all the
3043
- // states of DB so info_log might not be available after that point.
3044
- // It also applies to access other states that DB owns.
3045
- log_buffer.FlushBufferToLog();
3046
- if (job_context.HaveSomethingToDelete()) {
3047
- PurgeObsoleteFiles(job_context);
3303
+ // There is no need to do these clean up if the flush job is rescheduled
3304
+ // to retain user-defined timestamps because the job doesn't get to the
3305
+ // stage of actually flushing the MemTables.
3306
+ if (!flush_rescheduled_to_retain_udt) {
3307
+ // If flush failed, we want to delete all temporary files that we might
3308
+ // have created. Thus, we force full scan in FindObsoleteFiles()
3309
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
3310
+ !s.IsColumnFamilyDropped());
3311
+ // delete unnecessary files if any, this is done outside the mutex
3312
+ if (job_context.HaveSomethingToClean() ||
3313
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
3314
+ mutex_.Unlock();
3315
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
3316
+ // Have to flush the info logs before bg_flush_scheduled_--
3317
+ // because if bg_flush_scheduled_ becomes 0 and the lock is
3318
+ // released, the deconstructor of DB can kick in and destroy all the
3319
+ // states of DB so info_log might not be available after that point.
3320
+ // It also applies to access other states that DB owns.
3321
+ log_buffer.FlushBufferToLog();
3322
+ if (job_context.HaveSomethingToDelete()) {
3323
+ PurgeObsoleteFiles(job_context);
3324
+ }
3325
+ job_context.Clean();
3326
+ mutex_.Lock();
3048
3327
  }
3049
- job_context.Clean();
3050
- mutex_.Lock();
3328
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
3051
3329
  }
3052
- TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
3053
3330
 
3054
3331
  assert(num_running_flushes_ > 0);
3055
3332
  num_running_flushes_--;
@@ -3256,8 +3533,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3256
3533
 
3257
3534
  std::unique_ptr<TaskLimiterToken> task_token;
3258
3535
 
3259
- // InternalKey manual_end_storage;
3260
- // InternalKey* manual_end = &manual_end_storage;
3261
3536
  bool sfm_reserved_compact_space = false;
3262
3537
  if (is_manual) {
3263
3538
  ManualCompactionState* m = manual_compaction;
@@ -3393,6 +3668,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3393
3668
  }
3394
3669
 
3395
3670
  IOStatus io_s;
3671
+ bool compaction_released = false;
3396
3672
  if (!c) {
3397
3673
  // Nothing to do
3398
3674
  ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
@@ -3415,7 +3691,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3415
3691
  }
3416
3692
  status = versions_->LogAndApply(
3417
3693
  c->column_family_data(), *c->mutable_cf_options(), read_options,
3418
- c->edit(), &mutex_, directories_.GetDbDir());
3694
+ c->edit(), &mutex_, directories_.GetDbDir(),
3695
+ /*new_descriptor_log=*/false, /*column_family_options=*/nullptr,
3696
+ [&c, &compaction_released](const Status& s) {
3697
+ c->ReleaseCompactionFiles(s);
3698
+ compaction_released = true;
3699
+ });
3419
3700
  io_s = versions_->io_status();
3420
3701
  InstallSuperVersionAndScheduleWork(c->column_family_data(),
3421
3702
  &job_context->superversion_contexts[0],
@@ -3423,6 +3704,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3423
3704
  ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
3424
3705
  c->column_family_data()->GetName().c_str(),
3425
3706
  c->num_input_files(0));
3707
+ if (status.ok() && io_s.ok()) {
3708
+ UpdateDeletionCompactionStats(c);
3709
+ }
3426
3710
  *made_progress = true;
3427
3711
  TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
3428
3712
  c->column_family_data());
@@ -3457,7 +3741,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3457
3741
  f->oldest_blob_file_number, f->oldest_ancester_time,
3458
3742
  f->file_creation_time, f->epoch_number, f->file_checksum,
3459
3743
  f->file_checksum_func_name, f->unique_id,
3460
- f->compensated_range_deletion_size, f->tail_size);
3744
+ f->compensated_range_deletion_size, f->tail_size,
3745
+ f->user_defined_timestamps_persisted);
3461
3746
 
3462
3747
  ROCKS_LOG_BUFFER(
3463
3748
  log_buffer,
@@ -3480,7 +3765,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3480
3765
  }
3481
3766
  status = versions_->LogAndApply(
3482
3767
  c->column_family_data(), *c->mutable_cf_options(), read_options,
3483
- c->edit(), &mutex_, directories_.GetDbDir());
3768
+ c->edit(), &mutex_, directories_.GetDbDir(),
3769
+ /*new_descriptor_log=*/false, /*column_family_options=*/nullptr,
3770
+ [&c, &compaction_released](const Status& s) {
3771
+ c->ReleaseCompactionFiles(s);
3772
+ compaction_released = true;
3773
+ });
3484
3774
  io_s = versions_->io_status();
3485
3775
  // Use latest MutableCFOptions
3486
3776
  InstallSuperVersionAndScheduleWork(c->column_family_data(),
@@ -3530,6 +3820,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3530
3820
  // Transfer requested token, so it doesn't need to do it again.
3531
3821
  ca->prepicked_compaction->task_token = std::move(task_token);
3532
3822
  ++bg_bottom_compaction_scheduled_;
3823
+ assert(c == nullptr);
3533
3824
  env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM,
3534
3825
  this, &DBImpl::UnscheduleCompactionCallback);
3535
3826
  } else {
@@ -3573,8 +3864,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3573
3864
  compaction_job.Run().PermitUncheckedError();
3574
3865
  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
3575
3866
  mutex_.Lock();
3576
-
3577
- status = compaction_job.Install(*c->mutable_cf_options());
3867
+ status =
3868
+ compaction_job.Install(*c->mutable_cf_options(), &compaction_released);
3578
3869
  io_s = compaction_job.io_status();
3579
3870
  if (status.ok()) {
3580
3871
  InstallSuperVersionAndScheduleWork(c->column_family_data(),
@@ -3593,7 +3884,23 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3593
3884
  }
3594
3885
 
3595
3886
  if (c != nullptr) {
3596
- c->ReleaseCompactionFiles(status);
3887
+ if (!compaction_released) {
3888
+ c->ReleaseCompactionFiles(status);
3889
+ } else {
3890
+ #ifndef NDEBUG
3891
+ // Sanity checking that compaction files are freed.
3892
+ for (size_t i = 0; i < c->num_input_levels(); i++) {
3893
+ for (size_t j = 0; j < c->inputs(i)->size(); j++) {
3894
+ assert(!c->input(i, j)->being_compacted);
3895
+ }
3896
+ }
3897
+ std::unordered_set<Compaction*>* cip = c->column_family_data()
3898
+ ->compaction_picker()
3899
+ ->compactions_in_progress();
3900
+ assert(cip->find(c.get()) == cip->end());
3901
+ #endif
3902
+ }
3903
+
3597
3904
  *made_progress = true;
3598
3905
 
3599
3906
  // Need to make sure SstFileManager does its bookkeeping
@@ -3778,10 +4085,31 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
3778
4085
  return false;
3779
4086
  }
3780
4087
 
4088
+ void DBImpl::UpdateDeletionCompactionStats(
4089
+ const std::unique_ptr<Compaction>& c) {
4090
+ if (c == nullptr) {
4091
+ return;
4092
+ }
4093
+
4094
+ CompactionReason reason = c->compaction_reason();
4095
+
4096
+ switch (reason) {
4097
+ case CompactionReason::kFIFOMaxSize:
4098
+ RecordTick(stats_, FIFO_MAX_SIZE_COMPACTIONS);
4099
+ break;
4100
+ case CompactionReason::kFIFOTtl:
4101
+ RecordTick(stats_, FIFO_TTL_COMPACTIONS);
4102
+ break;
4103
+ default:
4104
+ assert(false);
4105
+ break;
4106
+ }
4107
+ }
4108
+
3781
4109
  void DBImpl::BuildCompactionJobInfo(
3782
4110
  const ColumnFamilyData* cfd, Compaction* c, const Status& st,
3783
4111
  const CompactionJobStats& compaction_job_stats, const int job_id,
3784
- const Version* current, CompactionJobInfo* compaction_job_info) const {
4112
+ CompactionJobInfo* compaction_job_info) const {
3785
4113
  assert(compaction_job_info != nullptr);
3786
4114
  compaction_job_info->cf_id = cfd->GetID();
3787
4115
  compaction_job_info->cf_name = cfd->GetName();
@@ -3791,7 +4119,12 @@ void DBImpl::BuildCompactionJobInfo(
3791
4119
  compaction_job_info->base_input_level = c->start_level();
3792
4120
  compaction_job_info->output_level = c->output_level();
3793
4121
  compaction_job_info->stats = compaction_job_stats;
3794
- compaction_job_info->table_properties = c->GetOutputTableProperties();
4122
+ const auto& input_table_properties = c->GetInputTableProperties();
4123
+ const auto& output_table_properties = c->GetOutputTableProperties();
4124
+ compaction_job_info->table_properties.insert(input_table_properties.begin(),
4125
+ input_table_properties.end());
4126
+ compaction_job_info->table_properties.insert(output_table_properties.begin(),
4127
+ output_table_properties.end());
3795
4128
  compaction_job_info->compaction_reason = c->compaction_reason();
3796
4129
  compaction_job_info->compression = c->output_compression();
3797
4130
 
@@ -3805,15 +4138,9 @@ void DBImpl::BuildCompactionJobInfo(
3805
4138
  compaction_job_info->input_files.push_back(fn);
3806
4139
  compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
3807
4140
  static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
3808
- if (compaction_job_info->table_properties.count(fn) == 0) {
3809
- std::shared_ptr<const TableProperties> tp;
3810
- auto s = current->GetTableProperties(read_options, &tp, fmd, &fn);
3811
- if (s.ok()) {
3812
- compaction_job_info->table_properties[fn] = tp;
3813
- }
3814
- }
3815
4141
  }
3816
4142
  }
4143
+
3817
4144
  for (const auto& newf : c->edit()->GetNewFiles()) {
3818
4145
  const FileMetaData& meta = newf.second;
3819
4146
  const FileDescriptor& desc = meta.fd;
@@ -3957,20 +4284,54 @@ void DBImpl::GetSnapshotContext(
3957
4284
  *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
3958
4285
  }
3959
4286
 
3960
- Status DBImpl::WaitForCompact(bool abort_on_pause) {
4287
+ Status DBImpl::WaitForCompact(
4288
+ const WaitForCompactOptions& wait_for_compact_options) {
3961
4289
  InstrumentedMutexLock l(&mutex_);
4290
+ if (wait_for_compact_options.flush) {
4291
+ Status s = DBImpl::FlushAllColumnFamilies(FlushOptions(),
4292
+ FlushReason::kManualFlush);
4293
+ if (!s.ok()) {
4294
+ return s;
4295
+ }
4296
+ } else if (wait_for_compact_options.close_db &&
4297
+ has_unpersisted_data_.load(std::memory_order_relaxed) &&
4298
+ !mutable_db_options_.avoid_flush_during_shutdown) {
4299
+ Status s =
4300
+ DBImpl::FlushAllColumnFamilies(FlushOptions(), FlushReason::kShutDown);
4301
+ if (!s.ok()) {
4302
+ return s;
4303
+ }
4304
+ }
4305
+ TEST_SYNC_POINT("DBImpl::WaitForCompact:StartWaiting");
4306
+ const auto deadline = immutable_db_options_.clock->NowMicros() +
4307
+ wait_for_compact_options.timeout.count();
3962
4308
  for (;;) {
3963
4309
  if (shutting_down_.load(std::memory_order_acquire)) {
3964
4310
  return Status::ShutdownInProgress();
3965
4311
  }
3966
- if (bg_work_paused_ && abort_on_pause) {
4312
+ if (bg_work_paused_ && wait_for_compact_options.abort_on_pause) {
3967
4313
  return Status::Aborted();
3968
4314
  }
3969
4315
  if ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
3970
4316
  bg_flush_scheduled_ || unscheduled_compactions_ ||
3971
- unscheduled_flushes_) &&
4317
+ unscheduled_flushes_ || error_handler_.IsRecoveryInProgress()) &&
3972
4318
  (error_handler_.GetBGError().ok())) {
3973
- bg_cv_.Wait();
4319
+ if (wait_for_compact_options.timeout.count()) {
4320
+ if (bg_cv_.TimedWait(deadline)) {
4321
+ return Status::TimedOut();
4322
+ }
4323
+ } else {
4324
+ bg_cv_.Wait();
4325
+ }
4326
+ } else if (wait_for_compact_options.close_db) {
4327
+ reject_new_background_jobs_ = true;
4328
+ mutex_.Unlock();
4329
+ Status s = Close();
4330
+ mutex_.Lock();
4331
+ if (!s.ok()) {
4332
+ reject_new_background_jobs_ = false;
4333
+ }
4334
+ return s;
3974
4335
  } else {
3975
4336
  return error_handler_.GetBGError();
3976
4337
  }