@nxtedition/rocksdb 13.5.7 → 13.5.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (509):
  1. package/binding.cc +248 -70
  2. package/binding.gyp +2 -2
  3. package/deps/rocksdb/rocksdb/BUCK +12 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -0
  5. package/deps/rocksdb/rocksdb/Makefile +28 -23
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -1
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +1 -2
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +43 -39
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +0 -1
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +2 -3
  12. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
  13. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +1 -3
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +11 -1
  15. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +13 -5
  16. package/deps/rocksdb/rocksdb/crash_test.mk +61 -15
  17. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +136 -45
  18. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +34 -16
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +10 -7
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -2
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +1 -0
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +12 -9
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +3 -4
  24. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +2 -2
  25. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +3 -4
  26. package/deps/rocksdb/rocksdb/db/builder.cc +22 -8
  27. package/deps/rocksdb/rocksdb/db/builder.h +5 -4
  28. package/deps/rocksdb/rocksdb/db/c.cc +556 -15
  29. package/deps/rocksdb/rocksdb/db/c_test.c +133 -12
  30. package/deps/rocksdb/rocksdb/db/column_family.cc +114 -50
  31. package/deps/rocksdb/rocksdb/db/column_family.h +53 -36
  32. package/deps/rocksdb/rocksdb/db/column_family_test.cc +6 -6
  33. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +0 -1
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +95 -70
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +71 -51
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -86
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +26 -68
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +0 -122
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +453 -258
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +117 -92
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +0 -1
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +38 -38
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +24 -17
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +34 -45
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -31
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -3
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +1 -1
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +10 -10
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +2 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +82 -34
  52. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +267 -179
  53. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +4 -1
  54. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +273 -89
  55. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +300 -14
  56. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +4 -4
  57. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +2 -2
  58. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +28 -23
  59. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +69 -51
  60. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +522 -245
  61. package/deps/rocksdb/rocksdb/db/convenience.cc +15 -4
  62. package/deps/rocksdb/rocksdb/db/corruption_test.cc +1 -3
  63. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +0 -2
  64. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +196 -17
  65. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +74 -62
  66. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +48 -0
  67. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +682 -250
  68. package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +0 -1
  69. package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +3 -4
  70. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +11 -16
  71. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +57 -0
  72. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +2 -2
  73. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -1
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +540 -490
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +347 -188
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +584 -217
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +13 -9
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +5 -7
  79. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +40 -36
  80. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -3
  81. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +751 -372
  82. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +35 -32
  83. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +24 -2
  84. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +125 -63
  85. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +2 -2
  86. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +311 -196
  87. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +15 -5
  88. package/deps/rocksdb/rocksdb/db/db_iter.cc +42 -29
  89. package/deps/rocksdb/rocksdb/db/db_iter.h +96 -31
  90. package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +3 -4
  91. package/deps/rocksdb/rocksdb/db/db_iter_test.cc +168 -228
  92. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +454 -0
  93. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +8 -8
  94. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +0 -1
  95. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +90 -0
  96. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +60 -2
  97. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +7 -3
  98. package/deps/rocksdb/rocksdb/db/db_options_test.cc +85 -27
  99. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +3 -1
  100. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +0 -2
  101. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +114 -2
  102. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +0 -1
  103. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +0 -1
  104. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +51 -3
  105. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +0 -1
  106. package/deps/rocksdb/rocksdb/db/db_test.cc +325 -18
  107. package/deps/rocksdb/rocksdb/db/db_test2.cc +644 -20
  108. package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
  109. package/deps/rocksdb/rocksdb/db/db_test_util.h +9 -0
  110. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +64 -45
  111. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +203 -14
  112. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +259 -30
  113. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +0 -1
  114. package/deps/rocksdb/rocksdb/db/db_write_test.cc +75 -1
  115. package/deps/rocksdb/rocksdb/db/dbformat.h +70 -6
  116. package/deps/rocksdb/rocksdb/db/deletefile_test.cc +0 -190
  117. package/deps/rocksdb/rocksdb/db/error_handler.cc +22 -7
  118. package/deps/rocksdb/rocksdb/db/error_handler.h +16 -1
  119. package/deps/rocksdb/rocksdb/db/event_helpers.cc +41 -26
  120. package/deps/rocksdb/rocksdb/db/experimental.cc +4 -3
  121. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +464 -78
  122. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +166 -69
  123. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +54 -25
  124. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +1 -3
  125. package/deps/rocksdb/rocksdb/db/flush_job.cc +98 -81
  126. package/deps/rocksdb/rocksdb/db/flush_job.h +4 -9
  127. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +80 -84
  128. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +1 -1
  129. package/deps/rocksdb/rocksdb/db/forward_iterator.h +2 -2
  130. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +12 -19
  131. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +0 -2
  132. package/deps/rocksdb/rocksdb/db/internal_stats.cc +41 -15
  133. package/deps/rocksdb/rocksdb/db/internal_stats.h +63 -52
  134. package/deps/rocksdb/rocksdb/db/job_context.h +59 -24
  135. package/deps/rocksdb/rocksdb/db/listener_test.cc +69 -10
  136. package/deps/rocksdb/rocksdb/db/log_format.h +11 -2
  137. package/deps/rocksdb/rocksdb/db/log_reader.cc +147 -34
  138. package/deps/rocksdb/rocksdb/db/log_reader.h +40 -11
  139. package/deps/rocksdb/rocksdb/db/log_test.cc +16 -3
  140. package/deps/rocksdb/rocksdb/db/log_writer.cc +102 -55
  141. package/deps/rocksdb/rocksdb/db/log_writer.h +21 -2
  142. package/deps/rocksdb/rocksdb/db/malloc_stats.h +0 -2
  143. package/deps/rocksdb/rocksdb/db/memtable.cc +16 -47
  144. package/deps/rocksdb/rocksdb/db/memtable.h +76 -12
  145. package/deps/rocksdb/rocksdb/db/memtable_list.cc +23 -20
  146. package/deps/rocksdb/rocksdb/db/memtable_list.h +9 -11
  147. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +18 -37
  148. package/deps/rocksdb/rocksdb/db/merge_context.h +2 -1
  149. package/deps/rocksdb/rocksdb/db/merge_test.cc +8 -0
  150. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +3 -5
  151. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +15 -7
  152. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +6 -3
  153. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +22 -4
  154. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +41 -1
  155. package/deps/rocksdb/rocksdb/db/prefix_test.cc +0 -1
  156. package/deps/rocksdb/rocksdb/db/repair.cc +29 -34
  157. package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -1
  158. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +14 -15
  159. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +1 -3
  160. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +47 -1
  161. package/deps/rocksdb/rocksdb/db/table_cache.cc +3 -3
  162. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +1 -3
  163. package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +2 -1
  164. package/deps/rocksdb/rocksdb/db/version_builder.cc +2 -2
  165. package/deps/rocksdb/rocksdb/db/version_edit.cc +8 -37
  166. package/deps/rocksdb/rocksdb/db/version_edit.h +32 -1
  167. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +26 -18
  168. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -5
  169. package/deps/rocksdb/rocksdb/db/version_set.cc +282 -197
  170. package/deps/rocksdb/rocksdb/db/version_set.h +54 -57
  171. package/deps/rocksdb/rocksdb/db/version_set_test.cc +28 -35
  172. package/deps/rocksdb/rocksdb/db/version_util.h +2 -3
  173. package/deps/rocksdb/rocksdb/db/wal_manager.cc +3 -2
  174. package/deps/rocksdb/rocksdb/db/wal_manager.h +0 -1
  175. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +0 -1
  176. package/deps/rocksdb/rocksdb/db/wide/wide_columns.cc +1 -0
  177. package/deps/rocksdb/rocksdb/db/write_batch.cc +22 -8
  178. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +5 -4
  179. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +7 -6
  180. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -4
  181. package/deps/rocksdb/rocksdb/db/write_thread.h +3 -3
  182. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +13 -5
  183. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +9 -2
  184. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +39 -0
  185. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +65 -0
  186. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +45 -22
  187. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +7 -4
  188. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +22 -5
  189. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +28 -3
  190. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -38
  191. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -3
  192. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +80 -32
  193. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +51 -2
  194. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +23 -1
  195. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +305 -15
  196. package/deps/rocksdb/rocksdb/env/env.cc +32 -2
  197. package/deps/rocksdb/rocksdb/env/env_encryption.cc +0 -2
  198. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +2 -4
  199. package/deps/rocksdb/rocksdb/env/env_posix.cc +4 -2
  200. package/deps/rocksdb/rocksdb/env/env_test.cc +0 -1
  201. package/deps/rocksdb/rocksdb/env/fs_posix.cc +20 -11
  202. package/deps/rocksdb/rocksdb/env/fs_readonly.h +0 -2
  203. package/deps/rocksdb/rocksdb/env/fs_remap.cc +0 -2
  204. package/deps/rocksdb/rocksdb/env/fs_remap.h +0 -2
  205. package/deps/rocksdb/rocksdb/env/io_posix.cc +6 -4
  206. package/deps/rocksdb/rocksdb/env/io_posix.h +3 -2
  207. package/deps/rocksdb/rocksdb/env/mock_env.cc +0 -1
  208. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +2 -2
  209. package/deps/rocksdb/rocksdb/file/delete_scheduler.h +0 -2
  210. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +0 -2
  211. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +30 -21
  212. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +16 -0
  213. package/deps/rocksdb/rocksdb/file/file_util.cc +32 -14
  214. package/deps/rocksdb/rocksdb/file/file_util.h +22 -5
  215. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +229 -76
  216. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +21 -12
  217. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +10 -7
  218. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +12 -8
  219. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +1 -2
  220. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +0 -2
  221. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +3 -3
  222. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +598 -0
  223. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_iterator.h +36 -0
  224. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +70 -11
  225. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +232 -11
  226. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -1
  227. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -1
  228. package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +149 -15
  229. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +17 -2
  230. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +132 -34
  231. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +158 -79
  232. package/deps/rocksdb/rocksdb/include/rocksdb/db_bench_tool.h +2 -1
  233. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +4 -5
  234. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +1 -3
  235. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +5 -0
  236. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +275 -0
  237. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +2 -1
  238. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +50 -5
  239. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +10 -0
  240. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +13 -0
  241. package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +0 -1
  242. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +5 -2
  243. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +13 -0
  244. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +237 -0
  245. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +230 -39
  246. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +15 -0
  247. package/deps/rocksdb/rocksdb/include/rocksdb/perf_level.h +31 -11
  248. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +41 -0
  249. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +1 -1
  250. package/deps/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h +0 -1
  251. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +5 -1
  252. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +0 -1
  253. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -3
  254. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +2 -0
  255. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -8
  256. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +19 -2
  257. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -1
  258. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +124 -0
  259. package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +1 -0
  260. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +26 -1
  261. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +55 -6
  262. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +3 -5
  263. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_mirror.h +0 -2
  264. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -2
  265. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +0 -1
  266. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +1 -2
  267. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +0 -1
  268. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +96 -8
  269. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index_faiss.h +117 -0
  270. package/deps/rocksdb/rocksdb/{utilities/secondary_index/faiss_ivf_index.h → include/rocksdb/utilities/secondary_index_simple.h} +11 -14
  271. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +26 -11
  272. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +16 -3
  273. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +0 -2
  274. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +63 -7
  275. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +0 -1
  276. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +28 -12
  277. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +3 -3
  278. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +0 -2
  279. package/deps/rocksdb/rocksdb/logging/event_logger_test.cc +1 -2
  280. package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +1 -1
  281. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +0 -1
  282. package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +0 -1
  283. package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +3 -1
  284. package/deps/rocksdb/rocksdb/memtable/skiplist.h +2 -2
  285. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +2 -4
  286. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +69 -8
  287. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +32 -9
  288. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +58 -45
  289. package/deps/rocksdb/rocksdb/monitoring/histogram.h +1 -1
  290. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -3
  291. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +5 -0
  292. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +1 -1
  293. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +3 -2
  294. package/deps/rocksdb/rocksdb/options/cf_options.cc +44 -13
  295. package/deps/rocksdb/rocksdb/options/cf_options.h +21 -7
  296. package/deps/rocksdb/rocksdb/options/configurable.cc +5 -5
  297. package/deps/rocksdb/rocksdb/options/configurable_test.h +1 -2
  298. package/deps/rocksdb/rocksdb/options/customizable.cc +0 -1
  299. package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -11
  300. package/deps/rocksdb/rocksdb/options/db_options.cc +18 -15
  301. package/deps/rocksdb/rocksdb/options/db_options.h +2 -2
  302. package/deps/rocksdb/rocksdb/options/options.cc +296 -305
  303. package/deps/rocksdb/rocksdb/options/options_helper.cc +188 -62
  304. package/deps/rocksdb/rocksdb/options/options_helper.h +3 -3
  305. package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -4
  306. package/deps/rocksdb/rocksdb/options/options_parser.h +0 -1
  307. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +17 -4
  308. package/deps/rocksdb/rocksdb/options/options_test.cc +101 -76
  309. package/deps/rocksdb/rocksdb/port/lang.h +2 -1
  310. package/deps/rocksdb/rocksdb/port/port_posix.cc +2 -1
  311. package/deps/rocksdb/rocksdb/port/stack_trace.cc +5 -4
  312. package/deps/rocksdb/rocksdb/port/win/env_win.cc +3 -2
  313. package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +99 -1
  314. package/deps/rocksdb/rocksdb/port/win/xpress_win.h +6 -0
  315. package/deps/rocksdb/rocksdb/src.mk +17 -11
  316. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +0 -1
  317. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1094 -929
  318. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +6 -19
  319. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +76 -22
  320. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +2 -0
  321. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +221 -131
  322. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +12 -9
  323. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +23 -24
  324. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -38
  325. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +7 -4
  326. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +5 -5
  327. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +10 -12
  328. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +6 -4
  329. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +35 -43
  330. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +2 -1
  331. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +1 -1
  332. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -2
  333. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +0 -4
  334. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +0 -1
  335. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +3 -3
  336. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +3 -3
  337. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -4
  338. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  339. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +4 -5
  340. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +4 -4
  341. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +37 -35
  342. package/deps/rocksdb/rocksdb/table/block_fetcher.h +11 -7
  343. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +4 -3
  344. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +31 -5
  345. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +2 -1
  346. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +0 -1
  347. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +0 -1
  348. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +0 -1
  349. package/deps/rocksdb/rocksdb/table/external_table.cc +483 -0
  350. package/deps/rocksdb/rocksdb/table/format.cc +62 -44
  351. package/deps/rocksdb/rocksdb/table/format.h +35 -12
  352. package/deps/rocksdb/rocksdb/table/internal_iterator.h +3 -13
  353. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
  354. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +6 -0
  355. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +150 -141
  356. package/deps/rocksdb/rocksdb/table/meta_blocks.h +5 -0
  357. package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -2
  358. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +8 -0
  359. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +0 -1
  360. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +0 -2
  361. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +0 -2
  362. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +0 -1
  363. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +6 -6
  364. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  365. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +86 -7
  366. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +88 -2
  367. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +0 -1
  368. package/deps/rocksdb/rocksdb/table/table_builder.h +10 -1
  369. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +3 -2
  370. package/deps/rocksdb/rocksdb/table/table_test.cc +899 -22
  371. package/deps/rocksdb/rocksdb/test_util/testutil.cc +3 -4
  372. package/deps/rocksdb/rocksdb/test_util/testutil.h +132 -1
  373. package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +0 -1
  374. package/deps/rocksdb/rocksdb/test_util/transaction_test_util.h +0 -2
  375. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +163 -77
  376. package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +0 -2
  377. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +0 -1
  378. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +0 -1
  379. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +120 -52
  380. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +1 -0
  381. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -1
  382. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +0 -2
  383. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +2 -2
  384. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +0 -2
  385. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +2 -1
  386. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +94 -0
  387. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +0 -1
  388. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +0 -1
  389. package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +1 -1
  390. package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +2 -1
  391. package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +3 -5
  392. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
  393. package/deps/rocksdb/rocksdb/util/async_file_reader.h +15 -8
  394. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.cc +131 -0
  395. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.h +90 -0
  396. package/deps/rocksdb/rocksdb/util/autovector.h +1 -1
  397. package/deps/rocksdb/rocksdb/util/autovector_test.cc +2 -2
  398. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +0 -2
  399. package/deps/rocksdb/rocksdb/util/compression.cc +936 -4
  400. package/deps/rocksdb/rocksdb/util/compression.h +348 -232
  401. package/deps/rocksdb/rocksdb/util/compression_test.cc +229 -0
  402. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +10 -10
  403. package/deps/rocksdb/rocksdb/util/crc32c_ppc.c +1 -0
  404. package/deps/rocksdb/rocksdb/util/data_structure.cc +2 -0
  405. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +1 -3
  406. package/deps/rocksdb/rocksdb/util/ppc-opcode.h +5 -5
  407. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +108 -0
  408. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +67 -0
  409. package/deps/rocksdb/rocksdb/util/slice_test.cc +83 -0
  410. package/deps/rocksdb/rocksdb/util/string_util.cc +0 -2
  411. package/deps/rocksdb/rocksdb/util/string_util.h +10 -0
  412. package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -1
  413. package/deps/rocksdb/rocksdb/util/udt_util.cc +18 -5
  414. package/deps/rocksdb/rocksdb/util/udt_util.h +10 -7
  415. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +650 -154
  416. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +438 -144
  417. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +0 -1
  418. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +0 -1
  419. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +0 -1
  420. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +16 -17
  421. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +2 -1
  422. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +0 -1
  423. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +0 -1
  424. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +7 -8
  425. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +4 -3
  426. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.h +0 -1
  427. package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +0 -1
  428. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +2 -2
  429. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  430. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +0 -48
  431. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -1
  432. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +0 -1
  433. package/deps/rocksdb/rocksdb/utilities/debug.cc +7 -14
  434. package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +0 -1
  435. package/deps/rocksdb/rocksdb/utilities/env_mirror_test.cc +0 -2
  436. package/deps/rocksdb/rocksdb/utilities/env_timed.cc +0 -1
  437. package/deps/rocksdb/rocksdb/utilities/env_timed_test.cc +0 -2
  438. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +5 -3
  439. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +10 -9
  440. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +0 -1
  441. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +0 -1
  442. package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +1 -0
  443. package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +0 -2
  444. package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +0 -1
  445. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +0 -1
  446. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +0 -1
  447. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +0 -2
  448. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +0 -2
  449. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc +0 -1
  450. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +0 -2
  451. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table.h +0 -2
  452. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +0 -2
  453. package/deps/rocksdb/rocksdb/utilities/persistent_cache/lrulist.h +0 -2
  454. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.h +0 -2
  455. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc +0 -1
  456. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +0 -2
  457. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +0 -1
  458. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +0 -2
  459. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +183 -32
  460. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +258 -12
  461. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_helper.h +33 -0
  462. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_iterator.cc +99 -0
  463. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +280 -120
  464. package/deps/rocksdb/rocksdb/utilities/secondary_index/simple_secondary_index.cc +79 -0
  465. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +52 -16
  466. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h +10 -6
  467. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +55 -0
  468. package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +0 -1
  469. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +0 -2
  470. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.h +0 -1
  471. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +37 -12
  472. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +2 -0
  473. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +0 -2
  474. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc +0 -2
  475. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +1 -1
  476. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h +1 -1
  477. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +1 -1
  478. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc +2 -1
  479. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +2 -2
  480. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +0 -1
  481. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.h +0 -2
  482. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +1 -3
  483. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +36 -10
  484. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -7
  485. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +4 -5
  486. package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +1 -4
  487. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +1 -2
  488. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +0 -2
  489. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.h +0 -1
  490. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1118 -37
  491. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +4 -7
  492. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +0 -2
  493. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +0 -2
  494. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +3 -3
  495. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +0 -1
  496. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +0 -2
  497. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +1 -2
  498. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +1 -2
  499. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +0 -1
  500. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +0 -3
  501. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +125 -127
  502. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +45 -23
  503. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +54 -22
  504. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +477 -58
  505. package/deps/rocksdb/rocksdb.gyp +9 -4
  506. package/index.js +50 -9
  507. package/package.json +8 -1
  508. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  509. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -74,12 +74,14 @@
74
74
  #include "options/cf_options.h"
75
75
  #include "options/options_helper.h"
76
76
  #include "options/options_parser.h"
77
+ #include "util/udt_util.h"
77
78
  #ifdef ROCKSDB_JEMALLOC
78
79
  #include "port/jemalloc_helper.h"
79
80
  #endif
80
81
  #include "port/port.h"
81
82
  #include "rocksdb/cache.h"
82
83
  #include "rocksdb/compaction_filter.h"
84
+ #include "rocksdb/convenience.h"
83
85
  #include "rocksdb/db.h"
84
86
  #include "rocksdb/env.h"
85
87
  #include "rocksdb/merge_operator.h"
@@ -168,7 +170,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
168
170
  bool read_only)
169
171
  : dbname_(dbname),
170
172
  own_info_log_(options.info_log == nullptr),
171
- init_logger_creation_s_(),
172
173
  initial_db_options_(SanitizeOptions(dbname, options, read_only,
173
174
  &init_logger_creation_s_)),
174
175
  env_(initial_db_options_.env),
@@ -184,7 +185,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
184
185
  mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
185
186
  immutable_db_options_.use_adaptive_mutex),
186
187
  #endif // COERCE_CONTEXT_SWITCH
187
- default_cf_handle_(nullptr),
188
188
  error_handler_(this, immutable_db_options_, &mutex_),
189
189
  event_logger_(immutable_db_options_.info_log.get()),
190
190
  max_total_in_memory_state_(0),
@@ -193,45 +193,15 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
193
193
  file_options_, immutable_db_options_)),
194
194
  seq_per_batch_(seq_per_batch),
195
195
  batch_per_txn_(batch_per_txn),
196
- next_job_id_(1),
197
- shutting_down_(false),
198
- reject_new_background_jobs_(false),
199
- db_lock_(nullptr),
200
- manual_compaction_paused_(false),
201
196
  bg_cv_(&mutex_),
202
- logfile_number_(0),
203
- log_dir_synced_(false),
204
- log_empty_(true),
205
- persist_stats_cf_handle_(nullptr),
206
- log_sync_cv_(&log_write_mutex_),
207
- total_log_size_(0),
208
- is_snapshot_supported_(true),
197
+ wal_sync_cv_(&wal_write_mutex_),
209
198
  write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
210
199
  write_thread_(immutable_db_options_),
211
200
  nonmem_write_thread_(immutable_db_options_),
212
201
  write_controller_(mutable_db_options_.delayed_write_rate),
213
- last_batch_group_size_(0),
214
- unscheduled_flushes_(0),
215
- unscheduled_compactions_(0),
216
- bg_bottom_compaction_scheduled_(0),
217
- bg_compaction_scheduled_(0),
218
- num_running_compactions_(0),
219
- bg_flush_scheduled_(0),
220
- num_running_flushes_(0),
221
- bg_purge_scheduled_(0),
222
- disable_delete_obsolete_files_(0),
223
- pending_purge_obsolete_files_(0),
224
202
  delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
225
- has_unpersisted_data_(false),
226
- unable_to_release_oldest_log_(false),
227
- num_running_ingest_file_(0),
228
203
  wal_manager_(immutable_db_options_, file_options_, io_tracer_,
229
204
  seq_per_batch),
230
- bg_work_paused_(0),
231
- bg_compaction_paused_(0),
232
- refitting_level_(false),
233
- opened_successfully_(false),
234
- periodic_task_scheduler_(),
235
205
  two_write_queues_(options.two_write_queues),
236
206
  manual_wal_flush_(options.manual_wal_flush),
237
207
  // last_sequencee_ is always maintained by the main queue that also writes
@@ -249,14 +219,11 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
249
219
  // requires a custom gc for compaction, we use that to set use_custom_gc_
250
220
  // as well.
251
221
  use_custom_gc_(seq_per_batch),
252
- shutdown_initiated_(false),
253
222
  own_sfm_(options.sst_file_manager == nullptr),
254
- closed_(false),
255
223
  atomic_flush_install_cv_(&mutex_),
256
224
  blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
257
225
  &error_handler_, &event_logger_,
258
- immutable_db_options_.listeners, dbname_),
259
- lock_wal_count_(0) {
226
+ immutable_db_options_.listeners, dbname_) {
260
227
  // !batch_per_trx_ implies seq_per_batch_ because it is only unset for
261
228
  // WriteUnprepared, which should use seq_per_batch_.
262
229
  assert(batch_per_txn_ || seq_per_batch_);
@@ -284,9 +251,11 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
284
251
  periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
285
252
  [this]() { this->FlushInfoLog(); });
286
253
  periodic_task_functions_.emplace(
287
- PeriodicTaskType::kRecordSeqnoTime, [this]() {
288
- this->RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0);
289
- });
254
+ PeriodicTaskType::kRecordSeqnoTime,
255
+ [this]() { this->RecordSeqnoToTimeMapping(); });
256
+ periodic_task_functions_.emplace(
257
+ PeriodicTaskType::kTriggerCompaction,
258
+ [this]() { this->TriggerPeriodicCompaction(); });
290
259
 
291
260
  versions_.reset(new VersionSet(
292
261
  dbname_, &immutable_db_options_, file_options_, table_cache_.get(),
@@ -386,9 +355,8 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
386
355
  static_cast_with_check<ColumnFamilyHandleImpl>(default_cf_handle_);
387
356
  assert(cfh);
388
357
  ColumnFamilyData* cfd = cfh->cfd();
389
- const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions();
390
- s = versions_->LogAndApply(cfd, cf_opts, read_options, write_options,
391
- &edit, &mutex_, directories_.GetDbDir());
358
+ s = versions_->LogAndApply(cfd, read_options, write_options, &edit,
359
+ &mutex_, directories_.GetDbDir());
392
360
  if (!s.ok()) {
393
361
  io_s = versions_->io_status();
394
362
  if (!io_s.ok()) {
@@ -418,26 +386,26 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
418
386
  }
419
387
  }
420
388
 
421
- if (s.ok()) {
422
- // This will notify and unblock threads waiting for error recovery to
423
- // finish. Those previouly waiting threads can now proceed, which may
424
- // include closing the db.
425
- s = error_handler_.ClearBGError();
426
- } else {
427
- // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
428
- // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
429
- // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
430
- error_handler_.GetRecoveryError().PermitUncheckedError();
431
- }
432
-
433
389
  JobContext job_context(0);
434
390
  FindObsoleteFiles(&job_context, true);
435
391
  mutex_.Unlock();
392
+ // If DB shutdown initiated here, it will wait for this ongoing recovery.
436
393
  job_context.manifest_file_number = 1;
437
394
  if (job_context.HaveSomethingToDelete()) {
438
395
  PurgeObsoleteFiles(job_context);
439
396
  }
440
397
  job_context.Clean();
398
+ mutex_.Lock();
399
+
400
+ if (s.ok()) {
401
+ // Will notify and unblock threads waiting for error recovery to finish.
402
+ s = error_handler_.ClearBGError();
403
+ } else {
404
+ // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
405
+ // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
406
+ // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
407
+ error_handler_.GetRecoveryError().PermitUncheckedError();
408
+ }
441
409
 
442
410
  if (s.ok()) {
443
411
  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
@@ -446,7 +414,6 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
446
414
  s.ToString().c_str());
447
415
  }
448
416
 
449
- mutex_.Lock();
450
417
  // Check for shutdown again before scheduling further compactions,
451
418
  // since we released and re-acquired the lock above
452
419
  if (shutdown_initiated_) {
@@ -509,6 +476,11 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
509
476
  s.PermitUncheckedError(); //**TODO: What to do on error?
510
477
  }
511
478
 
479
+ // Cancel awaiting remote compactions
480
+ if (immutable_db_options_.compaction_service) {
481
+ immutable_db_options_.compaction_service->CancelAwaitingJobs();
482
+ }
483
+
512
484
  shutting_down_.store(true, std::memory_order_release);
513
485
  bg_cv_.SignalAll();
514
486
  if (!wait) {
@@ -540,8 +512,8 @@ Status DBImpl::CloseHelper() {
540
512
  // continuing with the shutdown
541
513
  mutex_.Lock();
542
514
  shutdown_initiated_ = true;
543
- error_handler_.CancelErrorRecovery();
544
- while (error_handler_.IsRecoveryInProgress()) {
515
+ error_handler_.CancelErrorRecoveryForShutDown();
516
+ while (!error_handler_.ReadyForShutdown()) {
545
517
  bg_cv_.Wait();
546
518
  }
547
519
  mutex_.Unlock();
@@ -633,8 +605,8 @@ Status DBImpl::CloseHelper() {
633
605
  mutex_.Lock();
634
606
  }
635
607
  {
636
- InstrumentedMutexLock lock(&log_write_mutex_);
637
- for (auto l : logs_to_free_) {
608
+ InstrumentedMutexLock lock(&wal_write_mutex_);
609
+ for (auto l : wals_to_free_) {
638
610
  delete l;
639
611
  }
640
612
  for (auto& log : logs_) {
@@ -818,7 +790,8 @@ Status DBImpl::StartPeriodicTaskScheduler() {
818
790
  Status s = periodic_task_scheduler_.Register(
819
791
  PeriodicTaskType::kDumpStats,
820
792
  periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
821
- mutable_db_options_.stats_dump_period_sec);
793
+ mutable_db_options_.stats_dump_period_sec,
794
+ /*run_immediately=*/true);
822
795
  if (!s.ok()) {
823
796
  return s;
824
797
  }
@@ -827,7 +800,8 @@ Status DBImpl::StartPeriodicTaskScheduler() {
827
800
  Status s = periodic_task_scheduler_.Register(
828
801
  PeriodicTaskType::kPersistStats,
829
802
  periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
830
- mutable_db_options_.stats_persist_period_sec);
803
+ mutable_db_options_.stats_persist_period_sec,
804
+ /*run_immediately=*/true);
831
805
  if (!s.ok()) {
832
806
  return s;
833
807
  }
@@ -835,64 +809,55 @@ Status DBImpl::StartPeriodicTaskScheduler() {
835
809
 
836
810
  Status s = periodic_task_scheduler_.Register(
837
811
  PeriodicTaskType::kFlushInfoLog,
838
- periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog));
812
+ periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog),
813
+ /*run_immediately=*/true);
814
+
815
+ if (s.ok()) {
816
+ s = periodic_task_scheduler_.Register(
817
+ PeriodicTaskType::kTriggerCompaction,
818
+ periodic_task_functions_.at(PeriodicTaskType::kTriggerCompaction),
819
+ /*run_immediately=*/false);
820
+ }
839
821
 
840
822
  return s;
841
823
  }
842
824
 
843
- Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
844
- const WriteOptions& write_options,
845
- bool is_new_db) {
825
+ Status DBImpl::RegisterRecordSeqnoTimeWorker() {
846
826
  options_mutex_.AssertHeld();
847
827
 
848
- uint64_t min_preserve_seconds = std::numeric_limits<uint64_t>::max();
849
- uint64_t max_preserve_seconds = std::numeric_limits<uint64_t>::min();
850
- std::vector<SuperVersionContext> sv_contexts;
828
+ // We assume InstallSuperVersionForConfigChange has already ensured suitable
829
+ // mappings are present for each relevant CF. We just need to be sure the DB's
830
+ // seqno_to_time_mapping_ and worker scheduler are appropriate for the
831
+ // combination of CF settings.
832
+
833
+ MinAndMaxPreserveSeconds preserve_info;
834
+ uint64_t seqno_time_cadence;
851
835
  {
852
836
  InstrumentedMutexLock l(&mutex_);
853
837
 
854
838
  for (auto cfd : *versions_->GetColumnFamilySet()) {
855
- auto& mopts = *cfd->GetLatestMutableCFOptions();
856
- // preserve time is the max of 2 options.
857
- uint64_t preserve_seconds =
858
- std::max(mopts.preserve_internal_time_seconds,
859
- mopts.preclude_last_level_data_seconds);
860
- if (!cfd->IsDropped() && preserve_seconds > 0) {
861
- min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds);
862
- max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds);
839
+ auto& mopts = cfd->GetLatestMutableCFOptions();
840
+ if (!cfd->IsDropped()) {
841
+ preserve_info.Combine(mopts);
863
842
  }
864
843
  }
865
- size_t old_mapping_size = seqno_to_time_mapping_.Size();
866
- if (min_preserve_seconds == std::numeric_limits<uint64_t>::max()) {
867
- // Don't track
844
+ seqno_time_cadence = preserve_info.GetRecodingCadence();
845
+ if (seqno_time_cadence == 0) {
846
+ // To return as much as possible to the feature being disabled,
847
+ // clear the existing mapping
868
848
  seqno_to_time_mapping_.SetCapacity(0);
869
849
  seqno_to_time_mapping_.SetMaxTimeSpan(UINT64_MAX);
850
+ assert(seqno_to_time_mapping_.Empty());
870
851
  } else {
871
852
  uint64_t cap = std::min(kMaxSeqnoToTimeEntries,
872
- max_preserve_seconds * kMaxSeqnoTimePairsPerCF /
873
- min_preserve_seconds);
853
+ preserve_info.max_preserve_seconds *
854
+ kMaxSeqnoTimePairsPerCF /
855
+ preserve_info.min_preserve_seconds);
874
856
  seqno_to_time_mapping_.SetCapacity(cap);
875
- seqno_to_time_mapping_.SetMaxTimeSpan(max_preserve_seconds);
876
- }
877
- if (old_mapping_size != seqno_to_time_mapping_.Size()) {
878
- InstallSeqnoToTimeMappingInSV(&sv_contexts);
857
+ seqno_to_time_mapping_.SetMaxTimeSpan(preserve_info.max_preserve_seconds);
879
858
  }
880
859
  }
881
860
 
882
- // clean up outside db mutex
883
- for (SuperVersionContext& sv_context : sv_contexts) {
884
- sv_context.Clean();
885
- }
886
- sv_contexts.clear();
887
-
888
- uint64_t seqno_time_cadence = 0;
889
- if (min_preserve_seconds != std::numeric_limits<uint64_t>::max()) {
890
- // round up to 1 when the time_duration is smaller than
891
- // kMaxSeqnoTimePairsPerCF
892
- seqno_time_cadence = (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) /
893
- kMaxSeqnoTimePairsPerCF;
894
- }
895
-
896
861
  TEST_SYNC_POINT_CALLBACK(
897
862
  "DBImpl::RegisterRecordSeqnoTimeWorker:BeforePeriodicTaskType", nullptr);
898
863
 
@@ -900,68 +865,10 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
900
865
  if (seqno_time_cadence == 0) {
901
866
  s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime);
902
867
  } else {
903
- // Before registering the periodic task, we need to be sure to fulfill two
904
- // promises:
905
- // 1) Any DB created with preserve/preclude options set from the beginning
906
- // will get pre-allocated seqnos with pre-populated time mappings back to
907
- // the times we are interested in. (This will enable future import of data
908
- // while preserving rough write time. We can only do this reliably from
909
- // DB::Open, as otherwise there could be a race between CreateColumnFamily
910
- // and the first Write to the DB, and seqno-to-time mappings need to be
911
- // monotonic.
912
- // 2) In any DB, any data written after setting preserve/preclude options
913
- // must have a reasonable time estimate (so that we can accurately place
914
- // the data), which means at least one entry in seqno_to_time_mapping_.
915
- //
916
- // FIXME: We don't currently guarantee that if the first column family with
917
- // that setting is added or configured after initial DB::Open but before
918
- // the first user Write. Fixing this causes complications with the crash
919
- // test because if DB starts without preserve/preclude option, does some
920
- // user writes but all those writes are lost in crash, then re-opens with
921
- // preserve/preclude option, it sees seqno==1 which looks like one of the
922
- // user writes was recovered, when actually it was not.
923
- bool last_seqno_zero = GetLatestSequenceNumber() == 0;
924
- assert(!is_new_db || last_seqno_zero);
925
- if (is_new_db && last_seqno_zero) {
926
- // Pre-allocate seqnos and pre-populate historical mapping
927
- // We can simply modify these, before writes are allowed
928
- constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST;
929
- versions_->SetLastAllocatedSequence(kMax);
930
- versions_->SetLastPublishedSequence(kMax);
931
- versions_->SetLastSequence(kMax);
932
-
933
- // And record in manifest, to avoid going backwards in seqno on re-open
934
- // (potentially with different options). Concurrency is simple because we
935
- // are in DB::Open
936
- {
937
- InstrumentedMutexLock l(&mutex_);
938
- VersionEdit edit;
939
- edit.SetLastSequence(kMax);
940
- s = versions_->LogAndApplyToDefaultColumnFamily(
941
- read_options, write_options, &edit, &mutex_,
942
- directories_.GetDbDir());
943
- if (!s.ok() && versions_->io_status().IsIOError()) {
944
- error_handler_.SetBGError(versions_->io_status(),
945
- BackgroundErrorReason::kManifestWrite);
946
- }
947
- }
948
-
949
- // Pre-populate mappings for reserved sequence numbers.
950
- RecordSeqnoToTimeMapping(max_preserve_seconds);
951
- } else {
952
- if (!last_seqno_zero) {
953
- // Ensure at least one mapping (or log a warning), and
954
- // an updated entry whenever relevant SetOptions is called
955
- RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0);
956
- } else {
957
- // FIXME (see limitation described above)
958
- }
959
- }
960
-
961
868
  s = periodic_task_scheduler_.Register(
962
869
  PeriodicTaskType::kRecordSeqnoTime,
963
870
  periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
964
- seqno_time_cadence);
871
+ seqno_time_cadence, /*run_immediately=*/true);
965
872
  }
966
873
 
967
874
  return s;
@@ -1167,7 +1074,7 @@ void DBImpl::DumpStats() {
1167
1074
  }
1168
1075
 
1169
1076
  auto* table_factory =
1170
- cfd->GetCurrentMutableCFOptions()->table_factory.get();
1077
+ cfd->GetCurrentMutableCFOptions().table_factory.get();
1171
1078
  assert(table_factory != nullptr);
1172
1079
  // FIXME: need to a shared_ptr if/when block_cache is going to be mutable
1173
1080
  Cache* cache =
@@ -1252,11 +1159,11 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
1252
1159
 
1253
1160
  void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
1254
1161
  mutex_.AssertHeld();
1255
- if (!job_context->logs_to_free.empty()) {
1256
- for (auto l : job_context->logs_to_free) {
1162
+ if (!job_context->wals_to_free.empty()) {
1163
+ for (auto l : job_context->wals_to_free) {
1257
1164
  AddToLogsToFreeQueue(l);
1258
1165
  }
1259
- job_context->logs_to_free.clear();
1166
+ job_context->wals_to_free.clear();
1260
1167
  }
1261
1168
  }
1262
1169
 
@@ -1286,36 +1193,72 @@ Status DBImpl::SetOptions(
1286
1193
  }
1287
1194
 
1288
1195
  InstrumentedMutexLock ol(&options_mutex_);
1289
- MutableCFOptions new_options;
1196
+ MutableCFOptions new_options_copy; // For logging outside of DB mutex
1290
1197
  Status s;
1291
1198
  Status persist_options_status;
1292
1199
  SuperVersionContext sv_context(/* create_superversion */ true);
1293
1200
  {
1294
1201
  auto db_options = GetDBOptions();
1295
1202
  InstrumentedMutexLock l(&mutex_);
1296
- s = cfd->SetOptions(db_options, options_map);
1203
+ // Manifest writers + Version appenders like flush and compaction use
1204
+ // LogAndApply, which releases DB mutex to wait for other manifest writers
1205
+ // and for the manifest write. We need to append a Version for the options
1206
+ // to take full effect (e.g. compaction scores), but we don't want to
1207
+ // interleave with other callers of LogAndApply, which could at least
1208
+ // temporarily roll back option changes. Thus, we use a special call to
1209
+ // LogAndApply that allows us to
1210
+ //
1211
+ // (a) Apply the options update when we know we are the exclusive version
1212
+ // appender + (fake) manifest writer, and
1213
+ //
1214
+ // (b) Append a new Version without manifest write nor DB mutex release
1215
+ //
1216
+ // Thus aren't releasing the DB mutex from LogAndApply calling pre_cb,
1217
+ // through installing the new Version until the end of this block, after
1218
+ // installing the new SuperVersion.
1219
+ auto pre_cb = [&]() -> Status {
1220
+ Status cb_s = cfd->SetOptions(db_options, options_map);
1221
+ if (cb_s.ok()) {
1222
+ new_options_copy = cfd->GetLatestMutableCFOptions();
1223
+ }
1224
+ return cb_s;
1225
+ };
1226
+ VersionEdit dummy_edit;
1227
+ dummy_edit.MarkNoManifestWriteDummy();
1228
+ TEST_SYNC_POINT_CALLBACK("DBImpl::SetOptions:dummy_edit", &dummy_edit);
1229
+ s = versions_->LogAndApply(
1230
+ cfd, read_options, write_options, &dummy_edit, &mutex_,
1231
+ directories_.GetDbDir(), false /*new_descriptor_log=*/,
1232
+ nullptr /*new_opts*/, {} /*manifest_wcb*/, pre_cb);
1233
+ if (!versions_->io_status().ok()) {
1234
+ assert(!s.ok());
1235
+ error_handler_.SetBGError(versions_->io_status(),
1236
+ BackgroundErrorReason::kManifestWrite);
1237
+ }
1238
+
1297
1239
  if (s.ok()) {
1298
- new_options = *cfd->GetLatestMutableCFOptions();
1299
- // Append new version to recompute compaction score.
1300
- VersionEdit dummy_edit;
1301
- s = versions_->LogAndApply(cfd, new_options, read_options, write_options,
1302
- &dummy_edit, &mutex_, directories_.GetDbDir());
1303
1240
  // Trigger possible flush/compactions. This has to be before we persist
1304
1241
  // options to file, otherwise there will be a deadlock with writer
1305
1242
  // thread.
1306
- InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options);
1307
-
1243
+ InstallSuperVersionForConfigChange(cfd, &sv_context);
1308
1244
  persist_options_status =
1309
1245
  WriteOptionsFile(write_options, true /*db_mutex_already_held*/);
1310
1246
  bg_cv_.SignalAll();
1247
+
1248
+ #if __cplusplus >= 202002L
1249
+ assert(new_options_copy == cfd->GetLatestMutableCFOptions());
1250
+ assert(cfd->GetLatestMutableCFOptions() ==
1251
+ cfd->GetCurrentMutableCFOptions());
1252
+ assert(cfd->GetCurrentMutableCFOptions() ==
1253
+ cfd->current()->GetMutableCFOptions());
1254
+ #endif
1311
1255
  }
1312
1256
  }
1313
1257
  sv_context.Clean();
1314
1258
 
1315
1259
  if (s.ok() && (options_map.count("preserve_internal_time_seconds") > 0 ||
1316
1260
  options_map.count("preclude_last_level_data_seconds") > 0)) {
1317
- s = RegisterRecordSeqnoTimeWorker(read_options, write_options,
1318
- false /* is_new_db*/);
1261
+ s = RegisterRecordSeqnoTimeWorker();
1319
1262
  }
1320
1263
 
1321
1264
  ROCKS_LOG_INFO(
@@ -1328,7 +1271,7 @@ Status DBImpl::SetOptions(
1328
1271
  if (s.ok()) {
1329
1272
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
1330
1273
  "[%s] SetOptions() succeeded", cfd->GetName().c_str());
1331
- new_options.Dump(immutable_db_options_.info_log.get());
1274
+ new_options_copy.Dump(immutable_db_options_.info_log.get());
1332
1275
  if (!persist_options_status.ok()) {
1333
1276
  // NOTE: WriteOptionsFile already logs on failure
1334
1277
  s = persist_options_status;
@@ -1435,7 +1378,7 @@ Status DBImpl::SetDBOptions(
1435
1378
  s = periodic_task_scheduler_.Register(
1436
1379
  PeriodicTaskType::kDumpStats,
1437
1380
  periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
1438
- new_options.stats_dump_period_sec);
1381
+ new_options.stats_dump_period_sec, /*run_immediately=*/true);
1439
1382
  }
1440
1383
  if (new_options.max_total_wal_size !=
1441
1384
  mutable_db_options_.max_total_wal_size) {
@@ -1450,7 +1393,7 @@ Status DBImpl::SetDBOptions(
1450
1393
  s = periodic_task_scheduler_.Register(
1451
1394
  PeriodicTaskType::kPersistStats,
1452
1395
  periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
1453
- new_options.stats_persist_period_sec);
1396
+ new_options.stats_persist_period_sec, /*run_immediately=*/true);
1454
1397
  }
1455
1398
  }
1456
1399
  mutex_.Lock();
@@ -1479,7 +1422,7 @@ Status DBImpl::SetDBOptions(
1479
1422
  WriteThread::Writer w;
1480
1423
  write_thread_.EnterUnbatched(&w, &mutex_);
1481
1424
  if (wal_other_option_changed ||
1482
- total_log_size_ > GetMaxTotalWalSize()) {
1425
+ wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize()) {
1483
1426
  Status purge_wal_status = SwitchWAL(&write_context);
1484
1427
  if (!purge_wal_status.ok()) {
1485
1428
  ROCKS_LOG_WARN(immutable_db_options_.info_log,
@@ -1506,14 +1449,9 @@ Status DBImpl::SetDBOptions(
1506
1449
  ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
1507
1450
  new_options.Dump(immutable_db_options_.info_log.get());
1508
1451
  if (!persist_options_status.ok()) {
1509
- if (immutable_db_options_.fail_if_options_file_error) {
1510
- s = Status::IOError(
1511
- "SetDBOptions() succeeded, but unable to persist options",
1512
- persist_options_status.ToString());
1513
- }
1514
- ROCKS_LOG_WARN(immutable_db_options_.info_log,
1515
- "Unable to persist options in SetDBOptions() -- %s",
1516
- persist_options_status.ToString().c_str());
1452
+ s = Status::IOError(
1453
+ "SetDBOptions() succeeded, but unable to persist options",
1454
+ persist_options_status.ToString());
1517
1455
  }
1518
1456
  } else {
1519
1457
  ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
@@ -1548,8 +1486,8 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
1548
1486
  if (manual_wal_flush_) {
1549
1487
  IOStatus io_s;
1550
1488
  {
1551
- // We need to lock log_write_mutex_ since logs_ might change concurrently
1552
- InstrumentedMutexLock wl(&log_write_mutex_);
1489
+ // We need to lock wal_write_mutex_ since logs_ might change concurrently
1490
+ InstrumentedMutexLock wl(&wal_write_mutex_);
1553
1491
  log::Writer* cur_log_writer = logs_.back().writer;
1554
1492
  io_s = cur_log_writer->WriteBuffer(write_options);
1555
1493
  }
@@ -1576,7 +1514,7 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
1576
1514
  }
1577
1515
 
1578
1516
  bool DBImpl::WALBufferIsEmpty() {
1579
- InstrumentedMutexLock l(&log_write_mutex_);
1517
+ InstrumentedMutexLock l(&wal_write_mutex_);
1580
1518
  log::Writer* cur_log_writer = logs_.back().writer;
1581
1519
  auto res = cur_log_writer->BufferIsEmpty();
1582
1520
  return res;
@@ -1584,7 +1522,7 @@ bool DBImpl::WALBufferIsEmpty() {
1584
1522
 
1585
1523
  Status DBImpl::GetOpenWalSizes(std::map<uint64_t, uint64_t>& number_to_size) {
1586
1524
  assert(number_to_size.empty());
1587
- InstrumentedMutexLock l(&log_write_mutex_);
1525
+ InstrumentedMutexLock l(&wal_write_mutex_);
1588
1526
  for (auto& log : logs_) {
1589
1527
  auto* open_file = log.writer->file();
1590
1528
  if (open_file) {
@@ -1626,15 +1564,15 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
1626
1564
  uint64_t up_to_number;
1627
1565
 
1628
1566
  {
1629
- InstrumentedMutexLock l(&log_write_mutex_);
1567
+ InstrumentedMutexLock l(&wal_write_mutex_);
1630
1568
  assert(!logs_.empty());
1631
1569
 
1632
- maybe_active_number = logfile_number_;
1570
+ maybe_active_number = cur_wal_number_;
1633
1571
  up_to_number =
1634
1572
  include_current_wal ? maybe_active_number : maybe_active_number - 1;
1635
1573
 
1636
1574
  while (logs_.front().number <= up_to_number && logs_.front().IsSyncing()) {
1637
- log_sync_cv_.Wait();
1575
+ wal_sync_cv_.Wait();
1638
1576
  }
1639
1577
  // First check that logs are safe to sync in background.
1640
1578
  if (include_current_wal &&
@@ -1658,7 +1596,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
1658
1596
  }
1659
1597
  }
1660
1598
 
1661
- need_wal_dir_sync = !log_dir_synced_;
1599
+ need_wal_dir_sync = !wal_dir_synced_;
1662
1600
  }
1663
1601
 
1664
1602
  if (include_current_wal) {
@@ -1731,7 +1669,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
1731
1669
  /*arg=*/nullptr);
1732
1670
  }
1733
1671
  {
1734
- InstrumentedMutexLock l(&log_write_mutex_);
1672
+ InstrumentedMutexLock l(&wal_write_mutex_);
1735
1673
  for (auto* wal : wals_internally_closed) {
1736
1674
  // We can only modify the state of log::Writer under the mutex
1737
1675
  bool was_closed = wal->PublishIfClosed();
@@ -1848,9 +1786,9 @@ Status DBImpl::UnlockWAL() {
1848
1786
 
1849
1787
  void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
1850
1788
  VersionEdit* synced_wals) {
1851
- log_write_mutex_.AssertHeld();
1852
- if (synced_dir && logfile_number_ == up_to) {
1853
- log_dir_synced_ = true;
1789
+ wal_write_mutex_.AssertHeld();
1790
+ if (synced_dir && cur_wal_number_ == up_to) {
1791
+ wal_dir_synced_ = true;
1854
1792
  }
1855
1793
  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
1856
1794
  auto& wal = *it;
@@ -1872,7 +1810,7 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
1872
1810
  (immutable_db_options_.background_close_inactive_wals &&
1873
1811
  wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize())) {
1874
1812
  // Fully synced
1875
- logs_to_free_.push_back(wal.ReleaseWriter());
1813
+ wals_to_free_.push_back(wal.ReleaseWriter());
1876
1814
  it = logs_.erase(it);
1877
1815
  } else {
1878
1816
  wal.FinishSync();
@@ -1885,17 +1823,17 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
1885
1823
  ++it;
1886
1824
  }
1887
1825
  }
1888
- log_sync_cv_.SignalAll();
1826
+ wal_sync_cv_.SignalAll();
1889
1827
  }
1890
1828
 
1891
1829
  void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
1892
- log_write_mutex_.AssertHeld();
1830
+ wal_write_mutex_.AssertHeld();
1893
1831
  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
1894
1832
  ++it) {
1895
1833
  auto& wal = *it;
1896
1834
  wal.FinishSync();
1897
1835
  }
1898
- log_sync_cv_.SignalAll();
1836
+ wal_sync_cv_.SignalAll();
1899
1837
  }
1900
1838
 
1901
1839
  SequenceNumber DBImpl::GetLatestSequenceNumber() const {
@@ -1931,6 +1869,69 @@ Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
1931
1869
  return Status::OK();
1932
1870
  }
1933
1871
 
1872
+ Status DBImpl::GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family,
1873
+ std::string* newest_timestamp) {
1874
+ if (newest_timestamp == nullptr) {
1875
+ return Status::InvalidArgument("newest_timestamp is nullptr");
1876
+ }
1877
+ ColumnFamilyData* cfd = nullptr;
1878
+ if (column_family == nullptr) {
1879
+ cfd = default_cf_handle_->cfd();
1880
+ } else {
1881
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
1882
+ assert(cfh != nullptr);
1883
+ cfd = cfh->cfd();
1884
+ }
1885
+ assert(cfd != nullptr && cfd->user_comparator() != nullptr);
1886
+ if (cfd->user_comparator()->timestamp_size() == 0) {
1887
+ return Status::InvalidArgument(
1888
+ "Timestamp is not enabled in this column family");
1889
+ }
1890
+ if (cfd->ioptions().persist_user_defined_timestamps) {
1891
+ return Status::NotSupported(
1892
+ "GetNewestUserDefinedTimestamp doesn't support the case when user"
1893
+ "defined timestamps are persisted.");
1894
+ }
1895
+
1896
+ Status status;
1897
+ // Acquire SuperVersion
1898
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
1899
+ {
1900
+ InstrumentedMutexLock l(&mutex_);
1901
+ bool enter_write_thread = sv->mem == cfd->mem();
1902
+ WriteThread::Writer w;
1903
+ // Enter write thread to read the mutable memtable to avoid racing access
1904
+ // with concurrent writes. No need to enter nonmem_write_thread_ since this
1905
+ // call only care about memtable writes, not WAL writes.
1906
+ if (enter_write_thread) {
1907
+ write_thread_.EnterUnbatched(&w, &mutex_);
1908
+ WaitForPendingWrites();
1909
+ }
1910
+ *newest_timestamp = sv->mem->GetNewestUDT().ToString();
1911
+ assert(!newest_timestamp->empty() || sv->mem->IsEmpty());
1912
+ if (enter_write_thread) {
1913
+ write_thread_.ExitUnbatched(&w);
1914
+ }
1915
+ }
1916
+ // Read from immutable memtables if nothing found in mutable memtable.
1917
+ if (newest_timestamp->empty()) {
1918
+ *newest_timestamp = sv->imm->GetNewestUDT().ToString();
1919
+ }
1920
+ // Read from SST files if no result can be found in memtables.
1921
+ if (newest_timestamp->empty() && sv->current->GetSstFilesSize() != 0) {
1922
+ // full_history_ts_low is used to track the exclusive upperbound of
1923
+ // flushed user defined timestamp. So we can use it to deduce the newest
1924
+ // timestamp in the SST files that the column family has seen.
1925
+ Slice full_history_ts_low = sv->full_history_ts_low;
1926
+ if (!full_history_ts_low.empty()) {
1927
+ GetU64CutoffTsFromFullHistoryTsLow(&full_history_ts_low,
1928
+ newest_timestamp);
1929
+ }
1930
+ }
1931
+ ReturnAndCleanupSuperVersion(cfd, sv);
1932
+ return status;
1933
+ }
1934
+
1934
1935
  InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
1935
1936
  Arena* arena,
1936
1937
  SequenceNumber sequence,
@@ -1964,10 +1965,10 @@ void DBImpl::BackgroundCallPurge() {
1964
1965
  TEST_SYNC_POINT("DBImpl::BackgroundCallPurge:beforeMutexLock");
1965
1966
  mutex_.Lock();
1966
1967
 
1967
- while (!logs_to_free_queue_.empty()) {
1968
- assert(!logs_to_free_queue_.empty());
1969
- log::Writer* log_writer = *(logs_to_free_queue_.begin());
1970
- logs_to_free_queue_.pop_front();
1968
+ while (!wals_to_free_queue_.empty()) {
1969
+ assert(!wals_to_free_queue_.empty());
1970
+ log::Writer* log_writer = *(wals_to_free_queue_.begin());
1971
+ wals_to_free_queue_.pop_front();
1971
1972
  mutex_.Unlock();
1972
1973
  delete log_writer;
1973
1974
  mutex_.Lock();
@@ -2110,7 +2111,7 @@ InternalIterator* DBImpl::NewInternalIterator(
2110
2111
  } else {
2111
2112
  mem_tombstone_iter = std::make_unique<TruncatedRangeDelIterator>(
2112
2113
  std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
2113
- &cfd->ioptions()->internal_comparator, nullptr /* smallest */,
2114
+ &cfd->ioptions().internal_comparator, nullptr /* smallest */,
2114
2115
  nullptr /* largest */);
2115
2116
  }
2116
2117
  merge_iter_builder.AddPointAndTombstoneIterator(
@@ -2559,6 +2560,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
2559
2560
  // Return all merge operands for get_impl_options.key
2560
2561
  *get_impl_options.number_of_operands =
2561
2562
  static_cast<int>(merge_context.GetNumOperands());
2563
+ // OK status is returned, some merge operand is found.
2564
+ assert(*get_impl_options.number_of_operands > 0);
2562
2565
  if (*get_impl_options.number_of_operands >
2563
2566
  get_impl_options.get_merge_operands_options
2564
2567
  ->expected_max_number_of_operands) {
@@ -2663,7 +2666,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
2663
2666
  }
2664
2667
  };
2665
2668
 
2666
- bool last_try = false;
2669
+ bool acquire_mutex = false;
2667
2670
  if (cf_list->size() == 1) {
2668
2671
  // Fast path for a single column family. We can simply get the thread local
2669
2672
  // super version
@@ -2712,29 +2715,32 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
2712
2715
  // sure.
2713
2716
  constexpr int num_retries = 3;
2714
2717
  for (int i = 0; i < num_retries; ++i) {
2715
- last_try = (i == num_retries - 1);
2718
+ // When reading from kPersistedTier, we want a consistent view into CFs.
2719
+ // So we take mutex to prevent any SV change in any CF.
2720
+ acquire_mutex = ((i == num_retries - 1) && !read_options.snapshot) ||
2721
+ read_options.read_tier == kPersistedTier;
2716
2722
  bool retry = false;
2717
2723
 
2718
2724
  if (i > 0) {
2719
2725
  sv_cleanup_func();
2720
2726
  }
2721
2727
  if (read_options.snapshot == nullptr) {
2722
- if (last_try) {
2723
- TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry");
2724
- // We're close to max number of retries. For the last retry,
2725
- // acquire the lock so we're sure to succeed
2726
- mutex_.Lock();
2727
- }
2728
2728
  *snapshot = GetLastPublishedSequence();
2729
2729
  } else {
2730
2730
  *snapshot =
2731
2731
  static_cast_with_check<const SnapshotImpl>(read_options.snapshot)
2732
2732
  ->number_;
2733
2733
  }
2734
+ if (acquire_mutex) {
2735
+ TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry");
2736
+ // We're close to max number of retries. For the last retry,
2737
+ // acquire the lock so we're sure to succeed
2738
+ mutex_.Lock();
2739
+ }
2734
2740
  for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
2735
2741
  ++cf_iter) {
2736
2742
  auto node = iter_deref_func(cf_iter);
2737
- if (!last_try) {
2743
+ if (!acquire_mutex) {
2738
2744
  if (extra_sv_ref) {
2739
2745
  node->super_version = node->cfd->GetReferencedSuperVersion(this);
2740
2746
  } else {
@@ -2758,7 +2764,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
2758
2764
  }
2759
2765
  }
2760
2766
  TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::BeforeCheckingSnapshot");
2761
- if (read_options.snapshot != nullptr || last_try) {
2767
+ if (read_options.snapshot != nullptr || acquire_mutex) {
2762
2768
  // If user passed a snapshot, then we don't care if a memtable is
2763
2769
  // sealed or compaction happens because the snapshot would ensure
2764
2770
  // that older key versions are kept around. If this is the last
@@ -2769,7 +2775,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
2769
2775
  // memtables, which will include immutable memtables as well, but that
2770
2776
  // might be tricky to maintain in case we decide, in future, to do
2771
2777
  // memtable compaction.
2772
- if (!last_try) {
2778
+ if (!acquire_mutex) {
2773
2779
  SequenceNumber seq =
2774
2780
  node->super_version->mem->GetEarliestSequenceNumber();
2775
2781
  if (seq > *snapshot) {
@@ -2779,19 +2785,20 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
2779
2785
  }
2780
2786
  }
2781
2787
  if (!retry) {
2782
- if (last_try) {
2788
+ if (acquire_mutex) {
2783
2789
  mutex_.Unlock();
2784
2790
  TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::AfterLastTryRefSV");
2785
2791
  }
2786
2792
  break;
2787
2793
  }
2794
+ assert(!acquire_mutex);
2788
2795
  }
2789
2796
  }
2790
2797
 
2791
2798
  TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum1");
2792
2799
  TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum2");
2793
2800
  PERF_TIMER_STOP(get_snapshot_time);
2794
- *sv_from_thread_local = !last_try;
2801
+ *sv_from_thread_local = !acquire_mutex;
2795
2802
  if (!s.ok()) {
2796
2803
  sv_cleanup_func();
2797
2804
  }
@@ -3497,7 +3504,7 @@ void DBImpl::MultiGetEntityWithCallback(
3497
3504
  }
3498
3505
 
3499
3506
  Status DBImpl::WrapUpCreateColumnFamilies(
3500
- const ReadOptions& read_options, const WriteOptions& write_options,
3507
+ const WriteOptions& write_options,
3501
3508
  const std::vector<const ColumnFamilyOptions*>& cf_options) {
3502
3509
  options_mutex_.AssertHeld();
3503
3510
 
@@ -3514,8 +3521,7 @@ Status DBImpl::WrapUpCreateColumnFamilies(
3514
3521
  // Attempt both follow-up actions even if one fails
3515
3522
  Status s = WriteOptionsFile(write_options, false /*db_mutex_already_held*/);
3516
3523
  if (register_worker) {
3517
- s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(read_options, write_options,
3518
- /* is_new_db */ false));
3524
+ s.UpdateIfOk(RegisterRecordSeqnoTimeWorker());
3519
3525
  }
3520
3526
  return s;
3521
3527
  }
@@ -3530,8 +3536,7 @@ Status DBImpl::CreateColumnFamily(const ReadOptions& read_options,
3530
3536
  Status s = CreateColumnFamilyImpl(read_options, write_options, cf_options,
3531
3537
  column_family, handle);
3532
3538
  if (s.ok()) {
3533
- s.UpdateIfOk(
3534
- WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options}));
3539
+ s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options}));
3535
3540
  }
3536
3541
  return s;
3537
3542
  }
@@ -3558,8 +3563,7 @@ Status DBImpl::CreateColumnFamilies(
3558
3563
  success_once = true;
3559
3564
  }
3560
3565
  if (success_once) {
3561
- s.UpdateIfOk(
3562
- WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options}));
3566
+ s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options}));
3563
3567
  }
3564
3568
  return s;
3565
3569
  }
@@ -3589,8 +3593,7 @@ Status DBImpl::CreateColumnFamilies(
3589
3593
  cf_opts.push_back(&column_families[i].options);
3590
3594
  }
3591
3595
  if (success_once) {
3592
- s.UpdateIfOk(
3593
- WrapUpCreateColumnFamilies(read_options, write_options, cf_opts));
3596
+ s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, cf_opts));
3594
3597
  }
3595
3598
  return s;
3596
3599
  }
@@ -3631,7 +3634,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
3631
3634
  edit.AddColumnFamily(column_family_name);
3632
3635
  uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
3633
3636
  edit.SetColumnFamily(new_id);
3634
- edit.SetLogNumber(logfile_number_);
3637
+ edit.SetLogNumber(cur_wal_number_);
3635
3638
  edit.SetComparatorName(cf_options.comparator->Name());
3636
3639
  edit.SetPersistUserDefinedTimestamps(
3637
3640
  cf_options.persist_user_defined_timestamps);
@@ -3643,9 +3646,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
3643
3646
  write_thread_.EnterUnbatched(&w, &mutex_);
3644
3647
  // LogAndApply will both write the creation in MANIFEST and create
3645
3648
  // ColumnFamilyData object
3646
- s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options),
3647
- read_options, write_options, &edit, &mutex_,
3648
- directories_.GetDbDir(), false, &cf_options);
3649
+ s = versions_->LogAndApply(nullptr, read_options, write_options, &edit,
3650
+ &mutex_, directories_.GetDbDir(), false,
3651
+ &cf_options);
3649
3652
  write_thread_.ExitUnbatched(&w);
3650
3653
  }
3651
3654
  if (s.ok()) {
@@ -3659,8 +3662,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
3659
3662
  auto* cfd =
3660
3663
  versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
3661
3664
  assert(cfd != nullptr);
3662
- InstallSuperVersionAndScheduleWork(cfd, &sv_context,
3663
- *cfd->GetLatestMutableCFOptions());
3665
+ InstallSuperVersionForConfigChange(cfd, &sv_context);
3664
3666
 
3665
3667
  if (!cfd->mem()->IsSnapshotSupported()) {
3666
3668
  is_snapshot_supported_ = false;
@@ -3744,7 +3746,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
3744
3746
  Status s;
3745
3747
  // Save re-aquiring lock for RegisterRecordSeqnoTimeWorker when not
3746
3748
  // applicable
3747
- bool used_preserve_preclude = false;
3749
+ MinAndMaxPreserveSeconds preserve_info;
3748
3750
  {
3749
3751
  InstrumentedMutexLock l(&mutex_);
3750
3752
  if (cfd->IsDropped()) {
@@ -3754,17 +3756,15 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
3754
3756
  // we drop column family from a single write thread
3755
3757
  WriteThread::Writer w;
3756
3758
  write_thread_.EnterUnbatched(&w, &mutex_);
3757
- s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
3758
- read_options, write_options, &edit, &mutex_,
3759
- directories_.GetDbDir());
3759
+ s = versions_->LogAndApply(cfd, read_options, write_options, &edit,
3760
+ &mutex_, directories_.GetDbDir());
3760
3761
  write_thread_.ExitUnbatched(&w);
3761
3762
  }
3762
3763
  if (s.ok()) {
3763
- auto& moptions = *cfd->GetLatestMutableCFOptions();
3764
+ auto& moptions = cfd->GetLatestMutableCFOptions();
3764
3765
  max_total_in_memory_state_ -=
3765
3766
  moptions.write_buffer_size * moptions.max_write_buffer_number;
3766
- used_preserve_preclude = moptions.preserve_internal_time_seconds > 0 ||
3767
- moptions.preclude_last_level_data_seconds > 0;
3767
+ preserve_info.Combine(moptions);
3768
3768
  }
3769
3769
 
3770
3770
  if (!cf_support_snapshot) {
@@ -3782,9 +3782,8 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
3782
3782
  bg_cv_.SignalAll();
3783
3783
  }
3784
3784
 
3785
- if (used_preserve_preclude) {
3786
- s = RegisterRecordSeqnoTimeWorker(read_options, write_options,
3787
- /* is_new_db */ false);
3785
+ if (preserve_info.IsEnabled()) {
3786
+ s = RegisterRecordSeqnoTimeWorker();
3788
3787
  }
3789
3788
 
3790
3789
  if (s.ok()) {
@@ -3834,6 +3833,16 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options,
3834
3833
  return s.ok() || s.IsIncomplete();
3835
3834
  }
3836
3835
 
3836
+ std::unique_ptr<MultiScan> DBImpl::NewMultiScan(
3837
+ const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
3838
+ const std::vector<ScanOptions>& scan_opts) {
3839
+ std::unique_ptr<Iterator> iter(NewIterator(_read_options, column_family));
3840
+ iter->Prepare(scan_opts);
3841
+ std::unique_ptr<MultiScan> ms_iter =
3842
+ std::make_unique<MultiScan>(scan_opts, std::move(iter));
3843
+ return ms_iter;
3844
+ }
3845
+
3837
3846
  Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
3838
3847
  ColumnFamilyHandle* column_family) {
3839
3848
  if (_read_options.io_activity != Env::IOActivity::kUnknown &&
@@ -3890,11 +3899,14 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
3890
3899
 
3891
3900
  auto iter = new ForwardIterator(this, read_options, cfd, sv,
3892
3901
  /* allow_unprepared_value */ true);
3893
- result = NewDBIterator(
3894
- env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
3895
- cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
3896
- sv->mutable_cf_options.max_sequential_skip_in_iterations,
3897
- nullptr /* read_callback */, cfh);
3902
+ // TODO(cbi): Add support for `memtable_op_scan_flush_trigger` for tailing
3903
+ // iterator. This requires refreshing DBIter's pointer to active_mem when
3904
+ // tailing iterator refreshes to new memtable internally.
3905
+ result = DBIter::NewIter(env_, read_options, cfd->ioptions(),
3906
+ sv->mutable_cf_options, cfd->user_comparator(),
3907
+ iter, sv->current, kMaxSequenceNumber,
3908
+ /*read_callback=*/nullptr, /*active_mem=*/nullptr,
3909
+ cfh, /*expose_blob_index=*/false);
3898
3910
  } else {
3899
3911
  // Note: no need to consider the special case of
3900
3912
  // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
@@ -3972,18 +3984,9 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(
3972
3984
  // Laying out the iterators in the order of being accessed makes it more
3973
3985
  // likely that any iterator pointer is close to the iterator it points to so
3974
3986
  // that they are likely to be in the same cache line and/or page.
3975
- ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
3976
- env_, read_options, *cfh->cfd()->ioptions(), sv->mutable_cf_options,
3977
- sv->current, snapshot,
3978
- sv->mutable_cf_options.max_sequential_skip_in_iterations,
3979
- sv->version_number, read_callback, cfh, expose_blob_index, allow_refresh);
3980
-
3981
- InternalIterator* internal_iter = NewInternalIterator(
3982
- db_iter->GetReadOptions(), cfh->cfd(), sv, db_iter->GetArena(), snapshot,
3983
- /* allow_unprepared_value */ true, db_iter);
3984
- db_iter->SetIterUnderDBIter(internal_iter);
3985
-
3986
- return db_iter;
3987
+ return NewArenaWrappedDbIterator(
3988
+ env_, read_options, cfh, sv, snapshot, read_callback, this,
3989
+ expose_blob_index, allow_refresh, /*allow_mark_memtable_for_flush=*/true);
3987
3990
  }
3988
3991
 
3989
3992
  std::unique_ptr<Iterator> DBImpl::NewCoalescingIterator(
@@ -4107,14 +4110,12 @@ Status DBImpl::NewIterators(
4107
4110
  auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
4108
4111
  cf_sv_pair.super_version,
4109
4112
  /* allow_unprepared_value */ true);
4110
- iterators->push_back(
4111
- NewDBIterator(env_, read_options, *cf_sv_pair.cfd->ioptions(),
4112
- cf_sv_pair.super_version->mutable_cf_options,
4113
- cf_sv_pair.cfd->user_comparator(), iter,
4114
- cf_sv_pair.super_version->current, kMaxSequenceNumber,
4115
- cf_sv_pair.super_version->mutable_cf_options
4116
- .max_sequential_skip_in_iterations,
4117
- nullptr /*read_callback*/, cf_sv_pair.cfh));
4113
+ iterators->push_back(DBIter::NewIter(
4114
+ env_, read_options, cf_sv_pair.cfd->ioptions(),
4115
+ cf_sv_pair.super_version->mutable_cf_options,
4116
+ cf_sv_pair.cfd->user_comparator(), iter,
4117
+ cf_sv_pair.super_version->current, kMaxSequenceNumber,
4118
+ nullptr /*read_callback*/, /*active_mem=*/nullptr, cf_sv_pair.cfh));
4118
4119
  }
4119
4120
  } else {
4120
4121
  for (const auto& cf_sv_pair : cf_sv_pairs) {
@@ -4346,7 +4347,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
4346
4347
  CfdList cf_scheduled;
4347
4348
  if (oldest_snapshot > bottommost_files_mark_threshold_) {
4348
4349
  for (auto* cfd : *versions_->GetColumnFamilySet()) {
4349
- if (!cfd->ioptions()->allow_ingest_behind) {
4350
+ if (!cfd->ioptions().allow_ingest_behind) {
4350
4351
  cfd->current()->storage_info()->UpdateOldestSnapshot(
4351
4352
  oldest_snapshot, /*allow_ingest_behind=*/false);
4352
4353
  if (!cfd->current()
@@ -4367,7 +4368,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
4367
4368
  SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
4368
4369
  for (auto* cfd : *versions_->GetColumnFamilySet()) {
4369
4370
  if (CfdListContains(cf_scheduled, cfd) ||
4370
- cfd->ioptions()->allow_ingest_behind) {
4371
+ cfd->ioptions().allow_ingest_behind) {
4371
4372
  continue;
4372
4373
  }
4373
4374
  new_bottommost_files_mark_threshold = std::min(
@@ -4446,7 +4447,7 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
4446
4447
  // Add timestamp if needed
4447
4448
  for (size_t i = 0; i < n; i++) {
4448
4449
  auto [start, limit] = MaybeAddTimestampsToRange(
4449
- &range[i].start, &range[i].limit, ts_sz, &keys.emplace_back(),
4450
+ range[i].start, range[i].limit, ts_sz, &keys.emplace_back(),
4450
4451
  &keys.emplace_back(), /*exclusive_end=*/false);
4451
4452
  assert(start.has_value());
4452
4453
  assert(limit.has_value());
@@ -4463,6 +4464,29 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
4463
4464
  return s;
4464
4465
  }
4465
4466
 
4467
+ Status DBImpl::GetPropertiesOfTablesByLevel(
4468
+ ColumnFamilyHandle* column_family,
4469
+ std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level) {
4470
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
4471
+ auto cfd = cfh->cfd();
4472
+
4473
+ // Increment the ref count
4474
+ mutex_.Lock();
4475
+ auto version = cfd->current();
4476
+ version->Ref();
4477
+ mutex_.Unlock();
4478
+
4479
+ const ReadOptions read_options;
4480
+ auto s = version->GetPropertiesOfTablesByLevel(read_options, props_by_level);
4481
+
4482
+ // Decrement the ref count
4483
+ mutex_.Lock();
4484
+ version->Unref();
4485
+ mutex_.Unlock();
4486
+
4487
+ return s;
4488
+ }
4489
+
4466
4490
  const std::string& DBImpl::GetName() const { return dbname_; }
4467
4491
 
4468
4492
  Env* DBImpl::GetEnv() const { return env_; }
@@ -4763,7 +4787,7 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
4763
4787
  // Add timestamp if needed
4764
4788
  std::string start_with_ts, limit_with_ts;
4765
4789
  auto [start, limit] = MaybeAddTimestampsToRange(
4766
- &range.start, &range.limit, ts_sz, &start_with_ts, &limit_with_ts);
4790
+ range.start, range.limit, ts_sz, &start_with_ts, &limit_with_ts);
4767
4791
  assert(start.has_value());
4768
4792
  assert(limit.has_value());
4769
4793
  // Convert user_key into a corresponding internal key.
@@ -4801,9 +4825,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
4801
4825
  for (int i = 0; i < n; i++) {
4802
4826
  // Add timestamp if needed
4803
4827
  std::string start_with_ts, limit_with_ts;
4804
- auto [start, limit] =
4805
- MaybeAddTimestampsToRange(&range[i].start, &range[i].limit, ts_sz,
4806
- &start_with_ts, &limit_with_ts);
4828
+ auto [start, limit] = MaybeAddTimestampsToRange(
4829
+ range[i].start, range[i].limit, ts_sz, &start_with_ts, &limit_with_ts);
4807
4830
  assert(start.has_value());
4808
4831
  assert(limit.has_value());
4809
4832
  // Convert user_key into a corresponding internal key.
@@ -4878,112 +4901,8 @@ Status DBImpl::GetUpdatesSince(
4878
4901
  return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
4879
4902
  }
4880
4903
 
4881
- Status DBImpl::DeleteFile(std::string name) {
4882
- // TODO: plumb Env::IOActivity, Env::IOPriority
4883
- const ReadOptions read_options;
4884
- const WriteOptions write_options;
4885
-
4886
- uint64_t number;
4887
- FileType type;
4888
- WalFileType log_type;
4889
- if (!ParseFileName(name, &number, &type, &log_type) ||
4890
- (type != kTableFile && type != kWalFile)) {
4891
- ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
4892
- name.c_str());
4893
- return Status::InvalidArgument("Invalid file name");
4894
- }
4895
-
4896
- if (type == kWalFile) {
4897
- // Only allow deleting archived log files
4898
- if (log_type != kArchivedLogFile) {
4899
- ROCKS_LOG_ERROR(immutable_db_options_.info_log,
4900
- "DeleteFile %s failed - not archived log.\n",
4901
- name.c_str());
4902
- return Status::NotSupported("Delete only supported for archived logs");
4903
- }
4904
- Status status = wal_manager_.DeleteFile(name, number);
4905
- if (!status.ok()) {
4906
- ROCKS_LOG_ERROR(immutable_db_options_.info_log,
4907
- "DeleteFile %s failed -- %s.\n", name.c_str(),
4908
- status.ToString().c_str());
4909
- }
4910
- return status;
4911
- }
4912
-
4913
- Status status;
4914
- int level;
4915
- FileMetaData* metadata;
4916
- ColumnFamilyData* cfd;
4917
- VersionEdit edit;
4918
- JobContext job_context(next_job_id_.fetch_add(1), true);
4919
- {
4920
- InstrumentedMutexLock l(&mutex_);
4921
- status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
4922
- if (!status.ok()) {
4923
- ROCKS_LOG_WARN(immutable_db_options_.info_log,
4924
- "DeleteFile %s failed. File not found\n", name.c_str());
4925
- job_context.Clean();
4926
- return Status::InvalidArgument("File not found");
4927
- }
4928
- assert(level < cfd->NumberLevels());
4929
-
4930
- // If the file is being compacted no need to delete.
4931
- if (metadata->being_compacted) {
4932
- ROCKS_LOG_INFO(immutable_db_options_.info_log,
4933
- "DeleteFile %s Skipped. File about to be compacted\n",
4934
- name.c_str());
4935
- job_context.Clean();
4936
- return Status::OK();
4937
- }
4938
-
4939
- // Only the files in the last level can be deleted externally.
4940
- // This is to make sure that any deletion tombstones are not
4941
- // lost. Check that the level passed is the last level.
4942
- auto* vstoreage = cfd->current()->storage_info();
4943
- for (int i = level + 1; i < cfd->NumberLevels(); i++) {
4944
- if (vstoreage->NumLevelFiles(i) != 0) {
4945
- ROCKS_LOG_WARN(immutable_db_options_.info_log,
4946
- "DeleteFile %s FAILED. File not in last level\n",
4947
- name.c_str());
4948
- job_context.Clean();
4949
- return Status::InvalidArgument("File not in last level");
4950
- }
4951
- }
4952
- // if level == 0, it has to be the oldest file
4953
- if (level == 0 &&
4954
- vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) {
4955
- ROCKS_LOG_WARN(immutable_db_options_.info_log,
4956
- "DeleteFile %s failed ---"
4957
- " target file in level 0 must be the oldest.",
4958
- name.c_str());
4959
- job_context.Clean();
4960
- return Status::InvalidArgument("File in level 0, but not oldest");
4961
- }
4962
- edit.SetColumnFamily(cfd->GetID());
4963
- edit.DeleteFile(level, number);
4964
- status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
4965
- read_options, write_options, &edit, &mutex_,
4966
- directories_.GetDbDir());
4967
- if (status.ok()) {
4968
- InstallSuperVersionAndScheduleWork(
4969
- cfd, job_context.superversion_contexts.data(),
4970
- *cfd->GetLatestMutableCFOptions());
4971
- }
4972
- FindObsoleteFiles(&job_context, false);
4973
- } // lock released here
4974
-
4975
- LogFlush(immutable_db_options_.info_log);
4976
- // remove files outside the db-lock
4977
- if (job_context.HaveSomethingToDelete()) {
4978
- // Call PurgeObsoleteFiles() without holding mutex.
4979
- PurgeObsoleteFiles(job_context);
4980
- }
4981
- job_context.Clean();
4982
- return status;
4983
- }
4984
-
4985
4904
  Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
4986
- const RangePtr* ranges, size_t n,
4905
+ const RangeOpt* ranges, size_t n,
4987
4906
  bool include_end) {
4988
4907
  // TODO: plumb Env::IOActivity, Env::IOPriority
4989
4908
  const ReadOptions read_options;
@@ -4995,7 +4914,7 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
4995
4914
  const Comparator* ucmp = cfd->user_comparator();
4996
4915
  assert(ucmp);
4997
4916
  const size_t ts_sz = ucmp->timestamp_size();
4998
- autovector<UserKeyRangePtr> ukey_ranges;
4917
+ autovector<UserKeyRangeOpt> ukey_ranges;
4999
4918
  std::vector<std::string> keys;
5000
4919
  std::vector<Slice> key_slices;
5001
4920
  ukey_ranges.reserve(n);
@@ -5005,8 +4924,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
5005
4924
  auto [start, limit] = MaybeAddTimestampsToRange(
5006
4925
  ranges[i].start, ranges[i].limit, ts_sz, &keys.emplace_back(),
5007
4926
  &keys.emplace_back(), !include_end);
5008
- assert((ranges[i].start != nullptr) == start.has_value());
5009
- assert((ranges[i].limit != nullptr) == limit.has_value());
4927
+ assert(ranges[i].start.has_value() == start.has_value());
4928
+ assert(ranges[i].limit.has_value() == limit.has_value());
5010
4929
  ukey_ranges.emplace_back(start, limit);
5011
4930
  }
5012
4931
 
@@ -5066,21 +4985,19 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
5066
4985
  }
5067
4986
  }
5068
4987
  if (!deleted_files.empty()) {
5069
- vstorage->ComputeCompactionScore(*cfd->ioptions(),
5070
- *cfd->GetLatestMutableCFOptions());
4988
+ vstorage->ComputeCompactionScore(cfd->ioptions(),
4989
+ cfd->GetLatestMutableCFOptions());
5071
4990
  }
5072
4991
  if (edit.GetDeletedFiles().empty()) {
5073
4992
  job_context.Clean();
5074
4993
  return status;
5075
4994
  }
5076
4995
  input_version->Ref();
5077
- status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
5078
- read_options, write_options, &edit, &mutex_,
5079
- directories_.GetDbDir());
4996
+ status = versions_->LogAndApply(cfd, read_options, write_options, &edit,
4997
+ &mutex_, directories_.GetDbDir());
5080
4998
  if (status.ok()) {
5081
4999
  InstallSuperVersionAndScheduleWork(
5082
- cfd, job_context.superversion_contexts.data(),
5083
- *cfd->GetLatestMutableCFOptions());
5000
+ cfd, job_context.superversion_contexts.data());
5084
5001
  }
5085
5002
  for (auto* deleted_file : deleted_files) {
5086
5003
  deleted_file->being_compacted = false;
@@ -5114,7 +5031,6 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
5114
5031
  assert(column_family);
5115
5032
  auto* cfd =
5116
5033
  static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
5117
- auto* sv = GetAndRefSuperVersion(cfd);
5118
5034
  {
5119
5035
  // Without mutex, Version::GetColumnFamilyMetaData will have data race
5120
5036
  // with Compaction::MarkFilesBeingCompacted. One solution is to use mutex,
@@ -5126,9 +5042,8 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
5126
5042
  // DB::GetColumnFamilyMetaData is not called frequently, the regression
5127
5043
  // should not be big. We still need to keep an eye on it.
5128
5044
  InstrumentedMutexLock l(&mutex_);
5129
- sv->current->GetColumnFamilyMetaData(cf_meta);
5045
+ cfd->current()->GetColumnFamilyMetaData(cf_meta);
5130
5046
  }
5131
- ReturnAndCleanupSuperVersion(cfd, sv);
5132
5047
  }
5133
5048
 
5134
5049
  void DBImpl::GetAllColumnFamilyMetaData(
@@ -5557,12 +5472,7 @@ Status DBImpl::WriteOptionsFile(const WriteOptions& write_options,
5557
5472
  if (!s.ok()) {
5558
5473
  ROCKS_LOG_WARN(immutable_db_options_.info_log,
5559
5474
  "Unnable to persist options -- %s", s.ToString().c_str());
5560
- if (immutable_db_options_.fail_if_options_file_error) {
5561
- s = Status::IOError("Unable to persist options.", s.ToString().c_str());
5562
- } else {
5563
- // Ignore error
5564
- s = Status::OK();
5565
- }
5475
+ s = Status::IOError("Unable to persist options.", s.ToString().c_str());
5566
5476
  }
5567
5477
 
5568
5478
  // Restore lock if appropriate
@@ -5679,7 +5589,7 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name,
5679
5589
  void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
5680
5590
  if (immutable_db_options_.enable_thread_tracking) {
5681
5591
  ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
5682
- cfd->ioptions()->env);
5592
+ cfd->ioptions().env);
5683
5593
  }
5684
5594
  }
5685
5595
 
@@ -5897,6 +5807,7 @@ Status DBImpl::IngestExternalFile(
5897
5807
 
5898
5808
  Status DBImpl::IngestExternalFiles(
5899
5809
  const std::vector<IngestExternalFileArg>& args) {
5810
+ PERF_TIMER_GUARD(file_ingestion_nanos);
5900
5811
  // TODO: plumb Env::IOActivity, Env::IOPriority
5901
5812
  const WriteOptions write_options;
5902
5813
 
@@ -5943,6 +5854,27 @@ Status DBImpl::IngestExternalFiles(
5943
5854
  "timestamps enabled doesn't support ingest behind.");
5944
5855
  }
5945
5856
  }
5857
+ if (arg.atomic_replace_range.has_value()) {
5858
+ if (ingest_opts.ingest_behind) {
5859
+ return Status::InvalidArgument(
5860
+ "Can't combine atomic_replace_range with ingest_behind.");
5861
+ }
5862
+ if (ingest_opts.snapshot_consistency) {
5863
+ // TODO: support generating and ingesting a big tombstone file, which
5864
+ // might depend on non-nullptr start and limit
5865
+ return Status::NotSupported(
5866
+ "atomic_replace_range not yet supported with "
5867
+ "snapshot_consistency.");
5868
+ } else {
5869
+ if (arg.atomic_replace_range->start.has_value() ^
5870
+ arg.atomic_replace_range->limit.has_value()) {
5871
+ return Status::NotSupported(
5872
+ "Only one of atomic_replace_range.{start,limit}.has_value() is "
5873
+ "not supported.");
5874
+ }
5875
+ }
5876
+ }
5877
+
5946
5878
  if (ingest_opts.allow_db_generated_files) {
5947
5879
  if (ingest_opts.write_global_seqno) {
5948
5880
  return Status::NotSupported(
@@ -5991,8 +5923,8 @@ Status DBImpl::IngestExternalFiles(
5991
5923
  this);
5992
5924
  Status es = ingestion_jobs[i].Prepare(
5993
5925
  args[i].external_files, args[i].files_checksums,
5994
- args[i].files_checksum_func_names, args[i].file_temperature,
5995
- start_file_number, super_version);
5926
+ args[i].files_checksum_func_names, args[i].atomic_replace_range,
5927
+ args[i].file_temperature, start_file_number, super_version);
5996
5928
  // capture first error only
5997
5929
  if (!es.ok() && status.ok()) {
5998
5930
  status = es;
@@ -6007,8 +5939,8 @@ Status DBImpl::IngestExternalFiles(
6007
5939
  this);
6008
5940
  Status es = ingestion_jobs[0].Prepare(
6009
5941
  args[0].external_files, args[0].files_checksums,
6010
- args[0].files_checksum_func_names, args[0].file_temperature,
6011
- next_file_number, super_version);
5942
+ args[0].files_checksum_func_names, args[0].atomic_replace_range,
5943
+ args[0].file_temperature, next_file_number, super_version);
6012
5944
  if (!es.ok()) {
6013
5945
  status = es;
6014
5946
  }
@@ -6041,6 +5973,7 @@ Status DBImpl::IngestExternalFiles(
6041
5973
  if (two_write_queues_) {
6042
5974
  nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
6043
5975
  }
5976
+ PERF_TIMER_GUARD(file_ingestion_blocking_live_writes_nanos);
6044
5977
 
6045
5978
  // When unordered_write is enabled, the keys are writing to memtable in an
6046
5979
  // unordered way. If the ingestion job checks memtable key range before the
@@ -6051,6 +5984,7 @@ Status DBImpl::IngestExternalFiles(
6051
5984
 
6052
5985
  num_running_ingest_file_ += static_cast<int>(num_cfs);
6053
5986
  TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
5987
+ TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter:2");
6054
5988
 
6055
5989
  bool at_least_one_cf_need_flush = false;
6056
5990
  std::vector<bool> need_flush(num_cfs, false);
@@ -6121,14 +6055,12 @@ Status DBImpl::IngestExternalFiles(
6121
6055
  ReadOptions read_options;
6122
6056
  read_options.fill_cache = args[0].options.fill_cache;
6123
6057
  autovector<ColumnFamilyData*> cfds_to_commit;
6124
- autovector<const MutableCFOptions*> mutable_cf_options_list;
6125
6058
  autovector<autovector<VersionEdit*>> edit_lists;
6126
6059
  uint32_t num_entries = 0;
6127
6060
  for (size_t i = 0; i != num_cfs; ++i) {
6128
6061
  auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
6129
6062
  assert(!cfd->IsDropped());
6130
6063
  cfds_to_commit.push_back(cfd);
6131
- mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
6132
6064
  autovector<VersionEdit*> edit_list;
6133
6065
  edit_list.push_back(ingestion_jobs[i].edit());
6134
6066
  edit_lists.push_back(edit_list);
@@ -6143,10 +6075,10 @@ Status DBImpl::IngestExternalFiles(
6143
6075
  }
6144
6076
  assert(0 == num_entries);
6145
6077
  }
6146
- status = versions_->LogAndApply(
6147
- cfds_to_commit, mutable_cf_options_list, read_options, write_options,
6078
+ status =
6079
+ versions_->LogAndApply(cfds_to_commit, read_options, write_options,
6148
6080
 
6149
- edit_lists, &mutex_, directories_.GetDbDir());
6081
+ edit_lists, &mutex_, directories_.GetDbDir());
6150
6082
  // It is safe to update VersionSet last seqno here after LogAndApply since
6151
6083
  // LogAndApply persists last sequence number from VersionEdits,
6152
6084
  // which are from file's largest seqno and not from VersionSet.
@@ -6178,8 +6110,7 @@ Status DBImpl::IngestExternalFiles(
6178
6110
  for (size_t i = 0; i != num_cfs; ++i) {
6179
6111
  auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
6180
6112
  assert(!cfd->IsDropped());
6181
- InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
6182
- *cfd->GetLatestMutableCFOptions());
6113
+ InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i]);
6183
6114
  #ifndef NDEBUG
6184
6115
  if (0 == i && num_cfs > 1) {
6185
6116
  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
@@ -6203,6 +6134,7 @@ Status DBImpl::IngestExternalFiles(
6203
6134
  nonmem_write_thread_.ExitUnbatched(&nonmem_w);
6204
6135
  }
6205
6136
  write_thread_.ExitUnbatched(&w);
6137
+ PERF_TIMER_STOP(file_ingestion_blocking_live_writes_nanos);
6206
6138
 
6207
6139
  if (status.ok()) {
6208
6140
  for (auto& job : ingestion_jobs) {
@@ -6297,12 +6229,11 @@ Status DBImpl::CreateColumnFamilyWithImport(
6297
6229
  // and this will overwrite the external file. To protect the external
6298
6230
  // file, we have to make sure the file number will never being reused.
6299
6231
  next_file_number = versions_->FetchAddFileNumber(total_file_num);
6300
- auto cf_options = cfd->GetLatestMutableCFOptions();
6301
6232
  status =
6302
- versions_->LogAndApply(cfd, *cf_options, read_options, write_options,
6303
- &dummy_edit, &mutex_, directories_.GetDbDir());
6233
+ versions_->LogAndApply(cfd, read_options, write_options, &dummy_edit,
6234
+ &mutex_, directories_.GetDbDir());
6304
6235
  if (status.ok()) {
6305
- InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
6236
+ InstallSuperVersionForConfigChange(cfd, &dummy_sv_ctx);
6306
6237
  }
6307
6238
  }
6308
6239
  }
@@ -6335,12 +6266,11 @@ Status DBImpl::CreateColumnFamilyWithImport(
6335
6266
 
6336
6267
  // Install job edit [Mutex will be unlocked here]
6337
6268
  if (status.ok()) {
6338
- auto cf_options = cfd->GetLatestMutableCFOptions();
6339
- status = versions_->LogAndApply(cfd, *cf_options, read_options,
6340
- write_options, import_job.edit(),
6341
- &mutex_, directories_.GetDbDir());
6269
+ status = versions_->LogAndApply(cfd, read_options, write_options,
6270
+ import_job.edit(), &mutex_,
6271
+ directories_.GetDbDir());
6342
6272
  if (status.ok()) {
6343
- InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
6273
+ InstallSuperVersionForConfigChange(cfd, &sv_context);
6344
6274
  }
6345
6275
  }
6346
6276
 
@@ -6401,9 +6331,9 @@ Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family,
6401
6331
 
6402
6332
  if (status.ok()) {
6403
6333
  // DeleteFilesInRanges non-overlap files except L0
6404
- std::vector<RangePtr> ranges;
6405
- ranges.emplace_back(nullptr, &begin_key);
6406
- ranges.emplace_back(&end_key, nullptr);
6334
+ std::vector<RangeOpt> ranges;
6335
+ ranges.emplace_back(OptSlice{}, begin_key);
6336
+ ranges.emplace_back(end_key, OptSlice{});
6407
6337
  status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size());
6408
6338
  }
6409
6339
 
@@ -6541,7 +6471,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
6541
6471
  const auto& fd = fd_with_krange.fd;
6542
6472
  const FileMetaData* fmeta = fd_with_krange.file_metadata;
6543
6473
  assert(fmeta);
6544
- std::string fname = TableFileName(cfd->ioptions()->cf_paths,
6474
+ std::string fname = TableFileName(cfd->ioptions().cf_paths,
6545
6475
  fd.GetNumber(), fd.GetPathId());
6546
6476
  if (use_file_checksum) {
6547
6477
  s = VerifyFullFileChecksum(fmeta->file_checksum,
@@ -6565,7 +6495,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
6565
6495
  const uint64_t blob_file_number = meta->GetBlobFileNumber();
6566
6496
 
6567
6497
  const std::string blob_file_name = BlobFileName(
6568
- cfd->ioptions()->cf_paths.front().path, blob_file_number);
6498
+ cfd->ioptions().cf_paths.front().path, blob_file_number);
6569
6499
  s = VerifyFullFileChecksum(meta->GetChecksumValue(),
6570
6500
  meta->GetChecksumMethod(), blob_file_name,
6571
6501
  read_options);
@@ -6758,16 +6688,15 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion(
6758
6688
  pending_output_elem.reset(new std::list<uint64_t>::iterator(
6759
6689
  CaptureCurrentFileNumberInPendingOutputs()));
6760
6690
  *next_file_number = versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
6761
- auto cf_options = cfd->GetLatestMutableCFOptions();
6762
6691
  VersionEdit dummy_edit;
6763
6692
  // If crash happen after a hard link established, Recover function may
6764
6693
  // reuse the file number that has already assigned to the internal file,
6765
6694
  // and this will overwrite the external file. To protect the external
6766
6695
  // file, we have to make sure the file number will never being reused.
6767
- s = versions_->LogAndApply(cfd, *cf_options, read_options, write_options,
6768
- &dummy_edit, &mutex_, directories_.GetDbDir());
6696
+ s = versions_->LogAndApply(cfd, read_options, write_options, &dummy_edit,
6697
+ &mutex_, directories_.GetDbDir());
6769
6698
  if (s.ok()) {
6770
- InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
6699
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx);
6771
6700
  }
6772
6701
  dummy_sv_ctx.Clean();
6773
6702
  return s;
@@ -6801,60 +6730,199 @@ Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
6801
6730
  }
6802
6731
  }
6803
6732
 
6804
- void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
6733
+ std::pair<SequenceNumber, uint64_t> DBImpl::GetSeqnoToTimeSample() const {
6805
6734
  // TECHNICALITY: Sample last sequence number *before* time, as prescribed
6806
6735
  // for SeqnoToTimeMapping. We don't know how long it has been since the last
6807
6736
  // sequence number was written, so we at least have a one-sided bound by
6808
6737
  // sampling in this order.
6738
+ // ALSO, to avoid out-of-order mappings, we need to get the seqno and times
6739
+ // while holding the DB mutex. (This is really to make testing happy because
6740
+ // it's fine to throw out extra close-but-not-quite-consistent mappings in
6741
+ // production.)
6742
+ mutex_.AssertHeld();
6809
6743
  SequenceNumber seqno = GetLatestSequenceNumber();
6744
+ // HACK/TODO: seqno might be zero but we can't record a mapping for that.
6745
+ // Start with 1, which should be close enough.
6746
+ seqno = std::max(seqno, SequenceNumber{1});
6810
6747
  int64_t unix_time_signed = 0;
6811
6748
  immutable_db_options_.clock->GetCurrentTime(&unix_time_signed)
6812
6749
  .PermitUncheckedError(); // Ignore error
6813
- uint64_t unix_time = static_cast<uint64_t>(unix_time_signed);
6750
+ return {seqno, static_cast<uint64_t>(unix_time_signed)};
6751
+ }
6814
6752
 
6815
- std::vector<SuperVersionContext> sv_contexts;
6816
- if (populate_historical_seconds > 0) {
6817
- bool success = true;
6818
- {
6819
- InstrumentedMutexLock l(&mutex_);
6820
- if (seqno > 1 && unix_time > populate_historical_seconds) {
6821
- // seqno=0 is reserved
6822
- SequenceNumber from_seqno = 1;
6823
- success = seqno_to_time_mapping_.PrePopulate(
6824
- from_seqno, seqno, unix_time - populate_historical_seconds,
6825
- unix_time);
6826
- InstallSeqnoToTimeMappingInSV(&sv_contexts);
6827
- } else {
6828
- // One of these will fail
6829
- assert(seqno > 1);
6830
- assert(unix_time > populate_historical_seconds);
6831
- success = false;
6832
- }
6833
- }
6834
- if (success) {
6835
- ROCKS_LOG_INFO(
6836
- immutable_db_options_.info_log,
6837
- "Pre-populated sequence number to time entries: [1,%" PRIu64
6838
- "] -> [%" PRIu64 ",%" PRIu64 "]",
6839
- seqno, unix_time - populate_historical_seconds, unix_time);
6840
- } else {
6841
- ROCKS_LOG_WARN(
6842
- immutable_db_options_.info_log,
6843
- "Failed to pre-populate sequence number to time entries: [1,%" PRIu64
6844
- "] -> [%" PRIu64 ",%" PRIu64 "]",
6845
- seqno, unix_time - populate_historical_seconds, unix_time);
6846
- }
6753
+ void DBImpl::EnsureSeqnoToTimeMapping(
6754
+ const MinAndMaxPreserveSeconds& preserve_info) {
6755
+ mutex_.AssertHeld();
6756
+ assert(preserve_info.IsEnabled());
6757
+
6758
+ // Atomically with CF creation or mutable option change (see
6759
+ // InstallSuperVersionForConfigChange()), we need to be sure any data written
6760
+ // after setting preserve/preclude options must have a reasonable time
6761
+ // estimate (so that we can accurately place the data), which means at least
6762
+ // one entry in seqno_to_time_mapping_. It's not critical that `preserve_info`
6763
+ // take into account all CFs, as that's mostly relevant to how we add
6764
+ // recurring entries and purge old ones.
6765
+
6766
+ auto [seqno, unix_time_now] = GetSeqnoToTimeSample();
6767
+ // Ensure at least one sample that is sufficiently recent
6768
+ uint64_t unix_time_last_sample = 0;
6769
+ if (seqno_to_time_mapping_.Empty()) {
6770
+ // The exact best settings will be found and applied in
6771
+ // RegisterRecordSeqnoTimeWorker()
6772
+ seqno_to_time_mapping_.SetCapacity(kMaxSeqnoToTimeEntries);
6847
6773
  } else {
6848
- InstrumentedMutexLock l(&mutex_);
6849
- // FIXME: assert(seqno > 0);
6774
+ unix_time_last_sample =
6775
+ seqno_to_time_mapping_.GetProximalTimeBeforeSeqno(kMaxSequenceNumber);
6776
+ }
6777
+ uint64_t cadence = preserve_info.GetRecodingCadence();
6778
+ // Extend cadence so as to avoid stepping on toes of recorder job, which
6779
+ // could lag a bit.
6780
+ cadence += 3 + cadence / 100;
6781
+ if (unix_time_now >= cadence &&
6782
+ unix_time_last_sample <= unix_time_now - cadence) {
6783
+ assert(seqno > 0); // See GetSeqnoToTimeSample()
6850
6784
  // Always successful assuming seqno never go backwards
6851
- seqno_to_time_mapping_.Append(seqno, unix_time);
6852
- InstallSeqnoToTimeMappingInSV(&sv_contexts);
6785
+ seqno_to_time_mapping_.Append(seqno, unix_time_now);
6786
+ }
6787
+ }
6788
+
6789
+ void DBImpl::PrepopulateSeqnoToTimeMapping(
6790
+ const MinAndMaxPreserveSeconds& preserve_info) {
6791
+ // Only for opening a new DB, with preserve/preclude options set
6792
+ if (!preserve_info.IsEnabled()) {
6793
+ assert(false);
6794
+ return;
6795
+ }
6796
+ if (GetLatestSequenceNumber() != 0) {
6797
+ assert(false);
6798
+ return;
6853
6799
  }
6854
6800
 
6855
- // clean up outside db mutex
6856
- for (SuperVersionContext& sv_context : sv_contexts) {
6857
- sv_context.Clean();
6801
+ // Here we fulfill the following promise:
6802
+ //
6803
+ // Any DB/CF created with preserve/preclude options set from the beginning
6804
+ // will get pre-allocated seqnos with pre-populated time mappings back to
6805
+ // the times we are interested in. (This will enable future import of data
6806
+ // while preserving rough write time. We can only do this reliably from
6807
+ // DB::Open, as otherwise there could be a race between CreateColumnFamily
6808
+ // and the first Write to the DB, and seqno-to-time mappings need to be
6809
+ // monotonic.
6810
+ //
6811
+ // FIXME: We don't currently guarantee that if the first column family with
6812
+ // that setting is added or configured after initial DB::Open but before
6813
+ // the first user Write. Fixing this causes complications with the crash
6814
+ // test because if DB starts without preserve/preclude option, does some
6815
+ // user writes but all those writes are lost in crash, then re-opens with
6816
+ // preserve/preclude option, it sees seqno==1 which looks like one of the
6817
+ // user writes was recovered, when actually it was not.
6818
+
6819
+ // Pre-allocate seqnos and pre-populate historical mapping
6820
+ // We can simply modify these, before writes are allowed
6821
+ constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST;
6822
+ versions_->SetLastAllocatedSequence(kMax);
6823
+ versions_->SetLastPublishedSequence(kMax);
6824
+ versions_->SetLastSequence(kMax);
6825
+
6826
+ // And record in manifest, to avoid going backwards in seqno on re-open
6827
+ // (potentially with different options). Concurrency is simple because we
6828
+ // are in DB::Open
6829
+ const WriteOptions write_options(Env::IOActivity::kDBOpen);
6830
+ const ReadOptions read_options(Env::IOActivity::kDBOpen);
6831
+ VersionEdit edit;
6832
+ edit.SetLastSequence(kMax);
6833
+ Status s = versions_->LogAndApplyToDefaultColumnFamily(
6834
+ read_options, write_options, &edit, &mutex_, directories_.GetDbDir());
6835
+ if (!s.ok() && versions_->io_status().IsIOError()) {
6836
+ error_handler_.SetBGError(versions_->io_status(),
6837
+ BackgroundErrorReason::kManifestWrite);
6838
+ }
6839
+
6840
+ auto [seqno, unix_time_now] = GetSeqnoToTimeSample();
6841
+ uint64_t populate_historical_seconds = preserve_info.max_preserve_seconds;
6842
+ if (seqno > 1 && unix_time_now > populate_historical_seconds) {
6843
+ // seqno=0 is reserved
6844
+ SequenceNumber from_seqno = 1;
6845
+ seqno_to_time_mapping_.PrePopulate(
6846
+ from_seqno, seqno, unix_time_now - populate_historical_seconds,
6847
+ unix_time_now);
6848
+ } else {
6849
+ // One of these will fail
6850
+ assert(seqno > 1);
6851
+ assert(unix_time_now > populate_historical_seconds);
6852
+ }
6853
+ }
6854
+
6855
+ void DBImpl::InstallSuperVersionForConfigChange(
6856
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context) {
6857
+ MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()};
6858
+ std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping;
6859
+ if (preserve_info.IsEnabled()) {
6860
+ // TODO: detect & optimize if mapping hasn't changed from previous
6861
+ // SuperVersion
6862
+ EnsureSeqnoToTimeMapping(preserve_info);
6863
+ new_seqno_to_time_mapping = std::make_shared<SeqnoToTimeMapping>();
6864
+ new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
6865
+ }
6866
+ InstallSuperVersionAndScheduleWork(cfd, sv_context,
6867
+ std::move(new_seqno_to_time_mapping));
6868
+ }
6869
+
6870
+ void DBImpl::RecordSeqnoToTimeMapping() {
6871
+ SuperVersionContext sv_context;
6872
+ {
6873
+ InstrumentedMutexLock l(&mutex_);
6874
+ // Record next sample
6875
+ seqno_to_time_mapping_.Append(GetSeqnoToTimeSample());
6876
+ // Create an immutable snapshot for sharing across CFs
6877
+ std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
6878
+ std::make_shared<SeqnoToTimeMapping>();
6879
+ new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
6880
+
6881
+ // Update in SV of all applicable CFs
6882
+ for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
6883
+ if (cfd->IsDropped()) {
6884
+ continue;
6885
+ }
6886
+ MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()};
6887
+ if (preserve_info.IsEnabled()) {
6888
+ sv_context.NewSuperVersion();
6889
+ cfd->InstallSuperVersion(&sv_context, &mutex_,
6890
+ new_seqno_to_time_mapping);
6891
+ }
6892
+ }
6893
+ bg_cv_.SignalAll();
6894
+ }
6895
+
6896
+ // clean up & report outside db mutex
6897
+ sv_context.Clean();
6898
+ }
6899
+
6900
+ void DBImpl::TriggerPeriodicCompaction() {
6901
+ TEST_SYNC_POINT("DBImpl::TriggerPeriodicCompaction:StartRunning");
6902
+ {
6903
+ InstrumentedMutexLock l(&mutex_);
6904
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
6905
+ "Running the periodic task to trigger compactions.");
6906
+
6907
+ for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
6908
+ if (cfd->IsDropped()) {
6909
+ continue;
6910
+ }
6911
+ if (cfd->GetLatestCFOptions().periodic_compaction_seconds &&
6912
+ !cfd->queued_for_compaction()) {
6913
+ cfd->current()->storage_info()->ComputeCompactionScore(
6914
+ cfd->ioptions(), cfd->GetLatestMutableCFOptions());
6915
+ EnqueuePendingCompaction(cfd);
6916
+ if (cfd->queued_for_compaction()) {
6917
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
6918
+ "Periodic task to trigger compaction queued Column "
6919
+ "family [%s] for compaction.",
6920
+ cfd->GetName().c_str());
6921
+ }
6922
+ }
6923
+ }
6924
+ MaybeScheduleFlushOrCompaction();
6925
+ bg_cv_.SignalAll();
6858
6926
  }
6859
6927
  }
6860
6928
 
@@ -6914,22 +6982,4 @@ void DBImpl::TrackOrUntrackFiles(
6914
6982
  }
6915
6983
  }
6916
6984
 
6917
- void DBImpl::InstallSeqnoToTimeMappingInSV(
6918
- std::vector<SuperVersionContext>* sv_contexts) {
6919
- mutex_.AssertHeld();
6920
- std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
6921
- std::make_shared<SeqnoToTimeMapping>();
6922
- new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
6923
- for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
6924
- if (cfd->IsDropped()) {
6925
- continue;
6926
- }
6927
- sv_contexts->emplace_back(/*create_superversion=*/true);
6928
- sv_contexts->back().new_seqno_to_time_mapping = new_seqno_to_time_mapping;
6929
- cfd->InstallSuperVersion(&sv_contexts->back(),
6930
- *(cfd->GetLatestMutableCFOptions()));
6931
- }
6932
- bg_cv_.SignalAll();
6933
- }
6934
-
6935
6985
  } // namespace ROCKSDB_NAMESPACE