@nxtedition/rocksdb 13.5.7 → 13.5.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (509) hide show
  1. package/binding.cc +248 -70
  2. package/binding.gyp +2 -2
  3. package/deps/rocksdb/rocksdb/BUCK +12 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -0
  5. package/deps/rocksdb/rocksdb/Makefile +28 -23
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -1
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +1 -2
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +43 -39
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +0 -1
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +2 -3
  12. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
  13. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +1 -3
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +11 -1
  15. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +13 -5
  16. package/deps/rocksdb/rocksdb/crash_test.mk +61 -15
  17. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +136 -45
  18. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +34 -16
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +10 -7
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -2
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +1 -0
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +12 -9
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +3 -4
  24. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +2 -2
  25. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +3 -4
  26. package/deps/rocksdb/rocksdb/db/builder.cc +22 -8
  27. package/deps/rocksdb/rocksdb/db/builder.h +5 -4
  28. package/deps/rocksdb/rocksdb/db/c.cc +556 -15
  29. package/deps/rocksdb/rocksdb/db/c_test.c +133 -12
  30. package/deps/rocksdb/rocksdb/db/column_family.cc +114 -50
  31. package/deps/rocksdb/rocksdb/db/column_family.h +53 -36
  32. package/deps/rocksdb/rocksdb/db/column_family_test.cc +6 -6
  33. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +0 -1
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +95 -70
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +71 -51
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -86
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +26 -68
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +0 -122
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +453 -258
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +117 -92
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +0 -1
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +38 -38
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +24 -17
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +34 -45
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -31
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -3
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +1 -1
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +10 -10
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +2 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +82 -34
  52. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +267 -179
  53. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +4 -1
  54. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +273 -89
  55. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +300 -14
  56. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +4 -4
  57. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +2 -2
  58. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +28 -23
  59. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +69 -51
  60. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +522 -245
  61. package/deps/rocksdb/rocksdb/db/convenience.cc +15 -4
  62. package/deps/rocksdb/rocksdb/db/corruption_test.cc +1 -3
  63. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +0 -2
  64. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +196 -17
  65. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +74 -62
  66. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +48 -0
  67. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +682 -250
  68. package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +0 -1
  69. package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +3 -4
  70. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +11 -16
  71. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +57 -0
  72. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +2 -2
  73. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -1
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +540 -490
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +347 -188
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +584 -217
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +13 -9
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +5 -7
  79. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +40 -36
  80. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -3
  81. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +751 -372
  82. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +35 -32
  83. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +24 -2
  84. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +125 -63
  85. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +2 -2
  86. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +311 -196
  87. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +15 -5
  88. package/deps/rocksdb/rocksdb/db/db_iter.cc +42 -29
  89. package/deps/rocksdb/rocksdb/db/db_iter.h +96 -31
  90. package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +3 -4
  91. package/deps/rocksdb/rocksdb/db/db_iter_test.cc +168 -228
  92. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +454 -0
  93. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +8 -8
  94. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +0 -1
  95. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +90 -0
  96. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +60 -2
  97. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +7 -3
  98. package/deps/rocksdb/rocksdb/db/db_options_test.cc +85 -27
  99. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +3 -1
  100. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +0 -2
  101. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +114 -2
  102. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +0 -1
  103. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +0 -1
  104. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +51 -3
  105. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +0 -1
  106. package/deps/rocksdb/rocksdb/db/db_test.cc +325 -18
  107. package/deps/rocksdb/rocksdb/db/db_test2.cc +644 -20
  108. package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
  109. package/deps/rocksdb/rocksdb/db/db_test_util.h +9 -0
  110. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +64 -45
  111. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +203 -14
  112. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +259 -30
  113. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +0 -1
  114. package/deps/rocksdb/rocksdb/db/db_write_test.cc +75 -1
  115. package/deps/rocksdb/rocksdb/db/dbformat.h +70 -6
  116. package/deps/rocksdb/rocksdb/db/deletefile_test.cc +0 -190
  117. package/deps/rocksdb/rocksdb/db/error_handler.cc +22 -7
  118. package/deps/rocksdb/rocksdb/db/error_handler.h +16 -1
  119. package/deps/rocksdb/rocksdb/db/event_helpers.cc +41 -26
  120. package/deps/rocksdb/rocksdb/db/experimental.cc +4 -3
  121. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +464 -78
  122. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +166 -69
  123. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +54 -25
  124. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +1 -3
  125. package/deps/rocksdb/rocksdb/db/flush_job.cc +98 -81
  126. package/deps/rocksdb/rocksdb/db/flush_job.h +4 -9
  127. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +80 -84
  128. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +1 -1
  129. package/deps/rocksdb/rocksdb/db/forward_iterator.h +2 -2
  130. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +12 -19
  131. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +0 -2
  132. package/deps/rocksdb/rocksdb/db/internal_stats.cc +41 -15
  133. package/deps/rocksdb/rocksdb/db/internal_stats.h +63 -52
  134. package/deps/rocksdb/rocksdb/db/job_context.h +59 -24
  135. package/deps/rocksdb/rocksdb/db/listener_test.cc +69 -10
  136. package/deps/rocksdb/rocksdb/db/log_format.h +11 -2
  137. package/deps/rocksdb/rocksdb/db/log_reader.cc +147 -34
  138. package/deps/rocksdb/rocksdb/db/log_reader.h +40 -11
  139. package/deps/rocksdb/rocksdb/db/log_test.cc +16 -3
  140. package/deps/rocksdb/rocksdb/db/log_writer.cc +102 -55
  141. package/deps/rocksdb/rocksdb/db/log_writer.h +21 -2
  142. package/deps/rocksdb/rocksdb/db/malloc_stats.h +0 -2
  143. package/deps/rocksdb/rocksdb/db/memtable.cc +16 -47
  144. package/deps/rocksdb/rocksdb/db/memtable.h +76 -12
  145. package/deps/rocksdb/rocksdb/db/memtable_list.cc +23 -20
  146. package/deps/rocksdb/rocksdb/db/memtable_list.h +9 -11
  147. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +18 -37
  148. package/deps/rocksdb/rocksdb/db/merge_context.h +2 -1
  149. package/deps/rocksdb/rocksdb/db/merge_test.cc +8 -0
  150. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +3 -5
  151. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +15 -7
  152. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +6 -3
  153. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +22 -4
  154. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +41 -1
  155. package/deps/rocksdb/rocksdb/db/prefix_test.cc +0 -1
  156. package/deps/rocksdb/rocksdb/db/repair.cc +29 -34
  157. package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -1
  158. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +14 -15
  159. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +1 -3
  160. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +47 -1
  161. package/deps/rocksdb/rocksdb/db/table_cache.cc +3 -3
  162. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +1 -3
  163. package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +2 -1
  164. package/deps/rocksdb/rocksdb/db/version_builder.cc +2 -2
  165. package/deps/rocksdb/rocksdb/db/version_edit.cc +8 -37
  166. package/deps/rocksdb/rocksdb/db/version_edit.h +32 -1
  167. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +26 -18
  168. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -5
  169. package/deps/rocksdb/rocksdb/db/version_set.cc +282 -197
  170. package/deps/rocksdb/rocksdb/db/version_set.h +54 -57
  171. package/deps/rocksdb/rocksdb/db/version_set_test.cc +28 -35
  172. package/deps/rocksdb/rocksdb/db/version_util.h +2 -3
  173. package/deps/rocksdb/rocksdb/db/wal_manager.cc +3 -2
  174. package/deps/rocksdb/rocksdb/db/wal_manager.h +0 -1
  175. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +0 -1
  176. package/deps/rocksdb/rocksdb/db/wide/wide_columns.cc +1 -0
  177. package/deps/rocksdb/rocksdb/db/write_batch.cc +22 -8
  178. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +5 -4
  179. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +7 -6
  180. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -4
  181. package/deps/rocksdb/rocksdb/db/write_thread.h +3 -3
  182. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +13 -5
  183. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +9 -2
  184. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +39 -0
  185. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +65 -0
  186. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +45 -22
  187. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +7 -4
  188. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +22 -5
  189. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +28 -3
  190. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -38
  191. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -3
  192. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +80 -32
  193. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +51 -2
  194. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +23 -1
  195. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +305 -15
  196. package/deps/rocksdb/rocksdb/env/env.cc +32 -2
  197. package/deps/rocksdb/rocksdb/env/env_encryption.cc +0 -2
  198. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +2 -4
  199. package/deps/rocksdb/rocksdb/env/env_posix.cc +4 -2
  200. package/deps/rocksdb/rocksdb/env/env_test.cc +0 -1
  201. package/deps/rocksdb/rocksdb/env/fs_posix.cc +20 -11
  202. package/deps/rocksdb/rocksdb/env/fs_readonly.h +0 -2
  203. package/deps/rocksdb/rocksdb/env/fs_remap.cc +0 -2
  204. package/deps/rocksdb/rocksdb/env/fs_remap.h +0 -2
  205. package/deps/rocksdb/rocksdb/env/io_posix.cc +6 -4
  206. package/deps/rocksdb/rocksdb/env/io_posix.h +3 -2
  207. package/deps/rocksdb/rocksdb/env/mock_env.cc +0 -1
  208. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +2 -2
  209. package/deps/rocksdb/rocksdb/file/delete_scheduler.h +0 -2
  210. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +0 -2
  211. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +30 -21
  212. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +16 -0
  213. package/deps/rocksdb/rocksdb/file/file_util.cc +32 -14
  214. package/deps/rocksdb/rocksdb/file/file_util.h +22 -5
  215. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +229 -76
  216. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +21 -12
  217. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +10 -7
  218. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +12 -8
  219. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +1 -2
  220. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +0 -2
  221. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +3 -3
  222. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +598 -0
  223. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_iterator.h +36 -0
  224. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +70 -11
  225. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +232 -11
  226. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -1
  227. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -1
  228. package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +149 -15
  229. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +17 -2
  230. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +132 -34
  231. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +158 -79
  232. package/deps/rocksdb/rocksdb/include/rocksdb/db_bench_tool.h +2 -1
  233. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +4 -5
  234. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +1 -3
  235. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +5 -0
  236. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +275 -0
  237. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +2 -1
  238. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +50 -5
  239. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +10 -0
  240. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +13 -0
  241. package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +0 -1
  242. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +5 -2
  243. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +13 -0
  244. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +237 -0
  245. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +230 -39
  246. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +15 -0
  247. package/deps/rocksdb/rocksdb/include/rocksdb/perf_level.h +31 -11
  248. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +41 -0
  249. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +1 -1
  250. package/deps/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h +0 -1
  251. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +5 -1
  252. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +0 -1
  253. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -3
  254. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +2 -0
  255. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -8
  256. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +19 -2
  257. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -1
  258. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +124 -0
  259. package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +1 -0
  260. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +26 -1
  261. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +55 -6
  262. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +3 -5
  263. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_mirror.h +0 -2
  264. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -2
  265. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +0 -1
  266. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +1 -2
  267. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +0 -1
  268. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +96 -8
  269. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index_faiss.h +117 -0
  270. package/deps/rocksdb/rocksdb/{utilities/secondary_index/faiss_ivf_index.h → include/rocksdb/utilities/secondary_index_simple.h} +11 -14
  271. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +26 -11
  272. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +16 -3
  273. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +0 -2
  274. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +63 -7
  275. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +0 -1
  276. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +28 -12
  277. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +3 -3
  278. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +0 -2
  279. package/deps/rocksdb/rocksdb/logging/event_logger_test.cc +1 -2
  280. package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +1 -1
  281. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +0 -1
  282. package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +0 -1
  283. package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +3 -1
  284. package/deps/rocksdb/rocksdb/memtable/skiplist.h +2 -2
  285. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +2 -4
  286. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +69 -8
  287. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +32 -9
  288. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +58 -45
  289. package/deps/rocksdb/rocksdb/monitoring/histogram.h +1 -1
  290. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -3
  291. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +5 -0
  292. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +1 -1
  293. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +3 -2
  294. package/deps/rocksdb/rocksdb/options/cf_options.cc +44 -13
  295. package/deps/rocksdb/rocksdb/options/cf_options.h +21 -7
  296. package/deps/rocksdb/rocksdb/options/configurable.cc +5 -5
  297. package/deps/rocksdb/rocksdb/options/configurable_test.h +1 -2
  298. package/deps/rocksdb/rocksdb/options/customizable.cc +0 -1
  299. package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -11
  300. package/deps/rocksdb/rocksdb/options/db_options.cc +18 -15
  301. package/deps/rocksdb/rocksdb/options/db_options.h +2 -2
  302. package/deps/rocksdb/rocksdb/options/options.cc +296 -305
  303. package/deps/rocksdb/rocksdb/options/options_helper.cc +188 -62
  304. package/deps/rocksdb/rocksdb/options/options_helper.h +3 -3
  305. package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -4
  306. package/deps/rocksdb/rocksdb/options/options_parser.h +0 -1
  307. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +17 -4
  308. package/deps/rocksdb/rocksdb/options/options_test.cc +101 -76
  309. package/deps/rocksdb/rocksdb/port/lang.h +2 -1
  310. package/deps/rocksdb/rocksdb/port/port_posix.cc +2 -1
  311. package/deps/rocksdb/rocksdb/port/stack_trace.cc +5 -4
  312. package/deps/rocksdb/rocksdb/port/win/env_win.cc +3 -2
  313. package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +99 -1
  314. package/deps/rocksdb/rocksdb/port/win/xpress_win.h +6 -0
  315. package/deps/rocksdb/rocksdb/src.mk +17 -11
  316. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +0 -1
  317. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1094 -929
  318. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +6 -19
  319. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +76 -22
  320. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +2 -0
  321. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +221 -131
  322. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +12 -9
  323. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +23 -24
  324. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -38
  325. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +7 -4
  326. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +5 -5
  327. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +10 -12
  328. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +6 -4
  329. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +35 -43
  330. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +2 -1
  331. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +1 -1
  332. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -2
  333. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +0 -4
  334. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +0 -1
  335. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +3 -3
  336. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +3 -3
  337. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -4
  338. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  339. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +4 -5
  340. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +4 -4
  341. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +37 -35
  342. package/deps/rocksdb/rocksdb/table/block_fetcher.h +11 -7
  343. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +4 -3
  344. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +31 -5
  345. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +2 -1
  346. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +0 -1
  347. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +0 -1
  348. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +0 -1
  349. package/deps/rocksdb/rocksdb/table/external_table.cc +483 -0
  350. package/deps/rocksdb/rocksdb/table/format.cc +62 -44
  351. package/deps/rocksdb/rocksdb/table/format.h +35 -12
  352. package/deps/rocksdb/rocksdb/table/internal_iterator.h +3 -13
  353. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
  354. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +6 -0
  355. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +150 -141
  356. package/deps/rocksdb/rocksdb/table/meta_blocks.h +5 -0
  357. package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -2
  358. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +8 -0
  359. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +0 -1
  360. package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +0 -2
  361. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +0 -2
  362. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +0 -1
  363. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +6 -6
  364. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  365. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +86 -7
  366. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +88 -2
  367. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +0 -1
  368. package/deps/rocksdb/rocksdb/table/table_builder.h +10 -1
  369. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +3 -2
  370. package/deps/rocksdb/rocksdb/table/table_test.cc +899 -22
  371. package/deps/rocksdb/rocksdb/test_util/testutil.cc +3 -4
  372. package/deps/rocksdb/rocksdb/test_util/testutil.h +132 -1
  373. package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +0 -1
  374. package/deps/rocksdb/rocksdb/test_util/transaction_test_util.h +0 -2
  375. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +163 -77
  376. package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +0 -2
  377. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +0 -1
  378. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +0 -1
  379. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +120 -52
  380. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +1 -0
  381. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -1
  382. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +0 -2
  383. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +2 -2
  384. package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +0 -2
  385. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +2 -1
  386. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +94 -0
  387. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +0 -1
  388. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +0 -1
  389. package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +1 -1
  390. package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +2 -1
  391. package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +3 -5
  392. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
  393. package/deps/rocksdb/rocksdb/util/async_file_reader.h +15 -8
  394. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.cc +131 -0
  395. package/deps/rocksdb/rocksdb/util/auto_skip_compressor.h +90 -0
  396. package/deps/rocksdb/rocksdb/util/autovector.h +1 -1
  397. package/deps/rocksdb/rocksdb/util/autovector_test.cc +2 -2
  398. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +0 -2
  399. package/deps/rocksdb/rocksdb/util/compression.cc +936 -4
  400. package/deps/rocksdb/rocksdb/util/compression.h +348 -232
  401. package/deps/rocksdb/rocksdb/util/compression_test.cc +229 -0
  402. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +10 -10
  403. package/deps/rocksdb/rocksdb/util/crc32c_ppc.c +1 -0
  404. package/deps/rocksdb/rocksdb/util/data_structure.cc +2 -0
  405. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +1 -3
  406. package/deps/rocksdb/rocksdb/util/ppc-opcode.h +5 -5
  407. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +108 -0
  408. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +67 -0
  409. package/deps/rocksdb/rocksdb/util/slice_test.cc +83 -0
  410. package/deps/rocksdb/rocksdb/util/string_util.cc +0 -2
  411. package/deps/rocksdb/rocksdb/util/string_util.h +10 -0
  412. package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -1
  413. package/deps/rocksdb/rocksdb/util/udt_util.cc +18 -5
  414. package/deps/rocksdb/rocksdb/util/udt_util.h +10 -7
  415. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +650 -154
  416. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +438 -144
  417. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +0 -1
  418. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +0 -1
  419. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +0 -1
  420. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +16 -17
  421. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +2 -1
  422. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +0 -1
  423. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +0 -1
  424. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +7 -8
  425. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +4 -3
  426. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.h +0 -1
  427. package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +0 -1
  428. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +2 -2
  429. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  430. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +0 -48
  431. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -1
  432. package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +0 -1
  433. package/deps/rocksdb/rocksdb/utilities/debug.cc +7 -14
  434. package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +0 -1
  435. package/deps/rocksdb/rocksdb/utilities/env_mirror_test.cc +0 -2
  436. package/deps/rocksdb/rocksdb/utilities/env_timed.cc +0 -1
  437. package/deps/rocksdb/rocksdb/utilities/env_timed_test.cc +0 -2
  438. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +5 -3
  439. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +10 -9
  440. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +0 -1
  441. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +0 -1
  442. package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +1 -0
  443. package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +0 -2
  444. package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +0 -1
  445. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +0 -1
  446. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +0 -1
  447. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +0 -2
  448. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +0 -2
  449. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc +0 -1
  450. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +0 -2
  451. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table.h +0 -2
  452. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +0 -2
  453. package/deps/rocksdb/rocksdb/utilities/persistent_cache/lrulist.h +0 -2
  454. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.h +0 -2
  455. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc +0 -1
  456. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +0 -2
  457. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +0 -1
  458. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +0 -2
  459. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +183 -32
  460. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +258 -12
  461. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_helper.h +33 -0
  462. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_iterator.cc +99 -0
  463. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +280 -120
  464. package/deps/rocksdb/rocksdb/utilities/secondary_index/simple_secondary_index.cc +79 -0
  465. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +52 -16
  466. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h +10 -6
  467. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +55 -0
  468. package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +0 -1
  469. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +0 -2
  470. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.h +0 -1
  471. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +37 -12
  472. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +2 -0
  473. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +0 -2
  474. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc +0 -2
  475. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +1 -1
  476. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h +1 -1
  477. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +1 -1
  478. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc +2 -1
  479. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +2 -2
  480. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +0 -1
  481. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.h +0 -2
  482. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +1 -3
  483. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +36 -10
  484. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -7
  485. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +4 -5
  486. package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +1 -4
  487. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +1 -2
  488. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +0 -2
  489. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.h +0 -1
  490. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1118 -37
  491. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +4 -7
  492. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +0 -2
  493. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +0 -2
  494. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +3 -3
  495. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +0 -1
  496. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +0 -2
  497. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +1 -2
  498. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +1 -2
  499. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +0 -1
  500. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +0 -3
  501. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +125 -127
  502. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +45 -23
  503. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +54 -22
  504. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +477 -58
  505. package/deps/rocksdb/rocksdb.gyp +9 -4
  506. package/index.js +50 -9
  507. package/package.json +8 -1
  508. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  509. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -35,8 +35,8 @@ Options SanitizeOptions(const std::string& dbname, const Options& src,
35
35
  auto db_options =
36
36
  SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
37
37
  ImmutableDBOptions immutable_db_options(db_options);
38
- auto cf_options =
39
- SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
38
+ auto cf_options = SanitizeCfOptions(immutable_db_options, read_only,
39
+ ColumnFamilyOptions(src));
40
40
  return Options(db_options, cf_options);
41
41
  }
42
42
 
@@ -224,6 +224,12 @@ Status DBImpl::ValidateOptions(
224
224
  if (!s.ok()) {
225
225
  return s;
226
226
  }
227
+ if (cfd.name == kDefaultColumnFamilyName) {
228
+ if (cfd.options.disallow_memtable_writes) {
229
+ return Status::InvalidArgument(
230
+ "Default column family cannot use disallow_memtable_writes=true");
231
+ }
232
+ }
227
233
  }
228
234
  s = ValidateOptions(db_options);
229
235
  return s;
@@ -575,7 +581,7 @@ Status DBImpl::Recover(
575
581
  }
576
582
  if (s.ok() && !read_only) {
577
583
  for (auto cfd : *versions_->GetColumnFamilySet()) {
578
- auto& moptions = *cfd->GetLatestMutableCFOptions();
584
+ const auto& moptions = cfd->GetLatestMutableCFOptions();
579
585
  // Try to trivially move files down the LSM tree to start from bottommost
580
586
  // level when level_compaction_dynamic_level_bytes is enabled. This should
581
587
  // only be useful when user is migrating to turning on this option.
@@ -590,16 +596,16 @@ Status DBImpl::Recover(
590
596
  // the user wants to partition SST files.
591
597
  // Note that files moved in this step may not respect the compression
592
598
  // option in target level.
593
- if (cfd->ioptions()->compaction_style ==
599
+ if (cfd->ioptions().compaction_style ==
594
600
  CompactionStyle::kCompactionStyleLevel &&
595
- cfd->ioptions()->level_compaction_dynamic_level_bytes &&
601
+ cfd->ioptions().level_compaction_dynamic_level_bytes &&
596
602
  !moptions.disable_auto_compactions) {
597
- int to_level = cfd->ioptions()->num_levels - 1;
603
+ int to_level = cfd->ioptions().num_levels - 1;
598
604
  // last level is reserved
599
605
  // allow_ingest_behind does not support Level Compaction,
600
606
  // and per_key_placement can have infinite compaction loop for Level
601
607
  // Compaction. Adjust to_level here just to be safe.
602
- if (cfd->ioptions()->allow_ingest_behind ||
608
+ if (cfd->ioptions().allow_ingest_behind ||
603
609
  moptions.preclude_last_level_data_seconds > 0) {
604
610
  to_level -= 1;
605
611
  }
@@ -622,10 +628,10 @@ Status DBImpl::Recover(
622
628
  // lsm_state will look like "[1,2,3,4,5,6,0]" for an LSM with
623
629
  // 7 levels
624
630
  std::string lsm_state = "[";
625
- for (int i = 0; i < cfd->ioptions()->num_levels; ++i) {
631
+ for (int i = 0; i < cfd->ioptions().num_levels; ++i) {
626
632
  lsm_state += std::to_string(
627
633
  cfd->current()->storage_info()->NumLevelFiles(i));
628
- if (i < cfd->ioptions()->num_levels - 1) {
634
+ if (i < cfd->ioptions().num_levels - 1) {
629
635
  lsm_state += ",";
630
636
  }
631
637
  }
@@ -708,9 +714,9 @@ Status DBImpl::Recover(
708
714
  // may check this value to decide whether to flush.
709
715
  max_total_in_memory_state_ = 0;
710
716
  for (auto cfd : *versions_->GetColumnFamilySet()) {
711
- auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
712
- max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
713
- mutable_cf_options->max_write_buffer_number;
717
+ const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
718
+ max_total_in_memory_state_ += mutable_cf_options.write_buffer_size *
719
+ mutable_cf_options.max_write_buffer_number;
714
720
  }
715
721
 
716
722
  SequenceNumber next_sequence(kMaxSequenceNumber);
@@ -754,6 +760,11 @@ Status DBImpl::Recover(
754
760
  }
755
761
  }
756
762
 
763
+ if (immutable_db_options_.track_and_verify_wals && !is_new_db &&
764
+ !immutable_db_options_.best_efforts_recovery && wal_files.empty()) {
765
+ return Status::Corruption("Opening an existing DB with no WAL files");
766
+ }
767
+
757
768
  if (immutable_db_options_.track_and_verify_wals_in_manifest) {
758
769
  if (!immutable_db_options_.best_efforts_recovery) {
759
770
  // Verify WALs in MANIFEST.
@@ -816,8 +827,7 @@ Status DBImpl::Recover(
816
827
  if (!s.ok()) {
817
828
  // Clear memtables if recovery failed
818
829
  for (auto cfd : *versions_->GetColumnFamilySet()) {
819
- cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
820
- kMaxSequenceNumber);
830
+ cfd->CreateNewMemtable(kMaxSequenceNumber);
821
831
  }
822
832
  }
823
833
  }
@@ -983,8 +993,7 @@ Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
983
993
  const ReadOptions read_options(Env::IOActivity::kDBOpen);
984
994
  const WriteOptions write_options(Env::IOActivity::kDBOpen);
985
995
 
986
- Status s = versions_->LogAndApply(recovery_ctx.cfds_,
987
- recovery_ctx.mutable_cf_opts_, read_options,
996
+ Status s = versions_->LogAndApply(recovery_ctx.cfds_, read_options,
988
997
  write_options, recovery_ctx.edit_lists_,
989
998
  &mutex_, directories_.GetDbDir());
990
999
  return s;
@@ -1103,50 +1112,64 @@ bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
1103
1112
  return true;
1104
1113
  }
1105
1114
 
1115
+ void DBOpenLogRecordReadReporter::Corruption(size_t bytes, const Status& s,
1116
+ uint64_t log_number) {
1117
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
1118
+ (status == nullptr ? "(ignoring error) " : ""), fname,
1119
+ static_cast<int>(bytes), s.ToString().c_str());
1120
+ if (status != nullptr && status->ok()) {
1121
+ *status = s;
1122
+ corrupted_wal_number_ = log_number;
1123
+ }
1124
+ }
1125
+
1126
+ void DBOpenLogRecordReadReporter::OldLogRecord(size_t bytes) {
1127
+ if (old_log_record != nullptr) {
1128
+ *old_log_record = true;
1129
+ }
1130
+ ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes; possibly recycled", fname,
1131
+ static_cast<int>(bytes));
1132
+ }
1133
+
1106
1134
  // REQUIRES: wal_numbers are sorted in ascending order
1107
1135
  Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1108
1136
  SequenceNumber* next_sequence, bool read_only,
1109
1137
  bool is_retry, bool* corrupted_wal_found,
1110
1138
  RecoveryContext* recovery_ctx) {
1111
- struct LogReporter : public log::Reader::Reporter {
1112
- Env* env;
1113
- Logger* info_log;
1114
- const char* fname;
1115
- Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
1116
- bool* old_log_record;
1117
- void Corruption(size_t bytes, const Status& s) override {
1118
- ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
1119
- (status == nullptr ? "(ignoring error) " : ""), fname,
1120
- static_cast<int>(bytes), s.ToString().c_str());
1121
- if (status != nullptr && status->ok()) {
1122
- *status = s;
1123
- }
1124
- }
1125
-
1126
- void OldLogRecord(size_t bytes) override {
1127
- if (old_log_record != nullptr) {
1128
- *old_log_record = true;
1129
- }
1130
- ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes; possibly recycled",
1131
- fname, static_cast<int>(bytes));
1132
- }
1133
- };
1134
-
1135
1139
  mutex_.AssertHeld();
1136
- Status status;
1137
- bool old_log_record = false;
1140
+
1138
1141
  std::unordered_map<int, VersionEdit> version_edits;
1139
- // no need to refcount because iteration is under mutex
1142
+ int job_id = 0;
1143
+ uint64_t min_wal_number = 0;
1144
+ SetupLogFilesRecovery(wal_numbers, &version_edits, &job_id, &min_wal_number);
1145
+
1146
+ Status status = ProcessLogFiles(
1147
+ wal_numbers, read_only, is_retry, min_wal_number, job_id, next_sequence,
1148
+ &version_edits, corrupted_wal_found, recovery_ctx);
1149
+
1150
+ FinishLogFilesRecovery(job_id, status);
1151
+ return status;
1152
+ }
1153
+
1154
+ void DBImpl::SetupLogFilesRecovery(
1155
+ const std::vector<uint64_t>& wal_numbers,
1156
+ std::unordered_map<int, VersionEdit>* version_edits, int* job_id,
1157
+ uint64_t* min_wal_number) {
1158
+ assert(version_edits);
1159
+ assert(job_id);
1160
+ assert(min_wal_number);
1161
+ // No need to refcount because iteration is under mutex
1140
1162
  for (auto cfd : *versions_->GetColumnFamilySet()) {
1141
1163
  VersionEdit edit;
1142
1164
  edit.SetColumnFamily(cfd->GetID());
1143
- version_edits.insert({cfd->GetID(), edit});
1165
+ version_edits->insert({cfd->GetID(), edit});
1144
1166
  }
1145
- int job_id = next_job_id_.fetch_add(1);
1167
+
1168
+ *job_id = next_job_id_.fetch_add(1);
1146
1169
  {
1147
1170
  auto stream = event_logger_.Log();
1148
- stream << "job" << job_id << "event"
1149
- << "recovery_started";
1171
+ stream << "job" << *job_id;
1172
+ stream << "event" << "recovery_started";
1150
1173
  stream << "wal_files";
1151
1174
  stream.StartArray();
1152
1175
  for (auto wal_number : wal_numbers) {
@@ -1158,265 +1181,538 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1158
1181
  // No-op for immutable_db_options_.wal_filter == nullptr.
1159
1182
  InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
1160
1183
 
1184
+ *min_wal_number = MinLogNumberToKeep();
1185
+ if (!allow_2pc()) {
1186
+ // In non-2pc mode, we skip WALs that do not back unflushed data.
1187
+ *min_wal_number =
1188
+ std::max(*min_wal_number, versions_->MinLogNumberWithUnflushedData());
1189
+ }
1190
+ }
1191
+
1192
+ Status DBImpl::ProcessLogFiles(
1193
+ const std::vector<uint64_t>& wal_numbers, bool read_only, bool is_retry,
1194
+ uint64_t min_wal_number, int job_id, SequenceNumber* next_sequence,
1195
+ std::unordered_map<int, VersionEdit>* version_edits,
1196
+ bool* corrupted_wal_found, RecoveryContext* recovery_ctx) {
1197
+ Status status;
1198
+
1161
1199
  bool stop_replay_by_wal_filter = false;
1162
1200
  bool stop_replay_for_corruption = false;
1163
1201
  bool flushed = false;
1164
1202
  uint64_t corrupted_wal_number = kMaxSequenceNumber;
1165
- uint64_t min_wal_number = MinLogNumberToKeep();
1166
- if (!allow_2pc()) {
1167
- // In non-2pc mode, we skip WALs that do not back unflushed data.
1168
- min_wal_number =
1169
- std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData());
1170
- }
1203
+ PredecessorWALInfo predecessor_wal_info;
1204
+
1171
1205
  for (auto wal_number : wal_numbers) {
1172
- if (wal_number < min_wal_number) {
1173
- ROCKS_LOG_INFO(immutable_db_options_.info_log,
1174
- "Skipping log #%" PRIu64
1175
- " since it is older than min log to keep #%" PRIu64,
1176
- wal_number, min_wal_number);
1177
- continue;
1206
+ // Detecting early break on the next iteration after `wal_number` has been
1207
+ // advanced since this `wal_number` doesn't affect follow-up handling after
1208
+ // breaking out of the for loop.
1209
+ if (!status.ok()) {
1210
+ break;
1211
+ }
1212
+ SequenceNumber prev_next_sequence = *next_sequence;
1213
+ if (status.ok()) {
1214
+ status = ProcessLogFile(
1215
+ wal_number, min_wal_number, is_retry, read_only, job_id,
1216
+ next_sequence, &stop_replay_for_corruption,
1217
+ &stop_replay_by_wal_filter, &corrupted_wal_number,
1218
+ corrupted_wal_found, version_edits, &flushed, predecessor_wal_info);
1219
+ }
1220
+ if (status.ok()) {
1221
+ status = CheckSeqnoNotSetBackDuringRecovery(prev_next_sequence,
1222
+ *next_sequence);
1223
+ }
1224
+ }
1225
+
1226
+ if (status.ok()) {
1227
+ status = MaybeHandleStopReplayForCorruptionForInconsistency(
1228
+ stop_replay_for_corruption, corrupted_wal_number);
1229
+ }
1230
+
1231
+ if (status.ok()) {
1232
+ status = MaybeFlushFinalMemtableOrRestoreActiveLogFiles(
1233
+ wal_numbers, read_only, job_id, flushed, version_edits, recovery_ctx);
1234
+ }
1235
+ return status;
1236
+ }
1237
+
1238
+ Status DBImpl::ProcessLogFile(
1239
+ uint64_t wal_number, uint64_t min_wal_number, bool is_retry, bool read_only,
1240
+ int job_id, SequenceNumber* next_sequence, bool* stop_replay_for_corruption,
1241
+ bool* stop_replay_by_wal_filter, uint64_t* corrupted_wal_number,
1242
+ bool* corrupted_wal_found,
1243
+ std::unordered_map<int, VersionEdit>* version_edits, bool* flushed,
1244
+ PredecessorWALInfo& predecessor_wal_info) {
1245
+ assert(stop_replay_by_wal_filter);
1246
+
1247
+ // Variable initialization starts
1248
+ Status status;
1249
+ bool old_log_record = false;
1250
+
1251
+ DBOpenLogRecordReadReporter reporter;
1252
+ std::unique_ptr<log::Reader> reader;
1253
+
1254
+ std::string fname =
1255
+ LogFileName(immutable_db_options_.GetWalDir(), wal_number);
1256
+
1257
+ auto logFileDropped = [this, &fname]() {
1258
+ uint64_t bytes;
1259
+ if (env_->GetFileSize(fname, &bytes).ok()) {
1260
+ auto info_log = immutable_db_options_.info_log.get();
1261
+ ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
1262
+ static_cast<int>(bytes));
1178
1263
  }
1179
- // The previous incarnation may not have written any MANIFEST
1180
- // records after allocating this log number. So we manually
1181
- // update the file number allocation counter in VersionSet.
1182
- versions_->MarkFileNumberUsed(wal_number);
1183
- // Open the log file
1184
- std::string fname =
1185
- LogFileName(immutable_db_options_.GetWalDir(), wal_number);
1264
+ };
1265
+
1266
+ std::string scratch;
1267
+ Slice record;
1268
+ uint64_t record_checksum;
1269
+ const UnorderedMap<uint32_t, size_t>& running_ts_sz =
1270
+ versions_->GetRunningColumnFamiliesTimestampSize();
1186
1271
 
1272
+ // We need to track `last_seqno_observed` in addition to `next_sequence` since
1273
+ // `last_seqno_observed != *next_sequence` when there are multiple key-value
1274
+ // pairs in one WAL entry
1275
+ SequenceNumber last_seqno_observed = 0;
1276
+ // Variable initialization ends
1277
+
1278
+ if (wal_number < min_wal_number) {
1187
1279
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
1188
- "Recovering log #%" PRIu64 " mode %d", wal_number,
1189
- static_cast<int>(immutable_db_options_.wal_recovery_mode));
1190
- auto logFileDropped = [this, &fname]() {
1191
- uint64_t bytes;
1192
- if (env_->GetFileSize(fname, &bytes).ok()) {
1193
- auto info_log = immutable_db_options_.info_log.get();
1194
- ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
1195
- static_cast<int>(bytes));
1196
- }
1197
- };
1198
- if (stop_replay_by_wal_filter) {
1199
- logFileDropped();
1200
- continue;
1280
+ "Skipping log #%" PRIu64
1281
+ " since it is older than min log to keep #%" PRIu64,
1282
+ wal_number, min_wal_number);
1283
+ assert(status.ok());
1284
+ return status;
1285
+ }
1286
+
1287
+ SetupLogFileProcessing(wal_number);
1288
+
1289
+ if (*stop_replay_by_wal_filter) {
1290
+ logFileDropped();
1291
+ assert(status.ok());
1292
+ return status;
1293
+ }
1294
+
1295
+ Status init_status = InitializeLogReader(
1296
+ wal_number, is_retry, fname, *stop_replay_for_corruption, min_wal_number,
1297
+ predecessor_wal_info, &old_log_record, &status, &reporter, reader);
1298
+
1299
+ // FIXME(hx235): Consolidate `!init_status.ok()` and `reader == nullptr` cases
1300
+ if (!init_status.ok()) {
1301
+ assert(status.ok());
1302
+ status.PermitUncheckedError();
1303
+ return init_status;
1304
+ } else if (reader == nullptr) {
1305
+ // TODO(hx235): remove this case since it's confusing
1306
+ assert(status.ok());
1307
+ // Fail initializing log reader for one log file with an ok status.
1308
+ // Try next one.
1309
+ return status;
1310
+ }
1311
+
1312
+ TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
1313
+ /*cb_arg=*/nullptr);
1314
+ while (true) {
1315
+ if (*stop_replay_by_wal_filter) {
1316
+ break;
1201
1317
  }
1202
1318
 
1203
- std::unique_ptr<SequentialFileReader> file_reader;
1204
- {
1205
- std::unique_ptr<FSSequentialFile> file;
1206
- status = fs_->NewSequentialFile(
1207
- fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
1208
- if (!status.ok()) {
1209
- MaybeIgnoreError(&status);
1210
- if (!status.ok()) {
1211
- return status;
1212
- } else {
1213
- // Fail with one log file, but that's ok.
1214
- // Try next one.
1215
- continue;
1216
- }
1217
- }
1218
- file_reader.reset(new SequentialFileReader(
1219
- std::move(file), fname, immutable_db_options_.log_readahead_size,
1220
- io_tracer_, /*listeners=*/{}, /*rate_limiter=*/nullptr, is_retry));
1221
- }
1222
-
1223
- // Create the log reader.
1224
- LogReporter reporter;
1225
- reporter.env = env_;
1226
- reporter.info_log = immutable_db_options_.info_log.get();
1227
- reporter.fname = fname.c_str();
1228
- reporter.old_log_record = &old_log_record;
1229
- if (!immutable_db_options_.paranoid_checks ||
1230
- immutable_db_options_.wal_recovery_mode ==
1231
- WALRecoveryMode::kSkipAnyCorruptedRecords) {
1232
- reporter.status = nullptr;
1233
- } else {
1234
- reporter.status = &status;
1235
- }
1236
- // We intentionally make log::Reader do checksumming even if
1237
- // paranoid_checks==false so that corruptions cause entire commits
1238
- // to be skipped instead of propagating bad information (like overly
1239
- // large sequence numbers).
1240
- log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
1241
- &reporter, true /*checksum*/, wal_number);
1242
-
1243
- // Determine if we should tolerate incomplete records at the tail end of the
1244
- // Read all the records and add to a memtable
1245
- std::string scratch;
1246
- Slice record;
1247
-
1248
- const UnorderedMap<uint32_t, size_t>& running_ts_sz =
1249
- versions_->GetRunningColumnFamiliesTimestampSize();
1250
-
1251
- TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
1252
- /*arg=*/nullptr);
1253
- uint64_t record_checksum;
1254
- while (!stop_replay_by_wal_filter &&
1255
- reader.ReadRecord(&record, &scratch,
1256
- immutable_db_options_.wal_recovery_mode,
1257
- &record_checksum) &&
1258
- status.ok()) {
1259
- if (record.size() < WriteBatchInternal::kHeader) {
1260
- reporter.Corruption(record.size(),
1261
- Status::Corruption("log record too small"));
1262
- continue;
1263
- }
1264
- // We create a new batch and initialize with a valid prot_info_ to store
1265
- // the data checksums
1266
- WriteBatch batch;
1267
- std::unique_ptr<WriteBatch> new_batch;
1319
+ bool read_record = reader->ReadRecord(
1320
+ &record, &scratch, immutable_db_options_.wal_recovery_mode,
1321
+ &record_checksum);
1268
1322
 
1269
- status = WriteBatchInternal::SetContents(&batch, record);
1270
- if (!status.ok()) {
1271
- return status;
1272
- }
1323
+ // `reader->ReadRecord` will change `status` through reporter in `reader`
1324
+ // when a corruption is encountered
1325
+ // FIXME(hx235): consolidate `read_record` and `status`
1326
+ if (!read_record || !status.ok()) {
1327
+ break;
1328
+ }
1273
1329
 
1274
- const UnorderedMap<uint32_t, size_t>& record_ts_sz =
1275
- reader.GetRecordedTimestampSize();
1276
- status = HandleWriteBatchTimestampSizeDifference(
1277
- &batch, running_ts_sz, record_ts_sz,
1278
- TimestampSizeConsistencyMode::kReconcileInconsistency, seq_per_batch_,
1279
- batch_per_txn_, &new_batch);
1280
- if (!status.ok()) {
1281
- return status;
1282
- }
1330
+ // FIXME(hx235): consolidate `process_status` and `status`
1331
+ SequenceNumber prev_next_sequence = *next_sequence;
1332
+ Status process_status = ProcessLogRecord(
1333
+ record, reader, running_ts_sz, wal_number, fname, read_only, job_id,
1334
+ logFileDropped, &reporter, &record_checksum, &last_seqno_observed,
1335
+ next_sequence, stop_replay_for_corruption, &status,
1336
+ stop_replay_by_wal_filter, version_edits, flushed);
1337
+
1338
+ if (!process_status.ok()) {
1339
+ return process_status;
1340
+ } else if (Status seqno_check_status = CheckSeqnoNotSetBackDuringRecovery(
1341
+ prev_next_sequence, *next_sequence);
1342
+ !seqno_check_status.ok()) {
1343
+ // Sequence number being set back indicates a serious software bug, the DB
1344
+ // should not be opened in this case.
1345
+ return seqno_check_status;
1346
+ } else if (*stop_replay_for_corruption) {
1347
+ break;
1348
+ }
1349
+ }
1283
1350
 
1284
- bool batch_updated = new_batch != nullptr;
1285
- WriteBatch* batch_to_use = batch_updated ? new_batch.get() : &batch;
1286
- TEST_SYNC_POINT_CALLBACK(
1287
- "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch",
1288
- batch_to_use);
1289
- TEST_SYNC_POINT_CALLBACK(
1290
- "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
1291
- &record_checksum);
1292
- status = WriteBatchInternal::UpdateProtectionInfo(
1293
- batch_to_use, 8 /* bytes_per_key */,
1294
- batch_updated ? nullptr : &record_checksum);
1295
- if (!status.ok()) {
1296
- return status;
1297
- }
1351
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
1352
+ "Recovered to log #%" PRIu64 " next seq #%" PRIu64, wal_number,
1353
+ *next_sequence);
1298
1354
 
1299
- SequenceNumber sequence = WriteBatchInternal::Sequence(batch_to_use);
1300
- if (sequence > kMaxSequenceNumber) {
1301
- reporter.Corruption(
1302
- record.size(),
1303
- Status::Corruption("sequence " + std::to_string(sequence) +
1304
- " is too large"));
1305
- continue;
1306
- }
1355
+ if (status.ok()) {
1356
+ status = UpdatePredecessorWALInfo(wal_number, last_seqno_observed, fname,
1357
+ predecessor_wal_info);
1358
+ }
1307
1359
 
1308
- if (immutable_db_options_.wal_recovery_mode ==
1309
- WALRecoveryMode::kPointInTimeRecovery) {
1310
- // In point-in-time recovery mode, if sequence id of log files are
1311
- // consecutive, we continue recovery despite corruption. This could
1312
- // happen when we open and write to a corrupted DB, where sequence id
1313
- // will start from the last sequence id we recovered.
1314
- if (sequence == *next_sequence) {
1315
- stop_replay_for_corruption = false;
1316
- }
1317
- if (stop_replay_for_corruption) {
1318
- logFileDropped();
1319
- break;
1320
- }
1321
- }
1360
+ if (!status.ok() || old_log_record) {
1361
+ status = HandleNonOkStatusOrOldLogRecord(
1362
+ wal_number, next_sequence, status, reporter, &old_log_record,
1363
+ stop_replay_for_corruption, corrupted_wal_number, corrupted_wal_found);
1364
+ }
1322
1365
 
1323
- // For the default case of wal_filter == nullptr, always performs no-op
1324
- // and returns true.
1325
- if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, reporter,
1326
- status, stop_replay_by_wal_filter,
1327
- *batch_to_use)) {
1328
- continue;
1329
- }
1366
+ FinishLogFileProcessing(status, next_sequence);
1330
1367
 
1331
- // If column family was not found, it might mean that the WAL write
1332
- // batch references to the column family that was dropped after the
1333
- // insert. We don't want to fail the whole write batch in that case --
1334
- // we just ignore the update.
1335
- // That's why we set ignore missing column families to true
1336
- bool has_valid_writes = false;
1337
- status = WriteBatchInternal::InsertInto(
1338
- batch_to_use, column_family_memtables_.get(), &flush_scheduler_,
1339
- &trim_history_scheduler_, true, wal_number, this,
1340
- false /* concurrent_memtable_writes */, next_sequence,
1341
- &has_valid_writes, seq_per_batch_, batch_per_txn_);
1342
- MaybeIgnoreError(&status);
1343
- if (!status.ok()) {
1344
- // We are treating this as a failure while reading since we read valid
1345
- // blocks that do not form coherent data
1346
- reporter.Corruption(record.size(), status);
1347
- continue;
1348
- }
1368
+ return status;
1369
+ }
1349
1370
 
1350
- if (has_valid_writes && !read_only) {
1351
- // we can do this because this is called before client has access to the
1352
- // DB and there is only a single thread operating on DB
1353
- ColumnFamilyData* cfd;
1354
-
1355
- while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
1356
- cfd->UnrefAndTryDelete();
1357
- // If this asserts, it means that InsertInto failed in
1358
- // filtering updates to already-flushed column families
1359
- assert(cfd->GetLogNumber() <= wal_number);
1360
- auto iter = version_edits.find(cfd->GetID());
1361
- assert(iter != version_edits.end());
1362
- VersionEdit* edit = &iter->second;
1363
- status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
1364
- if (!status.ok()) {
1365
- // Reflect errors immediately so that conditions like full
1366
- // file-systems cause the DB::Open() to fail.
1367
- return status;
1368
- }
1369
- flushed = true;
1371
+ void DBImpl::SetupLogFileProcessing(uint64_t wal_number) {
1372
+ // The previous incarnation may not have written any MANIFEST
1373
+ // records after allocating this log number. So we manually
1374
+ // update the file number allocation counter in VersionSet.
1375
+ versions_->MarkFileNumberUsed(wal_number);
1370
1376
 
1371
- cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
1372
- *next_sequence - 1);
1373
- }
1374
- }
1377
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
1378
+ "Recovering log #%" PRIu64 " mode %d", wal_number,
1379
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
1380
+ }
1381
+
1382
+ Status DBImpl::InitializeLogReader(
1383
+ uint64_t wal_number, bool is_retry, std::string& fname,
1384
+ bool stop_replay_for_corruption, uint64_t min_wal_number,
1385
+ const PredecessorWALInfo& predecessor_wal_info, bool* const old_log_record,
1386
+ Status* const reporter_status, DBOpenLogRecordReadReporter* reporter,
1387
+ std::unique_ptr<log::Reader>& reader) {
1388
+ assert(old_log_record);
1389
+ assert(reporter_status);
1390
+ assert(reporter);
1391
+
1392
+ Status status;
1393
+
1394
+ std::unique_ptr<SequentialFileReader> file_reader;
1395
+ {
1396
+ std::unique_ptr<FSSequentialFile> file;
1397
+ status = fs_->NewSequentialFile(
1398
+ fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
1399
+ if (!status.ok()) {
1400
+ MaybeIgnoreError(&status);
1401
+ return status;
1375
1402
  }
1376
- ROCKS_LOG_INFO(immutable_db_options_.info_log,
1377
- "Recovered to log #%" PRIu64 " seq #%" PRIu64, wal_number,
1378
- *next_sequence);
1379
-
1380
- if (!status.ok() || old_log_record) {
1381
- if (status.IsNotSupported()) {
1382
- // We should not treat NotSupported as corruption. It is rather a clear
1383
- // sign that we are processing a WAL that is produced by an incompatible
1384
- // version of the code.
1385
- return status;
1386
- }
1387
- if (immutable_db_options_.wal_recovery_mode ==
1403
+ file_reader.reset(new SequentialFileReader(
1404
+ std::move(file), fname, immutable_db_options_.log_readahead_size,
1405
+ io_tracer_, /*listeners=*/{}, /*rate_limiter=*/nullptr,
1406
+ /*verify_and_reconstruct_read=*/is_retry));
1407
+ }
1408
+
1409
+ // Create the log reader.
1410
+ reporter->env = env_;
1411
+ reporter->info_log = immutable_db_options_.info_log.get();
1412
+ reporter->fname = fname.c_str();
1413
+ reporter->old_log_record = old_log_record;
1414
+ if (!immutable_db_options_.paranoid_checks ||
1415
+ immutable_db_options_.wal_recovery_mode ==
1388
1416
  WALRecoveryMode::kSkipAnyCorruptedRecords) {
1389
- // We should ignore all errors unconditionally
1390
- status = Status::OK();
1391
- } else if (immutable_db_options_.wal_recovery_mode ==
1392
- WALRecoveryMode::kPointInTimeRecovery) {
1393
- if (status.IsIOError()) {
1394
- ROCKS_LOG_ERROR(immutable_db_options_.info_log,
1395
- "IOError during point-in-time reading log #%" PRIu64
1396
- " seq #%" PRIu64
1397
- ". %s. This likely mean loss of synced WAL, "
1398
- "thus recovery fails.",
1399
- wal_number, *next_sequence,
1400
- status.ToString().c_str());
1401
- return status;
1402
- }
1403
- // We should ignore the error but not continue replaying
1404
- status = Status::OK();
1405
- old_log_record = false;
1406
- stop_replay_for_corruption = true;
1407
- corrupted_wal_number = wal_number;
1408
- if (corrupted_wal_found != nullptr) {
1409
- *corrupted_wal_found = true;
1410
- }
1411
- } else {
1412
- assert(immutable_db_options_.wal_recovery_mode ==
1413
- WALRecoveryMode::kTolerateCorruptedTailRecords ||
1414
- immutable_db_options_.wal_recovery_mode ==
1415
- WALRecoveryMode::kAbsoluteConsistency);
1417
+ reporter->status = nullptr;
1418
+ } else {
1419
+ reporter->status = reporter_status;
1420
+ }
1421
+ // We intentionally make log::Reader do checksumming even if
1422
+ // paranoid_checks==false so that corruptions cause entire commits
1423
+ // to be skipped instead of propagating bad information (like overly
1424
+ // large sequence numbers).
1425
+ reader.reset(new log::Reader(
1426
+ immutable_db_options_.info_log, std::move(file_reader), reporter,
1427
+ true /*checksum*/, wal_number,
1428
+ immutable_db_options_.track_and_verify_wals, stop_replay_for_corruption,
1429
+ min_wal_number, predecessor_wal_info));
1430
+ return status;
1431
+ }
1432
+
1433
+ Status DBImpl::ProcessLogRecord(
1434
+ Slice record, const std::unique_ptr<log::Reader>& reader,
1435
+ const UnorderedMap<uint32_t, size_t>& running_ts_sz, uint64_t wal_number,
1436
+ const std::string& fname, bool read_only, int job_id,
1437
+ const std::function<void()>& logFileDropped,
1438
+ DBOpenLogRecordReadReporter* reporter, uint64_t* record_checksum,
1439
+ SequenceNumber* last_seqno_observed, SequenceNumber* next_sequence,
1440
+ bool* stop_replay_for_corruption, Status* status,
1441
+ bool* stop_replay_by_wal_filter,
1442
+ std::unordered_map<int, VersionEdit>* version_edits, bool* flushed) {
1443
+ assert(reporter);
1444
+ assert(last_seqno_observed);
1445
+ assert(stop_replay_for_corruption);
1446
+ assert(status);
1447
+ assert(stop_replay_by_wal_filter);
1448
+
1449
+ Status process_status;
1450
+ bool has_valid_writes = false;
1451
+ WriteBatch batch;
1452
+ std::unique_ptr<WriteBatch> new_batch;
1453
+ WriteBatch* batch_to_use = nullptr;
1454
+
1455
+ if (record.size() < WriteBatchInternal::kHeader) {
1456
+ reporter->Corruption(record.size(),
1457
+ Status::Corruption("log record too small"));
1458
+ assert(process_status.ok());
1459
+ return process_status;
1460
+ }
1461
+
1462
+ process_status = InitializeWriteBatchForLogRecord(
1463
+ record, reader, running_ts_sz, &batch, new_batch, batch_to_use,
1464
+ record_checksum);
1465
+ if (!process_status.ok()) {
1466
+ return process_status;
1467
+ }
1468
+ assert(batch_to_use);
1469
+
1470
+ *last_seqno_observed = WriteBatchInternal::Sequence(batch_to_use);
1471
+
1472
+ if (*last_seqno_observed > kMaxSequenceNumber) {
1473
+ reporter->Corruption(
1474
+ record.size(),
1475
+ Status::Corruption("sequence " + std::to_string(*last_seqno_observed) +
1476
+ " is too large"));
1477
+ assert(process_status.ok());
1478
+ return process_status;
1479
+ }
1480
+
1481
+ MaybeReviseStopReplayForCorruption(*last_seqno_observed, next_sequence,
1482
+ stop_replay_for_corruption);
1483
+ if (*stop_replay_for_corruption) {
1484
+ logFileDropped();
1485
+ assert(process_status.ok());
1486
+ return process_status;
1487
+ }
1488
+
1489
+ // For the default case of wal_filter == nullptr, always performs no-op
1490
+ // and returns true.
1491
+ if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, *reporter, *status,
1492
+ *stop_replay_by_wal_filter,
1493
+ *batch_to_use)) {
1494
+ assert(process_status.ok());
1495
+ return process_status;
1496
+ } else {
1497
+ // FIXME(hx235): Handle the potential non-okay `status` when
1498
+ // `InvokeWalFilterIfNeededOnWalRecord()` returns true
1499
+ status->PermitUncheckedError();
1500
+ }
1501
+
1502
+ assert(process_status.ok());
1503
+ process_status = InsertLogRecordToMemtable(batch_to_use, wal_number,
1504
+ next_sequence, &has_valid_writes);
1505
+ MaybeIgnoreError(&process_status);
1506
+ // We are treating this as a failure while reading since we read valid
1507
+ // blocks that do not form coherent data
1508
+ if (!process_status.ok()) {
1509
+ // FIXME(hx235): `reporter->Corruption()` will override the non-ok status
1510
+ // set in `InvokeWalFilterIfNeededOnWalRecord` through passing `*status`
1511
+ reporter->Corruption(record.size(), process_status);
1512
+ process_status = Status::OK();
1513
+ return process_status;
1514
+ }
1515
+
1516
+ process_status = MaybeWriteLevel0TableForRecovery(
1517
+ has_valid_writes, read_only, wal_number, job_id, next_sequence,
1518
+ version_edits, flushed);
1519
+
1520
+ return process_status;
1521
+ }
1522
+
1523
+ // We create a new batch and initialize with a valid prot_info_ to store
1524
+ // the data checksum
1525
+ Status DBImpl::InitializeWriteBatchForLogRecord(
1526
+ Slice record, const std::unique_ptr<log::Reader>& reader,
1527
+ const UnorderedMap<uint32_t, size_t>& running_ts_sz, WriteBatch* batch,
1528
+ std::unique_ptr<WriteBatch>& new_batch, WriteBatch*& batch_to_use,
1529
+ uint64_t* record_checksum) {
1530
+ assert(batch);
1531
+ assert(record_checksum);
1532
+
1533
+ Status status = WriteBatchInternal::SetContents(batch, record);
1534
+ if (!status.ok()) {
1535
+ return status;
1536
+ }
1537
+
1538
+ const UnorderedMap<uint32_t, size_t>& record_ts_sz =
1539
+ reader->GetRecordedTimestampSize();
1540
+ status = HandleWriteBatchTimestampSizeDifference(
1541
+ batch, running_ts_sz, record_ts_sz,
1542
+ TimestampSizeConsistencyMode::kReconcileInconsistency, seq_per_batch_,
1543
+ batch_per_txn_, &new_batch);
1544
+ if (!status.ok()) {
1545
+ return status;
1546
+ }
1547
+
1548
+ bool batch_updated = new_batch != nullptr;
1549
+ batch_to_use = batch_updated ? new_batch.get() : batch;
1550
+ TEST_SYNC_POINT_CALLBACK(
1551
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", batch_to_use);
1552
+ TEST_SYNC_POINT_CALLBACK(
1553
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
1554
+ record_checksum);
1555
+ status = WriteBatchInternal::UpdateProtectionInfo(
1556
+ batch_to_use, 8 /* bytes_per_key */,
1557
+ batch_updated ? nullptr : record_checksum);
1558
+
1559
+ return status;
1560
+ }
1561
+
1562
+ void DBImpl::MaybeReviseStopReplayForCorruption(
1563
+ SequenceNumber sequence, SequenceNumber const* const next_sequence,
1564
+ bool* stop_replay_for_corruption) {
1565
+ if (immutable_db_options_.wal_recovery_mode ==
1566
+ WALRecoveryMode::kPointInTimeRecovery) {
1567
+ assert(next_sequence);
1568
+ assert(stop_replay_for_corruption);
1569
+ // In point-in-time recovery mode, if sequence id of log files are
1570
+ // consecutive, we continue recovery despite corruption. This could
1571
+ // happen when we open and write to a corrupted DB, where sequence id
1572
+ // will start from the last sequence id we recovered.
1573
+ if (sequence == *next_sequence) {
1574
+ *stop_replay_for_corruption = false;
1575
+ }
1576
+ }
1577
+ }
1578
+
1579
+ Status DBImpl::InsertLogRecordToMemtable(WriteBatch* batch_to_use,
1580
+ uint64_t wal_number,
1581
+ SequenceNumber* next_sequence,
1582
+ bool* has_valid_writes) {
1583
+ // If column family was not found, it might mean that the WAL write
1584
+ // batch references to the column family that was dropped after the
1585
+ // insert. We don't want to fail the whole write batch in that case --
1586
+ // we just ignore the update.
1587
+ // That's why we set ignore missing column families to true
1588
+ assert(batch_to_use);
1589
+ assert(has_valid_writes);
1590
+ Status status = WriteBatchInternal::InsertInto(
1591
+ batch_to_use, column_family_memtables_.get(), &flush_scheduler_,
1592
+ &trim_history_scheduler_, true, wal_number, this,
1593
+ false /* concurrent_memtable_writes */, next_sequence, has_valid_writes,
1594
+ seq_per_batch_, batch_per_txn_);
1595
+ return status;
1596
+ }
1597
+
1598
+ Status DBImpl::MaybeWriteLevel0TableForRecovery(
1599
+ bool has_valid_writes, bool read_only, uint64_t wal_number, int job_id,
1600
+ SequenceNumber const* const next_sequence,
1601
+ std::unordered_map<int, VersionEdit>* version_edits, bool* flushed) {
1602
+ assert(next_sequence);
1603
+ assert(version_edits);
1604
+ assert(flushed);
1605
+
1606
+ Status status;
1607
+ if (has_valid_writes && !read_only) {
1608
+ // we can do this because this is called before client has access to the
1609
+ // DB and there is only a single thread operating on DB
1610
+ ColumnFamilyData* cfd;
1611
+
1612
+ while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
1613
+ cfd->UnrefAndTryDelete();
1614
+ // If this asserts, it means that InsertInto failed in
1615
+ // filtering updates to already-flushed column families
1616
+ assert(cfd->GetLogNumber() <= wal_number);
1617
+ (void)wal_number;
1618
+ auto iter = version_edits->find(cfd->GetID());
1619
+ assert(iter != version_edits->end());
1620
+ VersionEdit* edit = &iter->second;
1621
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
1622
+ if (!status.ok()) {
1623
+ // Reflect errors immediately so that conditions like full
1624
+ // file-systems cause the DB::Open() to fail.
1416
1625
  return status;
1417
1626
  }
1627
+ *flushed = true;
1628
+
1629
+ cfd->CreateNewMemtable(*next_sequence - 1);
1630
+ }
1631
+ }
1632
+ return status;
1633
+ }
1634
+
1635
+ Status DBImpl::HandleNonOkStatusOrOldLogRecord(
1636
+ uint64_t wal_number, SequenceNumber const* const next_sequence,
1637
+ Status status, const DBOpenLogRecordReadReporter& reporter,
1638
+ bool* old_log_record, bool* stop_replay_for_corruption,
1639
+ uint64_t* corrupted_wal_number, bool* corrupted_wal_found) {
1640
+ assert(!status.ok() || *old_log_record);
1641
+
1642
+ assert(next_sequence);
1643
+ assert(old_log_record);
1644
+ assert(stop_replay_for_corruption);
1645
+ assert(corrupted_wal_number);
1646
+
1647
+ if (status.IsNotSupported()) {
1648
+ // We should not treat NotSupported as corruption. It is rather a clear
1649
+ // sign that we are processing a WAL that is produced by an incompatible
1650
+ // version of the code.
1651
+ return status;
1652
+ }
1653
+
1654
+ if (immutable_db_options_.wal_recovery_mode ==
1655
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
1656
+ // We should ignore all errors unconditionally
1657
+ return Status::OK();
1658
+ } else if (immutable_db_options_.wal_recovery_mode ==
1659
+ WALRecoveryMode::kPointInTimeRecovery) {
1660
+ if (status.IsIOError()) {
1661
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
1662
+ "IOError during point-in-time reading log #%" PRIu64
1663
+ " seq #%" PRIu64
1664
+ ". %s. This likely mean loss of synced WAL, "
1665
+ "thus recovery fails.",
1666
+ wal_number, *next_sequence, status.ToString().c_str());
1667
+ return status;
1668
+ }
1669
+ // We should ignore the error but not continue replaying
1670
+ *old_log_record = false;
1671
+ *stop_replay_for_corruption = true;
1672
+ // TODO(hx235): have a single source of corrupted WAL number once we
1673
+ // consolidate the statuses
1674
+ uint64_t reporter_corrupted_wal_number = reporter.GetCorruptedLogNumber();
1675
+ *corrupted_wal_number = reporter_corrupted_wal_number != kMaxSequenceNumber
1676
+ ? reporter_corrupted_wal_number
1677
+ : wal_number;
1678
+ if (corrupted_wal_found != nullptr) {
1679
+ *corrupted_wal_found = true;
1418
1680
  }
1681
+ return Status::OK();
1682
+ } else {
1683
+ assert(immutable_db_options_.wal_recovery_mode ==
1684
+ WALRecoveryMode::kTolerateCorruptedTailRecords ||
1685
+ immutable_db_options_.wal_recovery_mode ==
1686
+ WALRecoveryMode::kAbsoluteConsistency);
1687
+ return status;
1688
+ }
1689
+ }
1690
+
1691
+ Status DBImpl::UpdatePredecessorWALInfo(
1692
+ uint64_t wal_number, const SequenceNumber last_seqno_observed,
1693
+ const std::string& fname, PredecessorWALInfo& predecessor_wal_info) {
1694
+ uint64_t bytes;
1419
1695
 
1696
+ Status s = env_->GetFileSize(fname, &bytes);
1697
+ if (!s.ok()) {
1698
+ return s;
1699
+ }
1700
+
1701
+ SequenceNumber mock_seqno = kMaxSequenceNumber;
1702
+ [[maybe_unused]] std::pair<uint64_t, SequenceNumber*> pair =
1703
+ std::make_pair(wal_number, &mock_seqno);
1704
+ TEST_SYNC_POINT_CALLBACK("DBImpl::UpdatePredecessorWALInfo", &pair);
1705
+ predecessor_wal_info = PredecessorWALInfo(
1706
+ wal_number, bytes,
1707
+ mock_seqno != kMaxSequenceNumber ? mock_seqno : last_seqno_observed);
1708
+
1709
+ return s;
1710
+ }
1711
+
1712
+ void DBImpl::FinishLogFileProcessing(const Status& status,
1713
+ const SequenceNumber* next_sequence) {
1714
+ if (status.ok()) {
1715
+ assert(next_sequence);
1420
1716
  flush_scheduler_.Clear();
1421
1717
  trim_history_scheduler_.Clear();
1422
1718
  auto last_sequence = *next_sequence - 1;
@@ -1427,6 +1723,12 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1427
1723
  versions_->SetLastSequence(last_sequence);
1428
1724
  }
1429
1725
  }
1726
+ }
1727
+
1728
+ Status DBImpl::MaybeHandleStopReplayForCorruptionForInconsistency(
1729
+ bool stop_replay_for_corruption, uint64_t corrupted_wal_number) {
1730
+ Status status;
1731
+
1430
1732
  // Compare the corrupted log number to all columnfamily's current log number.
1431
1733
  // Abort Open() if any column family's log number is greater than
1432
1734
  // the corrupted log number, which means CF contains data beyond the point of
@@ -1462,12 +1764,22 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1462
1764
  ROCKS_LOG_ERROR(immutable_db_options_.info_log,
1463
1765
  "Column family inconsistency: SST file contains data"
1464
1766
  " beyond the point of corruption.");
1465
- return Status::Corruption("SST file is ahead of WALs in CF " +
1466
- cfd->GetName());
1767
+ status = Status::Corruption("SST file is ahead of WALs in CF " +
1768
+ cfd->GetName());
1769
+ return status;
1467
1770
  }
1468
1771
  }
1469
1772
  }
1773
+ return status;
1774
+ }
1775
+
1776
+ Status DBImpl::MaybeFlushFinalMemtableOrRestoreActiveLogFiles(
1777
+ const std::vector<uint64_t>& wal_numbers, bool read_only, int job_id,
1778
+ bool flushed, std::unordered_map<int, VersionEdit>* version_edits,
1779
+ RecoveryContext* recovery_ctx) {
1780
+ assert(version_edits);
1470
1781
 
1782
+ Status status;
1471
1783
  // True if there's any data in the WALs; if not, we can skip re-processing
1472
1784
  // them later
1473
1785
  bool data_seen = false;
@@ -1476,8 +1788,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1476
1788
  // to the DB and can not drop column families while we iterate
1477
1789
  const WalNumber max_wal_number = wal_numbers.back();
1478
1790
  for (auto cfd : *versions_->GetColumnFamilySet()) {
1479
- auto iter = version_edits.find(cfd->GetID());
1480
- assert(iter != version_edits.end());
1791
+ auto iter = version_edits->find(cfd->GetID());
1792
+ assert(iter != version_edits->end());
1481
1793
  VersionEdit* edit = &iter->second;
1482
1794
 
1483
1795
  if (cfd->GetLogNumber() > max_wal_number) {
@@ -1506,8 +1818,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1506
1818
  }
1507
1819
  flushed = true;
1508
1820
 
1509
- cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
1510
- versions_->LastSequence());
1821
+ cfd->CreateNewMemtable(versions_->LastSequence());
1511
1822
  }
1512
1823
  data_seen = true;
1513
1824
  }
@@ -1533,8 +1844,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1533
1844
  assert(recovery_ctx != nullptr);
1534
1845
 
1535
1846
  for (auto* cfd : *versions_->GetColumnFamilySet()) {
1536
- auto iter = version_edits.find(cfd->GetID());
1537
- assert(iter != version_edits.end());
1847
+ auto iter = version_edits->find(cfd->GetID());
1848
+ assert(iter != version_edits->end());
1538
1849
  recovery_ctx->UpdateVersionEdits(cfd, iter->second);
1539
1850
  }
1540
1851
 
@@ -1567,16 +1878,32 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1567
1878
  .PermitUncheckedError();
1568
1879
  }
1569
1880
  }
1881
+ return status;
1882
+ }
1883
+
1884
+ Status DBImpl::CheckSeqnoNotSetBackDuringRecovery(
1885
+ SequenceNumber prev_next_seqno, SequenceNumber current_next_seqno) {
1886
+ if (prev_next_seqno == kMaxSequenceNumber ||
1887
+ prev_next_seqno <= current_next_seqno) {
1888
+ return Status::OK();
1889
+ }
1890
+ std::string msg =
1891
+ "Sequence number is being set backwards during recovery, this is likely "
1892
+ "a software bug or a data corruption. Prev next seqno: " +
1893
+ std::to_string(prev_next_seqno) +
1894
+ " , current next seqno: " + std::to_string(current_next_seqno);
1895
+ return Status::Corruption(msg);
1896
+ }
1570
1897
 
1898
+ void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) {
1571
1899
  event_logger_.Log() << "job" << job_id << "event"
1572
- << "recovery_finished";
1573
-
1574
- return status;
1900
+ << (status.ok() ? "recovery_finished" : "recovery_failed")
1901
+ << "status" << status.ToString();
1575
1902
  }
1576
1903
 
1577
1904
  Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
1578
- LogFileNumberSize* log_ptr) {
1579
- LogFileNumberSize log(wal_number);
1905
+ WalFileNumberSize* log_ptr) {
1906
+ WalFileNumberSize log(wal_number);
1580
1907
  std::string fname =
1581
1908
  LogFileName(immutable_db_options_.GetWalDir(), wal_number);
1582
1909
  Status s;
@@ -1619,27 +1946,27 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
1619
1946
  assert(immutable_db_options_.avoid_flush_during_recovery);
1620
1947
  // Mark these as alive so they'll be considered for deletion later by
1621
1948
  // FindObsoleteFiles()
1622
- total_log_size_ = 0;
1623
- log_empty_ = false;
1949
+ wals_total_size_.StoreRelaxed(0);
1950
+ wal_empty_ = false;
1624
1951
  uint64_t min_wal_with_unflushed_data =
1625
1952
  versions_->MinLogNumberWithUnflushedData();
1626
1953
  for (auto wal_number : wal_numbers) {
1627
1954
  if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
1628
1955
  // In non-2pc mode, the WAL files not backing unflushed data are not
1629
- // alive, thus should not be added to the alive_log_files_.
1956
+ // alive, thus should not be added to the alive_wal_files_.
1630
1957
  continue;
1631
1958
  }
1632
1959
  // We preallocate space for wals, but then after a crash and restart, those
1633
1960
  // preallocated space are not needed anymore. It is likely only the last
1634
1961
  // log has such preallocated space, so we only truncate for the last log.
1635
- LogFileNumberSize log;
1962
+ WalFileNumberSize log;
1636
1963
  s = GetLogSizeAndMaybeTruncate(
1637
1964
  wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
1638
1965
  if (!s.ok()) {
1639
1966
  break;
1640
1967
  }
1641
- total_log_size_ += log.size;
1642
- alive_log_files_.push_back(log);
1968
+ wals_total_size_.FetchAddRelaxed(log.size);
1969
+ alive_wal_files_.push_back(log);
1643
1970
  }
1644
1971
  return s;
1645
1972
  }
@@ -1672,7 +1999,10 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1672
1999
  assert(ucmp);
1673
2000
  const size_t ts_sz = ucmp->timestamp_size();
1674
2001
  const bool logical_strip_timestamp =
1675
- ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps;
2002
+ ts_sz > 0 && !cfd->ioptions().persist_user_defined_timestamps;
2003
+ // Note that here we treat flush as level 0 compaction in internal stats
2004
+ InternalStats::CompactionStats flush_stats(CompactionReason::kFlush,
2005
+ 1 /* count */);
1676
2006
  {
1677
2007
  ScopedArenaPtr<InternalIterator> iter(
1678
2008
  logical_strip_timestamp
@@ -1688,10 +2018,10 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1688
2018
  cfd->GetName().c_str(), meta.fd.GetNumber());
1689
2019
 
1690
2020
  // Get the latest mutable cf options while the mutex is still locked
1691
- const MutableCFOptions mutable_cf_options =
1692
- *cfd->GetLatestMutableCFOptions();
2021
+ const MutableCFOptions mutable_cf_options_copy =
2022
+ cfd->GetLatestMutableCFOptions();
1693
2023
  bool paranoid_file_checks =
1694
- cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
2024
+ cfd->GetLatestMutableCFOptions().paranoid_file_checks;
1695
2025
 
1696
2026
  int64_t _current_time = 0;
1697
2027
  immutable_db_options_.clock->GetCurrentTime(&_current_time)
@@ -1700,8 +2030,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1700
2030
  meta.oldest_ancester_time = current_time;
1701
2031
  meta.epoch_number = cfd->NewEpochNumber();
1702
2032
  {
1703
- auto write_hint =
1704
- cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
2033
+ auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint(
2034
+ /*level=*/0,
2035
+ immutable_db_options_.calculate_sst_write_lifetime_hint_set);
1705
2036
  mutex_.Unlock();
1706
2037
 
1707
2038
  SequenceNumber earliest_write_conflict_snapshot;
@@ -1733,30 +2064,31 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1733
2064
  const WriteOptions write_option(Env::IO_HIGH, Env::IOActivity::kDBOpen);
1734
2065
 
1735
2066
  TableBuilderOptions tboptions(
1736
- *cfd->ioptions(), mutable_cf_options, read_option, write_option,
2067
+ cfd->ioptions(), mutable_cf_options_copy, read_option, write_option,
1737
2068
  cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(),
1738
- GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
1739
- mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
1740
- 0 /* level */, current_time /* newest_key_time */,
2069
+ GetCompressionFlush(cfd->ioptions(), mutable_cf_options_copy),
2070
+ mutable_cf_options_copy.compression_opts, cfd->GetID(),
2071
+ cfd->GetName(), 0 /* level */, current_time /* newest_key_time */,
1741
2072
  false /* is_bottommost */, TableFileCreationReason::kRecovery,
1742
2073
  0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_,
1743
2074
  db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber(),
1744
2075
  kMaxSequenceNumber);
1745
2076
  Version* version = cfd->current();
1746
2077
  version->Ref();
1747
- uint64_t num_input_entries = 0;
1748
- s = BuildTable(dbname_, versions_.get(), immutable_db_options_, tboptions,
1749
- file_options_for_compaction_, cfd->table_cache(),
1750
- iter.get(), std::move(range_del_iters), &meta,
1751
- &blob_file_additions, snapshot_seqs, earliest_snapshot,
1752
- earliest_write_conflict_snapshot, kMaxSequenceNumber,
1753
- snapshot_checker, paranoid_file_checks,
1754
- cfd->internal_stats(), &io_s, io_tracer_,
1755
- BlobFileCreationReason::kRecovery,
1756
- nullptr /* seqno_to_time_mapping */, &event_logger_,
1757
- job_id, nullptr /* table_properties */, write_hint,
1758
- nullptr /*full_history_ts_low*/, &blob_callback_, version,
1759
- &num_input_entries);
2078
+ TableProperties temp_table_proerties;
2079
+ s = BuildTable(
2080
+ dbname_, versions_.get(), immutable_db_options_, tboptions,
2081
+ file_options_for_compaction_, cfd->table_cache(), iter.get(),
2082
+ std::move(range_del_iters), &meta, &blob_file_additions,
2083
+ snapshot_seqs, earliest_snapshot, earliest_write_conflict_snapshot,
2084
+ kMaxSequenceNumber, snapshot_checker, paranoid_file_checks,
2085
+ cfd->internal_stats(), &io_s, io_tracer_,
2086
+ BlobFileCreationReason::kRecovery,
2087
+ nullptr /* seqno_to_time_mapping */, &event_logger_, job_id,
2088
+ &temp_table_proerties /* table_properties */, write_hint,
2089
+ nullptr /*full_history_ts_low*/, &blob_callback_, version,
2090
+ nullptr /* memtable_payload_bytes */,
2091
+ nullptr /* memtable_garbage_bytes */, &flush_stats);
1760
2092
  version->Unref();
1761
2093
  LogFlush(immutable_db_options_.info_log);
1762
2094
  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
@@ -1772,10 +2104,31 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1772
2104
  }
1773
2105
 
1774
2106
  uint64_t total_num_entries = mem->NumEntries();
1775
- if (s.ok() && total_num_entries != num_input_entries) {
2107
+ if (s.ok() && total_num_entries != flush_stats.num_input_records) {
1776
2108
  std::string msg = "Expected " + std::to_string(total_num_entries) +
1777
2109
  " entries in memtable, but read " +
1778
- std::to_string(num_input_entries);
2110
+ std::to_string(flush_stats.num_input_records);
2111
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
2112
+ "[%s] [JOB %d] Level-0 flush during recover: %s",
2113
+ cfd->GetName().c_str(), job_id, msg.c_str());
2114
+ if (immutable_db_options_.flush_verify_memtable_count) {
2115
+ s = Status::Corruption(msg);
2116
+ }
2117
+ }
2118
+ // Only verify for table formats that collect table properties
2119
+ const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
2120
+ if (s.ok() &&
2121
+ (mutable_cf_options.table_factory->IsInstanceOf(
2122
+ TableFactory::kBlockBasedTableName()) ||
2123
+ mutable_cf_options.table_factory->IsInstanceOf(
2124
+ TableFactory::kPlainTableName())) &&
2125
+ flush_stats.num_output_records != temp_table_proerties.num_entries) {
2126
+ std::string msg =
2127
+ "Number of keys in flush output SST files does not match "
2128
+ "number of keys added to the table. Expected " +
2129
+ std::to_string(flush_stats.num_output_records) + " but there are " +
2130
+ std::to_string(temp_table_proerties.num_entries) +
2131
+ " in output SST files";
1779
2132
  ROCKS_LOG_WARN(immutable_db_options_.info_log,
1780
2133
  "[%s] [JOB %d] Level-0 flush during recover: %s",
1781
2134
  cfd->GetName().c_str(), job_id, msg.c_str());
@@ -1823,30 +2176,31 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1823
2176
  }
1824
2177
  }
1825
2178
 
1826
- InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
1827
- stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
2179
+ flush_stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
1828
2180
 
1829
2181
  if (has_output) {
1830
- stats.bytes_written = meta.fd.GetFileSize();
1831
- stats.num_output_files = 1;
2182
+ flush_stats.bytes_written = meta.fd.GetFileSize();
2183
+ flush_stats.num_output_files = 1;
1832
2184
  }
1833
2185
 
1834
2186
  const auto& blobs = edit->GetBlobFileAdditions();
1835
2187
  for (const auto& blob : blobs) {
1836
- stats.bytes_written_blob += blob.GetTotalBlobBytes();
2188
+ flush_stats.bytes_written_blob += blob.GetTotalBlobBytes();
1837
2189
  }
1838
2190
 
1839
- stats.num_output_files_blob = static_cast<int>(blobs.size());
2191
+ flush_stats.num_output_files_blob = static_cast<int>(blobs.size());
1840
2192
 
1841
- cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
2193
+ cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER,
2194
+ flush_stats);
1842
2195
  cfd->internal_stats()->AddCFStats(
1843
2196
  InternalStats::BYTES_FLUSHED,
1844
- stats.bytes_written + stats.bytes_written_blob);
2197
+ flush_stats.bytes_written + flush_stats.bytes_written_blob);
1845
2198
  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
1846
2199
  return s;
1847
2200
  }
1848
2201
 
1849
- Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
2202
+ Status DB::Open(const Options& options, const std::string& dbname,
2203
+ std::unique_ptr<DB>* dbptr) {
1850
2204
  DBOptions db_options(options);
1851
2205
  ColumnFamilyOptions cf_options(options);
1852
2206
  std::vector<ColumnFamilyDescriptor> column_families;
@@ -1874,7 +2228,8 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
1874
2228
 
1875
2229
  Status DB::Open(const DBOptions& db_options, const std::string& dbname,
1876
2230
  const std::vector<ColumnFamilyDescriptor>& column_families,
1877
- std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
2231
+ std::vector<ColumnFamilyHandle*>* handles,
2232
+ std::unique_ptr<DB>* dbptr) {
1878
2233
  const bool kSeqPerBatch = true;
1879
2234
  const bool kBatchPerTxn = true;
1880
2235
  ThreadStatusUtil::SetEnableTracking(db_options.enable_thread_tracking);
@@ -1896,7 +2251,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
1896
2251
  Status DB::OpenAndTrimHistory(
1897
2252
  const DBOptions& db_options, const std::string& dbname,
1898
2253
  const std::vector<ColumnFamilyDescriptor>& column_families,
1899
- std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
2254
+ std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr,
1900
2255
  std::string trim_ts) {
1901
2256
  assert(dbptr != nullptr);
1902
2257
  assert(handles != nullptr);
@@ -1951,13 +2306,14 @@ Status DB::OpenAndTrimHistory(
1951
2306
  return s;
1952
2307
  }
1953
2308
 
1954
- *dbptr = db;
2309
+ dbptr->reset(db);
1955
2310
  return s;
1956
2311
  }
1957
2312
 
1958
2313
  IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
1959
2314
  uint64_t log_file_num, uint64_t recycle_log_number,
1960
2315
  size_t preallocate_block_size,
2316
+ const PredecessorWALInfo& predecessor_wal_info,
1961
2317
  log::Writer** new_log) {
1962
2318
  IOStatus io_s;
1963
2319
  std::unique_ptr<FSWritableFile> lfile;
@@ -1966,6 +2322,7 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
1966
2322
  BuildDBOptions(immutable_db_options_, mutable_db_options_);
1967
2323
  FileOptions opt_file_options =
1968
2324
  fs_->OptimizeForLogWrite(file_options_, db_options);
2325
+ opt_file_options.write_hint = CalculateWALWriteHint();
1969
2326
  // DB option takes precedence when not kUnknown
1970
2327
  if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
1971
2328
  opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
@@ -1987,7 +2344,9 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
1987
2344
  }
1988
2345
 
1989
2346
  if (io_s.ok()) {
1990
- lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
2347
+ // Subsequent attempts to override the hint via SetWriteLifeTimeHint
2348
+ // with the very same value will be ignored by the fs.
2349
+ lfile->SetWriteLifeTimeHint(opt_file_options.write_hint);
1991
2350
  lfile->SetPreallocationBlockSize(preallocate_block_size);
1992
2351
 
1993
2352
  const auto& listeners = immutable_db_options_.listeners;
@@ -2001,9 +2360,15 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
2001
2360
  *new_log = new log::Writer(std::move(file_writer), log_file_num,
2002
2361
  immutable_db_options_.recycle_log_file_num > 0,
2003
2362
  immutable_db_options_.manual_wal_flush,
2004
- immutable_db_options_.wal_compression);
2363
+ immutable_db_options_.wal_compression,
2364
+ immutable_db_options_.track_and_verify_wals);
2005
2365
  io_s = (*new_log)->AddCompressionTypeRecord(write_options);
2366
+ if (io_s.ok()) {
2367
+ io_s = (*new_log)->MaybeAddPredecessorWALInfo(write_options,
2368
+ predecessor_wal_info);
2369
+ }
2006
2370
  }
2371
+
2007
2372
  return io_s;
2008
2373
  }
2009
2374
 
@@ -2014,9 +2379,10 @@ void DBImpl::TrackExistingDataFiles(
2014
2379
 
2015
2380
  Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2016
2381
  const std::vector<ColumnFamilyDescriptor>& column_families,
2017
- std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
2018
- const bool seq_per_batch, const bool batch_per_txn,
2019
- const bool is_retry, bool* can_retry) {
2382
+ std::vector<ColumnFamilyHandle*>* handles,
2383
+ std::unique_ptr<DB>* dbptr, const bool seq_per_batch,
2384
+ const bool batch_per_txn, const bool is_retry,
2385
+ bool* can_retry) {
2020
2386
  const WriteOptions write_options(Env::IOActivity::kDBOpen);
2021
2387
  const ReadOptions read_options(Env::IOActivity::kDBOpen);
2022
2388
 
@@ -2035,15 +2401,17 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2035
2401
  handles->clear();
2036
2402
 
2037
2403
  size_t max_write_buffer_size = 0;
2404
+ MinAndMaxPreserveSeconds preserve_info;
2038
2405
  for (const auto& cf : column_families) {
2039
2406
  max_write_buffer_size =
2040
2407
  std::max(max_write_buffer_size, cf.options.write_buffer_size);
2408
+ preserve_info.Combine(cf.options);
2041
2409
  }
2042
2410
 
2043
- DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn);
2411
+ auto impl = std::make_unique<DBImpl>(db_options, dbname, seq_per_batch,
2412
+ batch_per_txn);
2044
2413
  if (!impl->immutable_db_options_.info_log) {
2045
2414
  s = impl->init_logger_creation_s_;
2046
- delete impl;
2047
2415
  return s;
2048
2416
  } else {
2049
2417
  assert(impl->init_logger_creation_s_.ok());
@@ -2076,7 +2444,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2076
2444
  s = impl->CreateArchivalDirectory();
2077
2445
  }
2078
2446
  if (!s.ok()) {
2079
- delete impl;
2080
2447
  return s;
2081
2448
  }
2082
2449
 
@@ -2096,23 +2463,29 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2096
2463
  log::Writer* new_log = nullptr;
2097
2464
  const size_t preallocate_block_size =
2098
2465
  impl->GetWalPreallocateBlockSize(max_write_buffer_size);
2466
+ // TODO(hx235): Pass in the correct `predecessor_wal_info` for the first WAL
2467
+ // created during DB open with predecessor WALs from previous DB session due
2468
+ // to `avoid_flush_during_recovery == true`. This can protect the last WAL
2469
+ // recovered.
2099
2470
  s = impl->CreateWAL(write_options, new_log_number, 0 /*recycle_log_number*/,
2100
- preallocate_block_size, &new_log);
2471
+ preallocate_block_size,
2472
+ PredecessorWALInfo() /* predecessor_wal_info */,
2473
+ &new_log);
2101
2474
  if (s.ok()) {
2102
2475
  // Prevent log files created by previous instance from being recycled.
2103
2476
  // They might be in alive_log_file_, and might get recycled otherwise.
2104
- impl->min_log_number_to_recycle_ = new_log_number;
2477
+ impl->min_wal_number_to_recycle_ = new_log_number;
2105
2478
  }
2106
2479
  if (s.ok()) {
2107
- InstrumentedMutexLock wl(&impl->log_write_mutex_);
2108
- impl->logfile_number_ = new_log_number;
2480
+ InstrumentedMutexLock wl(&impl->wal_write_mutex_);
2481
+ impl->cur_wal_number_ = new_log_number;
2109
2482
  assert(new_log != nullptr);
2110
2483
  assert(impl->logs_.empty());
2111
2484
  impl->logs_.emplace_back(new_log_number, new_log);
2112
2485
  }
2113
2486
 
2114
2487
  if (s.ok()) {
2115
- impl->alive_log_files_.emplace_back(impl->logfile_number_);
2488
+ impl->alive_wal_files_.emplace_back(impl->cur_wal_number_);
2116
2489
  // In WritePrepared there could be gap in sequence numbers. This breaks
2117
2490
  // the trick we use in kPointInTimeRecovery which assumes the first seq in
2118
2491
  // the log right after the corrupted log is one larger than the last seq
@@ -2125,14 +2498,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2125
2498
  if (recovered_seq != kMaxSequenceNumber) {
2126
2499
  WriteBatch empty_batch;
2127
2500
  WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
2128
- uint64_t log_used, log_size;
2501
+ uint64_t wal_used, log_size;
2129
2502
  log::Writer* log_writer = impl->logs_.back().writer;
2130
- LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back();
2503
+ WalFileNumberSize& wal_file_number_size = impl->alive_wal_files_.back();
2131
2504
 
2132
- assert(log_writer->get_log_number() == log_file_number_size.number);
2505
+ assert(log_writer->get_log_number() == wal_file_number_size.number);
2133
2506
  impl->mutex_.AssertHeld();
2134
- s = impl->WriteToWAL(empty_batch, write_options, log_writer, &log_used,
2135
- &log_size, log_file_number_size);
2507
+ s = impl->WriteToWAL(empty_batch, write_options, log_writer, &wal_used,
2508
+ &log_size, wal_file_number_size, recovered_seq);
2136
2509
  if (s.ok()) {
2137
2510
  // Need to fsync, otherwise it might get lost after a power reset.
2138
2511
  s = impl->FlushWAL(write_options, false);
@@ -2165,6 +2538,12 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2165
2538
  s = impl->InitPersistStatsColumnFamily();
2166
2539
  }
2167
2540
 
2541
+ // After reaching the post-recovery seqno but before creating SuperVersions
2542
+ // ensure seqno to time mapping is pre-populated as needed.
2543
+ if (s.ok() && recovery_ctx.is_new_db_ && preserve_info.IsEnabled()) {
2544
+ impl->PrepopulateSeqnoToTimeMapping(preserve_info);
2545
+ }
2546
+
2168
2547
  if (s.ok()) {
2169
2548
  // set column family handles
2170
2549
  for (const auto& cf : column_families) {
@@ -2172,8 +2551,11 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2172
2551
  impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
2173
2552
  if (cfd != nullptr) {
2174
2553
  handles->push_back(
2175
- new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
2554
+ new ColumnFamilyHandleImpl(cfd, impl.get(), &impl->mutex_));
2176
2555
  impl->NewThreadStatusCfInfo(cfd);
2556
+ SuperVersionContext sv_context(/* create_superversion */ true);
2557
+ impl->InstallSuperVersionForConfigChange(cfd, &sv_context);
2558
+ sv_context.Clean();
2177
2559
  } else {
2178
2560
  if (db_options.create_missing_column_families) {
2179
2561
  // missing column family, create it
@@ -2181,6 +2563,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2181
2563
  impl->mutex_.Unlock();
2182
2564
  // NOTE: the work normally done in WrapUpCreateColumnFamilies will
2183
2565
  // be done separately below.
2566
+ // This includes InstallSuperVersionForConfigChange.
2184
2567
  s = impl->CreateColumnFamilyImpl(read_options, write_options,
2185
2568
  cf.options, cf.name, &handle);
2186
2569
  impl->mutex_.Lock();
@@ -2197,16 +2580,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2197
2580
  }
2198
2581
  }
2199
2582
 
2200
- if (s.ok()) {
2583
+ if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
2584
+ // Install SuperVersion for hidden column family
2585
+ assert(impl->persist_stats_cf_handle_);
2586
+ assert(impl->persist_stats_cf_handle_->cfd());
2201
2587
  SuperVersionContext sv_context(/* create_superversion */ true);
2202
- for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
2203
- impl->InstallSuperVersionAndScheduleWork(
2204
- cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
2205
- }
2588
+ impl->InstallSuperVersionForConfigChange(
2589
+ impl->persist_stats_cf_handle_->cfd(), &sv_context);
2206
2590
  sv_context.Clean();
2207
- }
2208
-
2209
- if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
2210
2591
  // try to read format version
2211
2592
  s = impl->PersistentStatsProcessFormatVersion();
2212
2593
  }
@@ -2216,7 +2597,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2216
2597
  if (!cfd->mem()->IsSnapshotSupported()) {
2217
2598
  impl->is_snapshot_supported_ = false;
2218
2599
  }
2219
- if (cfd->ioptions()->merge_operator != nullptr &&
2600
+ if (cfd->ioptions().merge_operator != nullptr &&
2220
2601
  !cfd->mem()->IsMergeOperatorSupported()) {
2221
2602
  s = Status::InvalidArgument(
2222
2603
  "The memtable of column family %s does not support merge operator "
@@ -2235,7 +2616,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2235
2616
  // The WriteOptionsFile() will release and lock the mutex internally.
2236
2617
  persist_options_status =
2237
2618
  impl->WriteOptionsFile(write_options, true /*db_mutex_already_held*/);
2238
- *dbptr = impl;
2239
2619
  impl->opened_successfully_ = true;
2240
2620
  } else {
2241
2621
  persist_options_status.PermitUncheckedError();
@@ -2286,7 +2666,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2286
2666
 
2287
2667
  if (s.ok()) {
2288
2668
  ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
2289
- impl);
2669
+ impl.get());
2290
2670
  LogFlush(impl->immutable_db_options_.info_log);
2291
2671
  if (!impl->WALBufferIsEmpty()) {
2292
2672
  s = impl->FlushWAL(write_options, false);
@@ -2316,17 +2696,16 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2316
2696
  s = impl->StartPeriodicTaskScheduler();
2317
2697
  }
2318
2698
  if (s.ok()) {
2319
- s = impl->RegisterRecordSeqnoTimeWorker(read_options, write_options,
2320
- recovery_ctx.is_new_db_);
2699
+ s = impl->RegisterRecordSeqnoTimeWorker();
2321
2700
  }
2322
2701
  impl->options_mutex_.Unlock();
2323
- if (!s.ok()) {
2702
+ if (s.ok()) {
2703
+ *dbptr = std::move(impl);
2704
+ } else {
2324
2705
  for (auto* h : *handles) {
2325
2706
  delete h;
2326
2707
  }
2327
2708
  handles->clear();
2328
- delete impl;
2329
- *dbptr = nullptr;
2330
2709
  }
2331
2710
  return s;
2332
2711
  }