@nxtedition/rocksdb 8.2.7 → 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -1
  2. package/deps/rocksdb/rocksdb/Makefile +22 -19
  3. package/deps/rocksdb/rocksdb/TARGETS +8 -0
  4. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +157 -61
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +43 -92
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +632 -455
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +244 -149
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +41 -13
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +11 -1
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +216 -17
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +7 -5
  12. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +279 -199
  13. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +2 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +159 -8
  15. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +28 -2
  16. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +1 -1
  17. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +8 -0
  18. package/deps/rocksdb/rocksdb/crash_test.mk +14 -0
  19. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +1 -1
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -1
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +2 -2
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +1 -1
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +18 -21
  25. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +1 -2
  26. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +1 -1
  27. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +2 -3
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +1 -1
  29. package/deps/rocksdb/rocksdb/db/builder.cc +32 -7
  30. package/deps/rocksdb/rocksdb/db/c.cc +169 -6
  31. package/deps/rocksdb/rocksdb/db/c_test.c +104 -6
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +98 -47
  33. package/deps/rocksdb/rocksdb/db/column_family.h +25 -2
  34. package/deps/rocksdb/rocksdb/db/column_family_test.cc +213 -2
  35. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -1
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +93 -23
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +33 -9
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -6
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -6
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +2 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +107 -43
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -4
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -0
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +4 -2
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +25 -17
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -4
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -11
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +29 -4
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +24 -31
  50. package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +3 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +19 -19
  52. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +2 -1
  53. package/deps/rocksdb/rocksdb/db/convenience.cc +20 -3
  54. package/deps/rocksdb/rocksdb/db/convenience_impl.h +15 -0
  55. package/deps/rocksdb/rocksdb/db/corruption_test.cc +17 -0
  56. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +1 -0
  57. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +17 -3
  58. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +5 -0
  59. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +15 -15
  60. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +666 -44
  61. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +2 -29
  62. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +274 -1
  63. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +40 -19
  64. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +6 -5
  65. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +250 -116
  66. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +51 -23
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +354 -96
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +6 -3
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +2 -1
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +5 -0
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +50 -21
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +26 -13
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +13 -5
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +61 -21
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +8 -87
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +7 -1
  77. package/deps/rocksdb/rocksdb/db/db_iter.cc +2 -2
  78. package/deps/rocksdb/rocksdb/db/db_iter.h +1 -0
  79. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +4 -11
  80. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +6 -6
  81. package/deps/rocksdb/rocksdb/db/db_options_test.cc +39 -29
  82. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +26 -36
  83. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +106 -0
  84. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +12 -3
  85. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +1 -1
  86. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +1 -0
  87. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +279 -166
  88. package/deps/rocksdb/rocksdb/db/db_test.cc +48 -21
  89. package/deps/rocksdb/rocksdb/db/db_test2.cc +81 -12
  90. package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
  91. package/deps/rocksdb/rocksdb/db/db_test_util.h +40 -0
  92. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +13 -1
  93. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +233 -0
  94. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +143 -0
  95. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +6 -6
  96. package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -2
  97. package/deps/rocksdb/rocksdb/db/dbformat.cc +36 -0
  98. package/deps/rocksdb/rocksdb/db/dbformat.h +169 -20
  99. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +129 -0
  100. package/deps/rocksdb/rocksdb/db/error_handler.cc +16 -0
  101. package/deps/rocksdb/rocksdb/db/error_handler.h +6 -3
  102. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +4 -4
  103. package/deps/rocksdb/rocksdb/db/event_helpers.cc +4 -0
  104. package/deps/rocksdb/rocksdb/db/experimental.cc +2 -1
  105. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +4 -4
  106. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +17 -8
  107. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +86 -4
  108. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +1 -1
  109. package/deps/rocksdb/rocksdb/db/file_indexer.cc +2 -4
  110. package/deps/rocksdb/rocksdb/db/flush_job.cc +101 -11
  111. package/deps/rocksdb/rocksdb/db/flush_job.h +24 -1
  112. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +88 -11
  113. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +2 -3
  114. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +159 -91
  115. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +19 -10
  116. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +143 -0
  117. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -1
  118. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  119. package/deps/rocksdb/rocksdb/db/listener_test.cc +2 -1
  120. package/deps/rocksdb/rocksdb/db/log_reader.h +3 -2
  121. package/deps/rocksdb/rocksdb/db/log_test.cc +17 -21
  122. package/deps/rocksdb/rocksdb/db/log_writer.cc +1 -1
  123. package/deps/rocksdb/rocksdb/db/log_writer.h +3 -2
  124. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +4 -3
  125. package/deps/rocksdb/rocksdb/db/memtable.cc +52 -13
  126. package/deps/rocksdb/rocksdb/db/memtable.h +45 -1
  127. package/deps/rocksdb/rocksdb/db/memtable_list.cc +44 -10
  128. package/deps/rocksdb/rocksdb/db/memtable_list.h +32 -1
  129. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +90 -4
  130. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +2 -2
  131. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +1 -0
  132. package/deps/rocksdb/rocksdb/db/repair.cc +21 -4
  133. package/deps/rocksdb/rocksdb/db/repair_test.cc +143 -2
  134. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -4
  135. package/deps/rocksdb/rocksdb/db/table_cache.cc +44 -35
  136. package/deps/rocksdb/rocksdb/db/table_cache.h +6 -6
  137. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
  138. package/deps/rocksdb/rocksdb/db/version_builder.cc +0 -1
  139. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +236 -204
  140. package/deps/rocksdb/rocksdb/db/version_edit.cc +66 -4
  141. package/deps/rocksdb/rocksdb/db/version_edit.h +48 -6
  142. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +80 -8
  143. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +12 -0
  144. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +86 -17
  145. package/deps/rocksdb/rocksdb/db/version_set.cc +136 -41
  146. package/deps/rocksdb/rocksdb/db/version_set.h +28 -7
  147. package/deps/rocksdb/rocksdb/db/version_set_test.cc +25 -15
  148. package/deps/rocksdb/rocksdb/db/write_batch.cc +11 -0
  149. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +3 -0
  150. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +16 -0
  151. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -3
  152. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +2 -0
  153. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +42 -0
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +32 -3
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +7 -0
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +247 -120
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +9 -4
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +13 -6
  159. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +2 -0
  160. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +15 -27
  161. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +264 -69
  162. package/deps/rocksdb/rocksdb/env/env.cc +1 -2
  163. package/deps/rocksdb/rocksdb/env/env_encryption.cc +11 -165
  164. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +0 -17
  165. package/deps/rocksdb/rocksdb/env/env_posix.cc +6 -2
  166. package/deps/rocksdb/rocksdb/env/env_test.cc +86 -2
  167. package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
  168. package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +78 -0
  169. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +34 -0
  170. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -0
  171. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +15 -4
  172. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +52 -43
  173. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +34 -18
  174. package/deps/rocksdb/rocksdb/file/file_util.cc +10 -5
  175. package/deps/rocksdb/rocksdb/file/file_util.h +13 -1
  176. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +724 -79
  177. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +64 -33
  178. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +3 -16
  179. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +23 -12
  180. package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +3 -0
  181. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +2 -1
  182. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +153 -88
  183. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +70 -2
  184. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +50 -11
  185. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -0
  186. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +16 -2
  187. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +1 -1
  188. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +55 -8
  189. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +32 -4
  190. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +9 -109
  191. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +90 -13
  192. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +3 -0
  193. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +85 -17
  194. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +13 -1
  195. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +2 -1
  196. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +5 -1
  197. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +21 -2
  198. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +7 -1
  199. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +6 -0
  200. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +5 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +33 -2
  202. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +14 -0
  203. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +33 -2
  204. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +0 -3
  205. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  206. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +3 -0
  207. package/deps/rocksdb/rocksdb/memory/arena_test.cc +18 -11
  208. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +2 -1
  209. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +69 -34
  210. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +16 -1
  211. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +10 -0
  212. package/deps/rocksdb/rocksdb/options/cf_options.cc +19 -0
  213. package/deps/rocksdb/rocksdb/options/cf_options.h +10 -2
  214. package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -1
  215. package/deps/rocksdb/rocksdb/options/db_options.cc +7 -0
  216. package/deps/rocksdb/rocksdb/options/db_options.h +1 -0
  217. package/deps/rocksdb/rocksdb/options/options.cc +15 -1
  218. package/deps/rocksdb/rocksdb/options/options_helper.cc +6 -0
  219. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -3
  220. package/deps/rocksdb/rocksdb/options/options_test.cc +8 -0
  221. package/deps/rocksdb/rocksdb/port/mmap.h +20 -0
  222. package/deps/rocksdb/rocksdb/port/stack_trace.cc +27 -12
  223. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -1
  224. package/deps/rocksdb/rocksdb/src.mk +3 -0
  225. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  226. package/deps/rocksdb/rocksdb/table/block_based/block.cc +48 -22
  227. package/deps/rocksdb/rocksdb/table/block_based/block.h +60 -12
  228. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +115 -42
  229. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +6 -5
  230. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +60 -2
  231. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +2 -0
  232. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +62 -44
  233. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +36 -14
  234. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -15
  235. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +219 -51
  236. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +41 -8
  237. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -1
  238. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +50 -21
  239. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +11 -4
  240. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +195 -55
  241. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
  242. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +31 -16
  243. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +97 -58
  244. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
  245. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +6 -0
  246. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +27 -12
  247. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +3 -1
  248. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +114 -70
  249. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +1 -2
  250. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +9 -6
  251. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +15 -3
  252. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +6 -3
  253. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +11 -11
  254. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -0
  255. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +1 -0
  256. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +6 -2
  257. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +1 -2
  258. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +2 -3
  259. package/deps/rocksdb/rocksdb/table/format.cc +175 -33
  260. package/deps/rocksdb/rocksdb/table/format.h +63 -10
  261. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +10 -2
  262. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +12 -4
  263. package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -0
  264. package/deps/rocksdb/rocksdb/table/mock_table.cc +8 -3
  265. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +10 -5
  266. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +10 -1
  267. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +1 -2
  268. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +3 -3
  269. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +12 -3
  270. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +26 -1
  271. package/deps/rocksdb/rocksdb/table/table_builder.h +6 -2
  272. package/deps/rocksdb/rocksdb/table/table_properties.cc +6 -0
  273. package/deps/rocksdb/rocksdb/table/table_test.cc +52 -22
  274. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +19 -7
  275. package/deps/rocksdb/rocksdb/test_util/sync_point.h +3 -1
  276. package/deps/rocksdb/rocksdb/test_util/testutil.cc +29 -0
  277. package/deps/rocksdb/rocksdb/test_util/testutil.h +19 -0
  278. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +65 -26
  279. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +8 -5
  280. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -0
  281. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +1 -0
  282. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +0 -1
  283. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +4 -0
  284. package/deps/rocksdb/rocksdb/unreleased_history/README.txt +73 -0
  285. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +27 -0
  286. package/deps/rocksdb/rocksdb/unreleased_history/behavior_changes/.gitkeep +0 -0
  287. package/deps/rocksdb/rocksdb/unreleased_history/bug_fixes/.gitkeep +0 -0
  288. package/deps/rocksdb/rocksdb/unreleased_history/new_features/.gitkeep +0 -0
  289. package/deps/rocksdb/rocksdb/unreleased_history/performance_improvements/.gitkeep +0 -0
  290. package/deps/rocksdb/rocksdb/unreleased_history/public_api_changes/.gitkeep +0 -0
  291. package/deps/rocksdb/rocksdb/unreleased_history/release.sh +104 -0
  292. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +5 -0
  293. package/deps/rocksdb/rocksdb/util/bloom_impl.h +3 -3
  294. package/deps/rocksdb/rocksdb/util/cast_util.h +14 -0
  295. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -0
  296. package/deps/rocksdb/rocksdb/util/comparator.cc +29 -7
  297. package/deps/rocksdb/rocksdb/util/compression.cc +4 -4
  298. package/deps/rocksdb/rocksdb/util/compression.h +110 -32
  299. package/deps/rocksdb/rocksdb/util/core_local.h +2 -1
  300. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +4 -4
  301. package/deps/rocksdb/rocksdb/util/filelock_test.cc +3 -0
  302. package/deps/rocksdb/rocksdb/util/hash.h +7 -3
  303. package/deps/rocksdb/rocksdb/util/hash_test.cc +44 -0
  304. package/deps/rocksdb/rocksdb/util/math.h +58 -6
  305. package/deps/rocksdb/rocksdb/util/math128.h +29 -7
  306. package/deps/rocksdb/rocksdb/util/mutexlock.h +35 -27
  307. package/deps/rocksdb/rocksdb/util/single_thread_executor.h +1 -0
  308. package/deps/rocksdb/rocksdb/util/stop_watch.h +1 -1
  309. package/deps/rocksdb/rocksdb/util/thread_operation.h +8 -1
  310. package/deps/rocksdb/rocksdb/util/udt_util.cc +343 -0
  311. package/deps/rocksdb/rocksdb/util/udt_util.h +173 -1
  312. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +447 -0
  313. package/deps/rocksdb/rocksdb/util/write_batch_util.cc +25 -0
  314. package/deps/rocksdb/rocksdb/util/write_batch_util.h +80 -0
  315. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -4
  316. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +69 -25
  317. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +7 -6
  318. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +1 -1
  319. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +2 -3
  320. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +6 -11
  321. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -2
  322. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +4 -5
  323. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +1 -1
  324. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +2 -2
  325. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +2 -1
  326. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +3 -3
  327. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +1 -2
  328. package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +2 -3
  329. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +2 -2
  330. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
  331. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +23 -8
  332. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +9 -6
  333. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +37 -12
  334. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +231 -33
  335. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +0 -1
  336. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +76 -20
  337. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +18 -9
  338. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +40 -23
  339. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +13 -12
  340. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +7 -0
  341. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -1
  342. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +41 -11
  343. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +6 -3
  344. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +71 -24
  345. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +19 -4
  346. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +60 -107
  347. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +39 -11
  348. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +6 -3
  349. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +14 -8
  350. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.h +1 -1
  351. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +10 -5
  352. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  353. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +1 -1
  354. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +2 -1
  355. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +6 -6
  356. package/deps/rocksdb/rocksdb.gyp +2 -0
  357. package/package.json +1 -1
  358. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  359. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -12,10 +12,13 @@
12
12
 
13
13
  #include <atomic>
14
14
  #include <functional>
15
+ #include <memory>
15
16
  #include <mutex>
16
17
  #include <thread>
17
18
 
18
19
  #include "port/port.h"
20
+ #include "util/fastrange.h"
21
+ #include "util/hash.h"
19
22
 
20
23
  namespace ROCKSDB_NAMESPACE {
21
24
 
@@ -129,10 +132,25 @@ class SpinMutex {
129
132
  std::atomic<bool> locked_;
130
133
  };
131
134
 
132
- // We want to prevent false sharing
135
+ // For preventing false sharing, especially for mutexes.
136
+ // NOTE: if a mutex is less than half the size of a cache line, it would
137
+ // make more sense for Striped structure below to pack more than one mutex
138
+ // into each cache line, as this would only reduce contention for the same
139
+ // amount of space and cache sharing. However, a mutex is often 40 bytes out
140
+ // of a 64 byte cache line.
133
141
  template <class T>
134
- struct ALIGN_AS(CACHE_LINE_SIZE) LockData {
135
- T lock_;
142
+ struct ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedWrapper {
143
+ T obj_;
144
+ };
145
+ template <class T>
146
+ struct Unwrap {
147
+ using type = T;
148
+ static type &Go(T &t) { return t; }
149
+ };
150
+ template <class T>
151
+ struct Unwrap<CacheAlignedWrapper<T>> {
152
+ using type = T;
153
+ static type &Go(CacheAlignedWrapper<T> &t) { return t.obj_; }
136
154
  };
137
155
 
138
156
  //
@@ -144,38 +162,28 @@ struct ALIGN_AS(CACHE_LINE_SIZE) LockData {
144
162
  // single lock and allowing independent operations to lock different stripes and
145
163
  // proceed concurrently, instead of creating contention for a single lock.
146
164
  //
147
- template <class T, class P>
165
+ template <class T, class Key = Slice, class Hash = SliceNPHasher64>
148
166
  class Striped {
149
167
  public:
150
- Striped(size_t stripes, std::function<uint64_t(const P &)> hash)
151
- : stripes_(stripes), hash_(hash) {
152
- locks_ = reinterpret_cast<LockData<T> *>(
153
- port::cacheline_aligned_alloc(sizeof(LockData<T>) * stripes));
154
- for (size_t i = 0; i < stripes; i++) {
155
- new (&locks_[i]) LockData<T>();
156
- }
157
- }
168
+ explicit Striped(size_t stripe_count)
169
+ : stripe_count_(stripe_count), data_(new T[stripe_count]) {}
158
170
 
159
- virtual ~Striped() {
160
- if (locks_ != nullptr) {
161
- assert(stripes_ > 0);
162
- for (size_t i = 0; i < stripes_; i++) {
163
- locks_[i].~LockData<T>();
164
- }
165
- port::cacheline_aligned_free(locks_);
166
- }
171
+ using Unwrapped = typename Unwrap<T>::type;
172
+ Unwrapped &Get(const Key &key, uint64_t seed = 0) {
173
+ size_t index = FastRangeGeneric(hash_(key, seed), stripe_count_);
174
+ return Unwrap<T>::Go(data_[index]);
167
175
  }
168
176
 
169
- T *get(const P &key) {
170
- uint64_t h = hash_(key);
171
- size_t index = h % stripes_;
172
- return &reinterpret_cast<LockData<T> *>(&locks_[index])->lock_;
177
+ size_t ApproximateMemoryUsage() const {
178
+ // NOTE: could use malloc_usable_size() here, but that could count unmapped
179
+ // pages and could mess up unit test OccLockBucketsTest::CacheAligned
180
+ return sizeof(*this) + stripe_count_ * sizeof(T);
173
181
  }
174
182
 
175
183
  private:
176
- size_t stripes_;
177
- LockData<T> *locks_;
178
- std::function<uint64_t(const P &)> hash_;
184
+ size_t stripe_count_;
185
+ std::unique_ptr<T[]> data_;
186
+ Hash hash_;
179
187
  };
180
188
 
181
189
  } // namespace ROCKSDB_NAMESPACE
@@ -8,6 +8,7 @@
8
8
 
9
9
  #if USE_COROUTINES
10
10
  #include <atomic>
11
+ #include <queue>
11
12
 
12
13
  #include "folly/CPortability.h"
13
14
  #include "folly/CppAttributes.h"
@@ -32,7 +32,7 @@ class StopWatch {
32
32
  elapsed_(elapsed),
33
33
  overwrite_(overwrite),
34
34
  stats_enabled_(statistics &&
35
- statistics->get_stats_level() >=
35
+ statistics->get_stats_level() >
36
36
  StatsLevel::kExceptTimers &&
37
37
  (hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX ||
38
38
  hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX)),
@@ -39,7 +39,14 @@ static OperationInfo global_operation_table[] = {
39
39
  {ThreadStatus::OP_UNKNOWN, ""},
40
40
  {ThreadStatus::OP_COMPACTION, "Compaction"},
41
41
  {ThreadStatus::OP_FLUSH, "Flush"},
42
- {ThreadStatus::OP_DBOPEN, "DBOpen"}};
42
+ {ThreadStatus::OP_DBOPEN, "DBOpen"},
43
+ {ThreadStatus::OP_GET, "Get"},
44
+ {ThreadStatus::OP_MULTIGET, "MultiGet"},
45
+ {ThreadStatus::OP_DBITERATOR, "DBIterator"},
46
+ {ThreadStatus::OP_VERIFY_DB_CHECKSUM, "VerifyDBChecksum"},
47
+ {ThreadStatus::OP_VERIFY_FILE_CHECKSUMS, "VerifyFileChecksums"},
48
+
49
+ };
43
50
 
44
51
  struct OperationStageInfo {
45
52
  const ThreadStatus::OperationStage stage;
@@ -0,0 +1,343 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ //
3
+ // This source code is licensed under both the GPLv2 (found in the
4
+ // COPYING file in the root directory) and Apache 2.0 License
5
+ // (found in the LICENSE.Apache file in the root directory).
6
+
7
+ #include "util/udt_util.h"
8
+
9
+ #include "db/dbformat.h"
10
+ #include "rocksdb/types.h"
11
+ #include "util/write_batch_util.h"
12
+
13
+ namespace ROCKSDB_NAMESPACE {
14
+ namespace {
15
+ enum class RecoveryType {
16
+ kNoop,
17
+ kUnrecoverable,
18
+ kStripTimestamp,
19
+ kPadTimestamp,
20
+ };
21
+
22
+ RecoveryType GetRecoveryType(const size_t running_ts_sz,
23
+ const std::optional<size_t>& recorded_ts_sz) {
24
+ if (running_ts_sz == 0) {
25
+ if (!recorded_ts_sz.has_value()) {
26
+ // A column family id not recorded is equivalent to that column family having
27
+ // zero timestamp size.
28
+ return RecoveryType::kNoop;
29
+ }
30
+ return RecoveryType::kStripTimestamp;
31
+ }
32
+
33
+ assert(running_ts_sz != 0);
34
+
35
+ if (!recorded_ts_sz.has_value()) {
36
+ return RecoveryType::kPadTimestamp;
37
+ }
38
+
39
+ if (running_ts_sz != *recorded_ts_sz) {
40
+ return RecoveryType::kUnrecoverable;
41
+ }
42
+
43
+ return RecoveryType::kNoop;
44
+ }
45
+
46
+ bool AllRunningColumnFamiliesConsistent(
47
+ const UnorderedMap<uint32_t, size_t>& running_ts_sz,
48
+ const UnorderedMap<uint32_t, size_t>& record_ts_sz) {
49
+ for (const auto& [cf_id, ts_sz] : running_ts_sz) {
50
+ auto record_it = record_ts_sz.find(cf_id);
51
+ RecoveryType recovery_type =
52
+ GetRecoveryType(ts_sz, record_it != record_ts_sz.end()
53
+ ? std::optional<size_t>(record_it->second)
54
+ : std::nullopt);
55
+ if (recovery_type != RecoveryType::kNoop) {
56
+ return false;
57
+ }
58
+ }
59
+ return true;
60
+ }
61
+
62
+ Status CheckWriteBatchTimestampSizeConsistency(
63
+ const WriteBatch* batch,
64
+ const UnorderedMap<uint32_t, size_t>& running_ts_sz,
65
+ const UnorderedMap<uint32_t, size_t>& record_ts_sz,
66
+ TimestampSizeConsistencyMode check_mode, bool* ts_need_recovery) {
67
+ std::vector<uint32_t> column_family_ids;
68
+ Status status =
69
+ CollectColumnFamilyIdsFromWriteBatch(*batch, &column_family_ids);
70
+ if (!status.ok()) {
71
+ return status;
72
+ }
73
+ for (const auto& cf_id : column_family_ids) {
74
+ auto running_iter = running_ts_sz.find(cf_id);
75
+ if (running_iter == running_ts_sz.end()) {
76
+ // Ignore dropped column family referred to in a WriteBatch regardless of
77
+ // its consistency.
78
+ continue;
79
+ }
80
+ auto record_iter = record_ts_sz.find(cf_id);
81
+ RecoveryType recovery_type = GetRecoveryType(
82
+ running_iter->second, record_iter != record_ts_sz.end()
83
+ ? std::optional<size_t>(record_iter->second)
84
+ : std::nullopt);
85
+ if (recovery_type != RecoveryType::kNoop) {
86
+ if (check_mode == TimestampSizeConsistencyMode::kVerifyConsistency) {
87
+ return Status::InvalidArgument(
88
+ "WriteBatch contains timestamp size inconsistency.");
89
+ }
90
+
91
+ if (recovery_type == RecoveryType::kUnrecoverable) {
92
+ return Status::InvalidArgument(
93
+ "WriteBatch contains unrecoverable timestamp size inconsistency.");
94
+ }
95
+
96
+ // If any column family needs reconciliation, it will mark the whole
97
+ // WriteBatch as needing recovery and a rebuild.
98
+ *ts_need_recovery = true;
99
+ }
100
+ }
101
+ return Status::OK();
102
+ }
103
+
104
+ enum class ToggleUDT {
105
+ kUnchanged,
106
+ kEnableUDT,
107
+ kDisableUDT,
108
+ kInvalidChange,
109
+ };
110
+
111
+ ToggleUDT CompareComparator(const Comparator* new_comparator,
112
+ const std::string& old_comparator_name) {
113
+ static const char* kUDTSuffix = ".u64ts";
114
+ static const Slice kSuffixSlice = kUDTSuffix;
115
+ static const size_t kSuffixSize = 6;
116
+ size_t ts_sz = new_comparator->timestamp_size();
117
+ (void)ts_sz;
118
+ Slice new_ucmp_name(new_comparator->Name());
119
+ Slice old_ucmp_name(old_comparator_name);
120
+ if (new_ucmp_name.compare(old_ucmp_name) == 0) {
121
+ return ToggleUDT::kUnchanged;
122
+ }
123
+ if (new_ucmp_name.size() == old_ucmp_name.size() + kSuffixSize &&
124
+ new_ucmp_name.starts_with(old_ucmp_name) &&
125
+ new_ucmp_name.ends_with(kSuffixSlice)) {
126
+ assert(ts_sz == 8);
127
+ return ToggleUDT::kEnableUDT;
128
+ }
129
+ if (old_ucmp_name.size() == new_ucmp_name.size() + kSuffixSize &&
130
+ old_ucmp_name.starts_with(new_ucmp_name) &&
131
+ old_ucmp_name.ends_with(kSuffixSlice)) {
132
+ assert(ts_sz == 0);
133
+ return ToggleUDT::kDisableUDT;
134
+ }
135
+ return ToggleUDT::kInvalidChange;
136
+ }
137
+ } // namespace
138
+
139
+ TimestampRecoveryHandler::TimestampRecoveryHandler(
140
+ const UnorderedMap<uint32_t, size_t>& running_ts_sz,
141
+ const UnorderedMap<uint32_t, size_t>& record_ts_sz)
142
+ : running_ts_sz_(running_ts_sz),
143
+ record_ts_sz_(record_ts_sz),
144
+ new_batch_(new WriteBatch()),
145
+ handler_valid_(true),
146
+ new_batch_diff_from_orig_batch_(false) {}
147
+
148
+ Status TimestampRecoveryHandler::PutCF(uint32_t cf, const Slice& key,
149
+ const Slice& value) {
150
+ std::string new_key_buf;
151
+ Slice new_key;
152
+ Status status =
153
+ ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
154
+ if (!status.ok()) {
155
+ return status;
156
+ }
157
+ return WriteBatchInternal::Put(new_batch_.get(), cf, new_key, value);
158
+ }
159
+
160
+ Status TimestampRecoveryHandler::DeleteCF(uint32_t cf, const Slice& key) {
161
+ std::string new_key_buf;
162
+ Slice new_key;
163
+ Status status =
164
+ ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
165
+ if (!status.ok()) {
166
+ return status;
167
+ }
168
+ return WriteBatchInternal::Delete(new_batch_.get(), cf, new_key);
169
+ }
170
+
171
+ Status TimestampRecoveryHandler::SingleDeleteCF(uint32_t cf, const Slice& key) {
172
+ std::string new_key_buf;
173
+ Slice new_key;
174
+ Status status =
175
+ ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
176
+ if (!status.ok()) {
177
+ return status;
178
+ }
179
+ return WriteBatchInternal::SingleDelete(new_batch_.get(), cf, new_key);
180
+ }
181
+
182
+ Status TimestampRecoveryHandler::DeleteRangeCF(uint32_t cf,
183
+ const Slice& begin_key,
184
+ const Slice& end_key) {
185
+ std::string new_begin_key_buf;
186
+ Slice new_begin_key;
187
+ std::string new_end_key_buf;
188
+ Slice new_end_key;
189
+ Status status = ReconcileTimestampDiscrepancy(
190
+ cf, begin_key, &new_begin_key_buf, &new_begin_key);
191
+ if (!status.ok()) {
192
+ return status;
193
+ }
194
+ status = ReconcileTimestampDiscrepancy(cf, end_key, &new_end_key_buf,
195
+ &new_end_key);
196
+ if (!status.ok()) {
197
+ return status;
198
+ }
199
+ return WriteBatchInternal::DeleteRange(new_batch_.get(), cf, new_begin_key,
200
+ new_end_key);
201
+ }
202
+
203
+ Status TimestampRecoveryHandler::MergeCF(uint32_t cf, const Slice& key,
204
+ const Slice& value) {
205
+ std::string new_key_buf;
206
+ Slice new_key;
207
+ Status status =
208
+ ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
209
+ if (!status.ok()) {
210
+ return status;
211
+ }
212
+ return WriteBatchInternal::Merge(new_batch_.get(), cf, new_key, value);
213
+ }
214
+
215
+ Status TimestampRecoveryHandler::PutBlobIndexCF(uint32_t cf, const Slice& key,
216
+ const Slice& value) {
217
+ std::string new_key_buf;
218
+ Slice new_key;
219
+ Status status =
220
+ ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key);
221
+ if (!status.ok()) {
222
+ return status;
223
+ }
224
+ return WriteBatchInternal::PutBlobIndex(new_batch_.get(), cf, new_key, value);
225
+ }
226
+
227
+ Status TimestampRecoveryHandler::ReconcileTimestampDiscrepancy(
228
+ uint32_t cf, const Slice& key, std::string* new_key_buf, Slice* new_key) {
229
+ assert(handler_valid_);
230
+ auto running_iter = running_ts_sz_.find(cf);
231
+ if (running_iter == running_ts_sz_.end()) {
232
+ // The column family referred to by the WriteBatch is no longer running.
233
+ // Copy over the entry as is to the new WriteBatch.
234
+ *new_key = key;
235
+ return Status::OK();
236
+ }
237
+ size_t running_ts_sz = running_iter->second;
238
+ auto record_iter = record_ts_sz_.find(cf);
239
+ std::optional<size_t> record_ts_sz =
240
+ record_iter != record_ts_sz_.end()
241
+ ? std::optional<size_t>(record_iter->second)
242
+ : std::nullopt;
243
+ RecoveryType recovery_type = GetRecoveryType(running_ts_sz, record_ts_sz);
244
+
245
+ switch (recovery_type) {
246
+ case RecoveryType::kNoop:
247
+ *new_key = key;
248
+ break;
249
+ case RecoveryType::kStripTimestamp:
250
+ assert(record_ts_sz.has_value());
251
+ *new_key = StripTimestampFromUserKey(key, *record_ts_sz);
252
+ new_batch_diff_from_orig_batch_ = true;
253
+ break;
254
+ case RecoveryType::kPadTimestamp:
255
+ AppendKeyWithMinTimestamp(new_key_buf, key, running_ts_sz);
256
+ *new_key = *new_key_buf;
257
+ new_batch_diff_from_orig_batch_ = true;
258
+ break;
259
+ case RecoveryType::kUnrecoverable:
260
+ return Status::InvalidArgument(
261
+ "Unrecoverable timestamp size inconsistency encountered by "
262
+ "TimestampRecoveryHandler.");
263
+ default:
264
+ assert(false);
265
+ }
266
+ return Status::OK();
267
+ }
268
+
269
+ Status HandleWriteBatchTimestampSizeDifference(
270
+ const WriteBatch* batch,
271
+ const UnorderedMap<uint32_t, size_t>& running_ts_sz,
272
+ const UnorderedMap<uint32_t, size_t>& record_ts_sz,
273
+ TimestampSizeConsistencyMode check_mode,
274
+ std::unique_ptr<WriteBatch>* new_batch) {
275
+ // Quick path to bypass checking the WriteBatch.
276
+ if (AllRunningColumnFamiliesConsistent(running_ts_sz, record_ts_sz)) {
277
+ return Status::OK();
278
+ }
279
+ bool need_recovery = false;
280
+ Status status = CheckWriteBatchTimestampSizeConsistency(
281
+ batch, running_ts_sz, record_ts_sz, check_mode, &need_recovery);
282
+ if (!status.ok()) {
283
+ return status;
284
+ } else if (need_recovery) {
285
+ assert(new_batch);
286
+ SequenceNumber sequence = WriteBatchInternal::Sequence(batch);
287
+ TimestampRecoveryHandler recovery_handler(running_ts_sz, record_ts_sz);
288
+ status = batch->Iterate(&recovery_handler);
289
+ if (!status.ok()) {
290
+ return status;
291
+ } else {
292
+ *new_batch = recovery_handler.TransferNewBatch();
293
+ WriteBatchInternal::SetSequence(new_batch->get(), sequence);
294
+ }
295
+ }
296
+ return Status::OK();
297
+ }
298
+
299
+ Status ValidateUserDefinedTimestampsOptions(
300
+ const Comparator* new_comparator, const std::string& old_comparator_name,
301
+ bool new_persist_udt, bool old_persist_udt,
302
+ bool* mark_sst_files_has_no_udt) {
303
+ size_t ts_sz = new_comparator->timestamp_size();
304
+ ToggleUDT res = CompareComparator(new_comparator, old_comparator_name);
305
+ switch (res) {
306
+ case ToggleUDT::kUnchanged:
307
+ if (old_persist_udt == new_persist_udt) {
308
+ return Status::OK();
309
+ }
310
+ if (ts_sz == 0) {
311
+ return Status::OK();
312
+ }
313
+ return Status::InvalidArgument(
314
+ "Cannot toggle the persist_user_defined_timestamps flag for a column "
315
+ "family with user-defined timestamps feature enabled.");
316
+ case ToggleUDT::kEnableUDT:
317
+ if (!new_persist_udt) {
318
+ *mark_sst_files_has_no_udt = true;
319
+ return Status::OK();
320
+ }
321
+ return Status::InvalidArgument(
322
+ "Cannot open a column family and enable user-defined timestamps "
323
+ "feature without setting persist_user_defined_timestamps flag to "
324
+ "false.");
325
+ case ToggleUDT::kDisableUDT:
326
+ if (!old_persist_udt) {
327
+ return Status::OK();
328
+ }
329
+ return Status::InvalidArgument(
330
+ "Cannot open a column family and disable user-defined timestamps "
331
+ "feature if its existing persist_user_defined_timestamps flag is not "
332
+ "false.");
333
+ case ToggleUDT::kInvalidChange:
334
+ return Status::InvalidArgument(
335
+ new_comparator->Name(),
336
+ "does not match existing comparator " + old_comparator_name);
337
+ default:
338
+ break;
339
+ }
340
+ return Status::InvalidArgument(
341
+ "Unsupported user defined timestamps settings change.");
342
+ }
343
+ } // namespace ROCKSDB_NAMESPACE
@@ -1,15 +1,22 @@
1
- // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ //
2
3
  // This source code is licensed under both the GPLv2 (found in the
3
4
  // COPYING file in the root directory) and Apache 2.0 License
4
5
  // (found in the LICENSE.Apache file in the root directory).
5
6
 
6
7
  #pragma once
8
+ #include <memory>
9
+ #include <optional>
7
10
  #include <sstream>
11
+ #include <unordered_map>
8
12
  #include <vector>
9
13
 
14
+ #include "db/write_batch_internal.h"
10
15
  #include "rocksdb/slice.h"
11
16
  #include "rocksdb/status.h"
17
+ #include "rocksdb/write_batch.h"
12
18
  #include "util/coding.h"
19
+ #include "util/hash_containers.h"
13
20
 
14
21
  namespace ROCKSDB_NAMESPACE {
15
22
 
@@ -74,4 +81,169 @@ class UserDefinedTimestampSizeRecord {
74
81
  std::vector<std::pair<uint32_t, size_t>> cf_to_ts_sz_;
75
82
  };
76
83
 
84
+ // This handler is used to recover a WriteBatch read from WAL logs during
85
+ // recovery. It does a best-effort recovery if the column families contained in
86
+ // the WriteBatch have inconsistency between the recorded timestamp size and the
87
+ // running timestamp size. And creates a new WriteBatch that are consistent with
88
+ // the running timestamp size with entries from the original WriteBatch.
89
+ //
90
+ // Note that for a WriteBatch with no inconsistency, a new WriteBatch is created
91
+ // nonetheless, and it should be exactly the same as the original WriteBatch.
92
+ //
93
+ // To access the new WriteBatch, invoke `TransferNewBatch` after calling
94
+ // `Iterate`. The handler becomes invalid afterwards.
95
+ //
96
+ // For the user key in each entry, the best effort recovery means:
97
+ // 1) If recorded timestamp size is 0, running timestamp size is > 0, a min
98
+ // timestamp of length running timestamp size is padded to the user key.
99
+ // 2) If recorded timestamp size is > 0, running timestamp size is 0, the last
100
+ // bytes of length recorded timestamp size are stripped from the user key.
101
+ // 3) If recorded timestamp size is the same as running timestamp size, no-op.
102
+ // 4) If recorded timestamp size and running timestamp size are both non-zero
103
+ // but not equal, return Status::InvalidArgument.
104
+ class TimestampRecoveryHandler : public WriteBatch::Handler {
105
+ public:
106
+ TimestampRecoveryHandler(const UnorderedMap<uint32_t, size_t>& running_ts_sz,
107
+ const UnorderedMap<uint32_t, size_t>& record_ts_sz);
108
+
109
+ ~TimestampRecoveryHandler() override {}
110
+
111
+ // No copy or move.
112
+ TimestampRecoveryHandler(const TimestampRecoveryHandler&) = delete;
113
+ TimestampRecoveryHandler(TimestampRecoveryHandler&&) = delete;
114
+ TimestampRecoveryHandler& operator=(const TimestampRecoveryHandler&) = delete;
115
+ TimestampRecoveryHandler& operator=(TimestampRecoveryHandler&&) = delete;
116
+
117
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override;
118
+
119
+ Status DeleteCF(uint32_t cf, const Slice& key) override;
120
+
121
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override;
122
+
123
+ Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
124
+ const Slice& end_key) override;
125
+
126
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override;
127
+
128
+ Status PutBlobIndexCF(uint32_t cf, const Slice& key,
129
+ const Slice& value) override;
130
+
131
+ Status MarkBeginPrepare(bool) override { return Status::OK(); }
132
+
133
+ Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
134
+
135
+ Status MarkCommit(const Slice&) override { return Status::OK(); }
136
+
137
+ Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
138
+ return Status::OK();
139
+ }
140
+
141
+ Status MarkRollback(const Slice&) override { return Status::OK(); }
142
+
143
+ Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
144
+
145
+ std::unique_ptr<WriteBatch>&& TransferNewBatch() {
146
+ assert(new_batch_diff_from_orig_batch_);
147
+ handler_valid_ = false;
148
+ return std::move(new_batch_);
149
+ }
150
+
151
+ private:
152
+ Status ReconcileTimestampDiscrepancy(uint32_t cf, const Slice& key,
153
+ std::string* new_key_buf,
154
+ Slice* new_key);
155
+
156
+ // Mapping from column family id to user-defined timestamp size for all
157
+ // running column families including the ones with zero timestamp size.
158
+ const UnorderedMap<uint32_t, size_t>& running_ts_sz_;
159
+
160
+ // Mapping from column family id to user-defined timestamp size as recorded
161
+ // in the WAL. This only contains non-zero user-defined timestamp size.
162
+ const UnorderedMap<uint32_t, size_t>& record_ts_sz_;
163
+
164
+ std::unique_ptr<WriteBatch> new_batch_;
165
+ // Handler is valid upon creation and becomes invalid after its `new_batch_`
166
+ // is transferred.
167
+ bool handler_valid_;
168
+
169
+ // False upon creation, and becomes true if at least one user key from the
170
+ // original batch is updated when creating the new batch.
171
+ bool new_batch_diff_from_orig_batch_;
172
+ };
173
+
174
+ // Mode for checking and handling timestamp size inconsistency encountered in a
175
+ // WriteBatch read from WAL log.
176
+ enum class TimestampSizeConsistencyMode {
177
+ // Verified that the recorded user-defined timestamp size is consistent with
178
+ // the running one for all the column families involved in a WriteBatch.
179
+ // Column families referred to in the WriteBatch but are dropped are ignored.
180
+ kVerifyConsistency,
181
+ // Verified that if any inconsistency exists in a WriteBatch, it's all
182
+ // tolerable by a best-effort reconciliation. And optionally creates a new
183
+ // WriteBatch from the original WriteBatch that is consistent with the running
184
+ // timestamp size. Column families referred to in the WriteBatch but are
185
+ // dropped are ignored. If a new WriteBatch is created, such entries are
186
+ // copied over as is.
187
+ kReconcileInconsistency,
188
+ };
189
+
190
+ // Handles the inconsistency between recorded timestamp sizes and running
191
+ // timestamp sizes for a WriteBatch. A non-OK `status` indicates there is
192
+ // intolerable inconsistency with the specified `check_mode`.
193
+ //
194
+ // If `check_mode` is `kVerifyConsistency`, intolerable inconsistency means any
195
+ // running column family has an inconsistent user-defined timestamp size.
196
+ //
197
+ // If `check_mode` is `kReconcileInconsistency`, intolerable inconsistency means
198
+ // any running column family has an inconsistent user-defined timestamp size
199
+ // that cannot be reconciled with a best-effort recovery. Check
200
+ // `TimestampRecoveryHandler` for what a best-effort recovery is capable of. In
201
+ // this mode, output argument `new_batch` should be set, a new WriteBatch is
202
+ // created on the heap and transferred to `new_batch` if there is tolerable
203
+ // inconsistency.
204
+ //
205
+ // An invariant that WAL logging ensures is that all timestamp size info
206
+ // is logged prior to a WriteBatch that needed this info. And zero timestamp
207
+ // size is skipped. So `record_ts_sz` only contains column family with non-zero
208
+ // timestamp size and a column family id absent from `record_ts_sz` will be
209
+ // interpreted as that column family having zero timestamp size. On the other hand,
210
+ // `running_ts_sz` should contain the timestamp size for all running column
211
+ // families including the ones with zero timestamp size.
212
+ Status HandleWriteBatchTimestampSizeDifference(
213
+ const WriteBatch* batch,
214
+ const UnorderedMap<uint32_t, size_t>& running_ts_sz,
215
+ const UnorderedMap<uint32_t, size_t>& record_ts_sz,
216
+ TimestampSizeConsistencyMode check_mode,
217
+ std::unique_ptr<WriteBatch>* new_batch = nullptr);
218
+
219
+ // This util function is used when opening an existing column family and
220
+ // processing its VersionEdit. It does a sanity check for the column family's
221
+ // old user comparator and the persist_user_defined_timestamps flag as recorded
222
+ // in the VersionEdit, against its new settings from the column family's
223
+ // ImmutableCFOptions.
224
+ //
225
+ // Valid settings changes include:
226
+ // 1) no user comparator change and no effective persist_user_defined_timestamp
227
+ // flag change.
228
+ // 2) switch user comparator to enable user-defined timestamps feature provided
229
+ // the immediately effective persist_user_defined_timestamps flag is false.
230
+ // 3) switch user comparator to disable user-defined timestamps feature provided
231
+ // that the before-change persist_user_defined_timestamps is already false.
232
+ //
233
+ // Switch user comparator to disable/enable UDT is only sanity checked by a user
234
+ // comparator name comparison. The full check includes enforcing the new user
235
+ // comparator ranks user keys exactly the same as the old user comparator and
236
+ // only adds / removes the user-defined timestamp comparison. We don't have ways
237
+ // to strictly enforce this so currently only the RocksDB builtin comparator
238
+ // wrapper `ComparatorWithU64TsImpl` is supported to enable / disable
239
+ // user-defined timestamps. It formats user-defined timestamps as uint64_t.
240
+ //
241
+ // When the settings indicate a legit change to enable user-defined timestamps
242
+ // feature on a column family, `mark_sst_files_has_no_udt` will be set to true
243
+ // to indicate marking all existing SST files as having no user-defined timestamps
244
+ // when re-writing the manifest.
245
+ Status ValidateUserDefinedTimestampsOptions(
246
+ const Comparator* new_comparator, const std::string& old_comparator_name,
247
+ bool new_persist_udt, bool old_persist_udt,
248
+ bool* mark_sst_files_has_no_udt);
77
249
  } // namespace ROCKSDB_NAMESPACE