@nxtedition/rocksdb 15.4.0 → 15.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (402)
  1. package/binding.cc +24 -19
  2. package/cache.js +1 -1
  3. package/chained-batch.js +12 -3
  4. package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
  5. package/deps/rocksdb/rocksdb/BUCK +42 -0
  6. package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
  7. package/deps/rocksdb/rocksdb/Makefile +59 -32
  8. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
  9. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
  10. package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
  11. package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
  12. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
  13. package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
  15. package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
  18. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
  19. package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  25. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
  26. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
  28. package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
  29. package/deps/rocksdb/rocksdb/db/builder.h +7 -0
  30. package/deps/rocksdb/rocksdb/db/c.cc +373 -57
  31. package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
  33. package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
  34. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
  52. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
  53. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  54. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
  55. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
  56. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
  57. package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
  58. package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
  59. package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
  60. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
  61. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
  62. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
  63. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
  64. package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
  65. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
  66. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
  67. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
  68. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
  79. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
  80. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
  81. package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
  82. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
  83. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
  84. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
  85. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
  86. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
  87. package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
  88. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
  89. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
  90. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
  91. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
  92. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
  93. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
  94. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
  95. package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
  96. package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
  97. package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
  98. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
  99. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
  100. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
  101. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
  102. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
  103. package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
  104. package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
  105. package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
  106. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
  107. package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
  108. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
  109. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
  110. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
  111. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
  112. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
  113. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
  114. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
  115. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
  116. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
  117. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  118. package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
  119. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
  120. package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
  121. package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
  122. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
  123. package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
  124. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
  125. package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
  126. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
  127. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
  128. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
  129. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
  130. package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
  131. package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
  132. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
  133. package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
  134. package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
  135. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
  136. package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
  137. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  138. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
  139. package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
  140. package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
  141. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
  142. package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
  143. package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
  144. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
  145. package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
  146. package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
  147. package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
  148. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
  149. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
  150. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
  151. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
  152. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
  153. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
  159. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
  160. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
  161. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
  162. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
  163. package/deps/rocksdb/rocksdb/env/env.cc +1 -0
  164. package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
  165. package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
  166. package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
  167. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  168. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  169. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
  170. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
  171. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
  172. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
  173. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  174. package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
  175. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
  176. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
  177. package/deps/rocksdb/rocksdb/folly.mk +22 -5
  178. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
  179. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
  180. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
  181. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
  182. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
  183. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
  184. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
  185. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
  186. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
  187. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
  188. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
  189. package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
  190. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
  191. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
  192. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
  193. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
  194. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
  195. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
  196. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
  197. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
  198. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
  199. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
  200. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
  202. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
  203. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
  204. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
  205. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
  206. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
  207. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
  208. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
  209. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
  210. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
  211. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
  212. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  213. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
  214. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
  215. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
  216. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
  217. package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
  218. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
  219. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
  220. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
  221. package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
  222. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
  223. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
  224. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
  225. package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
  226. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  227. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
  228. package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
  229. package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
  230. package/deps/rocksdb/rocksdb/options/options.cc +5 -1
  231. package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
  232. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
  233. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
  234. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
  235. package/deps/rocksdb/rocksdb/port/lang.h +4 -0
  236. package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
  237. package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
  238. package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
  239. package/deps/rocksdb/rocksdb/src.mk +12 -0
  240. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
  241. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  242. package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
  243. package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
  244. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
  245. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
  246. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
  247. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
  248. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
  249. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
  250. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
  251. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
  252. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
  253. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
  254. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
  255. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
  256. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
  257. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
  258. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
  259. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
  260. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
  261. package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
  262. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
  263. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
  264. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
  265. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
  266. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
  267. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
  268. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
  269. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
  270. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
  271. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  272. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
  273. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
  274. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
  275. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
  276. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
  277. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
  278. package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
  279. package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
  280. package/deps/rocksdb/rocksdb/table/format.cc +27 -15
  281. package/deps/rocksdb/rocksdb/table/format.h +41 -15
  282. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
  283. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
  284. package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
  285. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
  286. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  287. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
  288. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
  289. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
  290. package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
  291. package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
  292. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
  293. package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
  294. package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
  295. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
  296. package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
  297. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
  298. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
  299. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
  300. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
  301. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
  302. package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
  303. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
  304. package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
  305. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
  306. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
  307. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
  308. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
  309. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
  310. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
  311. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
  312. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
  313. package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
  314. package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
  315. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
  316. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
  317. package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
  318. package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
  319. package/deps/rocksdb/rocksdb/util/coding.h +14 -27
  320. package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
  321. package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
  322. package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
  323. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
  324. package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
  325. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
  326. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
  327. package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
  328. package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
  329. package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
  330. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
  331. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
  332. package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
  333. package/deps/rocksdb/rocksdb/util/math.h +3 -1
  334. package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
  335. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
  336. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
  337. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
  338. package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
  339. package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
  340. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
  341. package/deps/rocksdb/rocksdb/util/status.cc +3 -1
  342. package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
  343. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
  344. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
  345. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
  346. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
  347. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
  348. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
  349. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
  350. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
  351. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
  352. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
  353. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
  354. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
  355. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
  356. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
  357. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
  358. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
  359. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
  360. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  361. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
  362. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
  363. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
  364. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
  365. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
  366. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
  367. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
  368. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
  369. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
  370. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
  371. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
  372. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
  373. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
  374. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
  375. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
  376. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
  377. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
  378. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
  379. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
  380. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
  381. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
  382. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
  383. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
  384. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
  385. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
  386. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
  387. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
  388. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
  389. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
  390. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
  391. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  392. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
  393. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
  394. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
  395. package/deps/rocksdb/rocksdb.gyp +7 -0
  396. package/index.js +11 -2
  397. package/iterator.js +15 -7
  398. package/package.json +1 -1
  399. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  400. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  401. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
  402. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
@@ -73,7 +73,7 @@ Status OptimisticTransactionDB::Open(
73
73
  std::vector<ColumnFamilyHandle*>* handles,
74
74
  OptimisticTransactionDB** dbptr) {
75
75
  Status s;
76
- DB* db;
76
+ std::unique_ptr<DB> db;
77
77
 
78
78
  std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
79
79
 
@@ -91,7 +91,7 @@ Status OptimisticTransactionDB::Open(
91
91
  s = DB::Open(db_options, dbname, column_families_copy, handles, &db);
92
92
 
93
93
  if (s.ok()) {
94
- *dbptr = new OptimisticTransactionDBImpl(db, occ_options);
94
+ *dbptr = new OptimisticTransactionDBImpl(std::move(db), occ_options);
95
95
  }
96
96
 
97
97
  return s;
@@ -44,10 +44,9 @@ class OccLockBucketsImpl : public OccLockBucketsImplBase {
44
44
  class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
45
45
  public:
46
46
  explicit OptimisticTransactionDBImpl(
47
- DB* db, const OptimisticTransactionDBOptions& occ_options,
48
- bool take_ownership = true)
49
- : OptimisticTransactionDB(db),
50
- db_owner_(take_ownership),
47
+ std::unique_ptr<DB>&& db,
48
+ const OptimisticTransactionDBOptions& occ_options)
49
+ : OptimisticTransactionDB(std::move(db)),
51
50
  validate_policy_(occ_options.validate_policy) {
52
51
  if (validate_policy_ == OccValidationPolicy::kValidateParallel) {
53
52
  auto bucketed_locks = occ_options.shared_lock_buckets;
@@ -60,13 +59,7 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
60
59
  }
61
60
  }
62
61
 
63
- ~OptimisticTransactionDBImpl() {
64
- // Prevent this stackable from destroying
65
- // base db
66
- if (!db_owner_) {
67
- db_ = nullptr;
68
- }
69
- }
62
+ ~OptimisticTransactionDBImpl() override = default;
70
63
 
71
64
  Transaction* BeginTransaction(const WriteOptions& write_options,
72
65
  const OptimisticTransactionOptions& txn_options,
@@ -97,8 +90,6 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
97
90
  private:
98
91
  std::shared_ptr<OccLockBucketsImplBase> bucketed_locks_;
99
92
 
100
- bool db_owner_;
101
-
102
93
  const OccValidationPolicy validate_policy_;
103
94
 
104
95
  void ReinitializeTransaction(Transaction* txn,
@@ -9005,7 +9005,7 @@ class CommitBypassMemtableTest
9005
9005
  txn_db_opts.use_per_key_point_lock_mgr = std::get<1>(GetParam());
9006
9006
  ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
9007
9007
  ASSERT_NE(txn_db, nullptr);
9008
- db_ = txn_db;
9008
+ db_.reset(txn_db);
9009
9009
  }
9010
9010
  };
9011
9011
 
@@ -9453,9 +9453,9 @@ TEST_P(CommitBypassMemtableTest, Recovery) {
9453
9453
  VerifyDBFromMap(expected);
9454
9454
 
9455
9455
  ASSERT_OK(txn_db->Close());
9456
- delete txn_db;
9456
+ db_.reset(); // destroys txn_db (owned by db_)
9457
9457
  ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
9458
- db_ = txn_db;
9458
+ db_.reset(txn_db);
9459
9459
 
9460
9460
  VerifyDBFromMap(expected);
9461
9461
  }
@@ -82,6 +82,12 @@ class TransactionTestBase : public ::testing::Test {
82
82
  txn_db_options.write_policy = write_policy;
83
83
  txn_db_options.rollback_merge_operands = true;
84
84
  txn_db_options.use_per_key_point_lock_mgr = use_per_key_point_lock_mgr;
85
+ // Reduce commit cache size from the default 2^23 (64MB) to 2^13 (64KB).
86
+ // The default is sized for production workloads but makes TSAN builds
87
+ // very slow because value-initializing 8M atomics triggers __tsan_memset,
88
+ // which updates shadow memory for every 8-byte cell. Tests that need
89
+ // specific cache sizes (e.g., for wrapping/eviction) override this.
90
+ txn_db_options.wp_commit_cache_bits = 13;
85
91
  // This will stress write unprepared, by forcing write batch flush on every
86
92
  // write.
87
93
  txn_db_options.default_write_batch_flush_threshold = 1;
@@ -0,0 +1,431 @@
1
+ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2
+ // This source code is licensed under both the GPLv2 (found in the
3
+ // COPYING file in the root directory) and Apache 2.0 License
4
+ // (found in the LICENSE.Apache file in the root directory).
5
+
6
+ // Test to verify that sequence numbers remain consistent during error recovery
7
+ // with WritePrepared TransactionDB and two_write_queues=true.
8
+ //
9
+ // The fix: SyncLastSequenceWithAllocated() is called during ResumeImpl to
10
+ // ensure that allocated-but-not-published sequence numbers are accounted for
11
+ // before creating new memtables/WALs, preventing "sequence number going
12
+ // backwards" corruption on subsequent recovery.
13
+
14
+ #include <atomic>
15
+ #include <memory>
16
+ #include <string>
17
+
18
+ #include "db/db_impl/db_impl.h"
19
+ #include "db/db_test_util.h"
20
+ #include "db/version_set.h"
21
+ #include "port/stack_trace.h"
22
+ #include "rocksdb/utilities/transaction_db.h"
23
+ #include "test_util/sync_point.h"
24
+ #include "test_util/testharness.h"
25
+ #include "test_util/testutil.h"
26
+ #include "utilities/fault_injection_fs.h"
27
+
28
+ namespace ROCKSDB_NAMESPACE {
29
+
30
+ class WritePreparedTransactionSeqnoTest : public ::testing::Test {
31
+ public:
32
+ WritePreparedTransactionSeqnoTest()
33
+ : db_(nullptr),
34
+ special_env_(Env::Default()),
35
+ fault_fs_(new FaultInjectionTestFS(FileSystem::Default())),
36
+ env_(new CompositeEnvWrapper(&special_env_, fault_fs_)) {
37
+ options_.create_if_missing = true;
38
+ options_.max_write_buffer_number = 2;
39
+ options_.write_buffer_size = 4 * 1024;
40
+ options_.level0_file_num_compaction_trigger = 2;
41
+ options_.env = env_.get();
42
+ // Use two_write_queues which is typical for WritePrepared
43
+ options_.two_write_queues = true;
44
+ // Enable auto recovery from retryable errors
45
+ options_.max_bgerror_resume_count = 2;
46
+ options_.bgerror_resume_retry_interval = 100000; // 100ms
47
+
48
+ dbname_ = test::PerThreadDBPath("write_prepared_seqno_test");
49
+ EXPECT_OK(DestroyDB(dbname_, options_));
50
+
51
+ txn_db_options_.transaction_lock_timeout = 0;
52
+ txn_db_options_.default_lock_timeout = 0;
53
+ txn_db_options_.write_policy = TxnDBWritePolicy::WRITE_PREPARED;
54
+ }
55
+
56
+ ~WritePreparedTransactionSeqnoTest() {
57
+ SyncPoint::GetInstance()->DisableProcessing();
58
+ SyncPoint::GetInstance()->ClearAllCallBacks();
59
+ if (db_) {
60
+ for (auto h : handles_) {
61
+ if (h) {
62
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
63
+ }
64
+ }
65
+ handles_.clear();
66
+ delete db_;
67
+ db_ = nullptr;
68
+ }
69
+ }
70
+
71
+ Status Open() {
72
+ return TransactionDB::Open(options_, txn_db_options_, dbname_, &db_);
73
+ }
74
+
75
+ void Close() {
76
+ for (auto h : handles_) {
77
+ if (h) {
78
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
79
+ }
80
+ }
81
+ handles_.clear();
82
+ delete db_;
83
+ db_ = nullptr;
84
+ }
85
+
86
+ DBImpl* dbimpl() { return static_cast_with_check<DBImpl>(db_->GetRootDB()); }
87
+
88
+ protected:
89
+ TransactionDB* db_;
90
+ SpecialEnv special_env_;
91
+ std::shared_ptr<FaultInjectionTestFS> fault_fs_;
92
+ std::unique_ptr<Env> env_;
93
+ std::string dbname_;
94
+ Options options_;
95
+ TransactionDBOptions txn_db_options_;
96
+ std::vector<ColumnFamilyHandle*> handles_;
97
+ };
98
+
99
+ // Regression test: verify that after error recovery with two_write_queues,
100
+ // the DB can be closed and reopened without sequence number corruption.
101
+ TEST_F(WritePreparedTransactionSeqnoTest,
102
+ SeqnoGoesBackwardsDuringErrorRecovery) {
103
+ ASSERT_OK(Open());
104
+
105
+ // Write some initial data and flush to establish baseline
106
+ WriteOptions write_opts;
107
+ TransactionOptions txn_opts;
108
+ for (int i = 0; i < 10; i++) {
109
+ Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
110
+ ASSERT_NE(txn, nullptr);
111
+ ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
112
+ ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
113
+ ASSERT_OK(txn->Prepare());
114
+ ASSERT_OK(txn->Commit());
115
+ delete txn;
116
+ }
117
+ ASSERT_OK(db_->Flush(FlushOptions()));
118
+
119
+ // Write more data - these will allocate sequence numbers
120
+ for (int i = 10; i < 20; i++) {
121
+ Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
122
+ ASSERT_NE(txn, nullptr);
123
+ ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
124
+ ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
125
+ ASSERT_OK(txn->Prepare());
126
+ ASSERT_OK(txn->Commit());
127
+ delete txn;
128
+ }
129
+
130
+ // Set up sync point dependency chain for deterministic recovery
131
+ // synchronization, following the pattern from
132
+ // ManifestWriteRetryableErrorAutoRecover in error_handler_fs_test.cc.
133
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
134
+ {{"RecoverFromRetryableBGIOError:BeforeStart",
135
+ "SeqnoGoesBackwardsDuringErrorRecovery:0"},
136
+ {"SeqnoGoesBackwardsDuringErrorRecovery:1",
137
+ "RecoverFromRetryableBGIOError:BeforeWait1"},
138
+ {"RecoverFromRetryableBGIOError:RecoverSuccess",
139
+ "SeqnoGoesBackwardsDuringErrorRecovery:2"}});
140
+
141
+ // Inject a retryable MANIFEST write error on the next flush
142
+ IOStatus error_to_inject = IOStatus::IOError("Injected MANIFEST error");
143
+ error_to_inject.SetRetryable(true);
144
+ SyncPoint::GetInstance()->SetCallBack(
145
+ "VersionSet::LogAndApply:WriteManifest",
146
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_to_inject); });
147
+ SyncPoint::GetInstance()->EnableProcessing();
148
+
149
+ // Trigger a flush that will fail due to MANIFEST write error
150
+ Status s = db_->Flush(FlushOptions());
151
+ ASSERT_NOK(s);
152
+
153
+ // Wait for recovery to start, then re-enable filesystem and let it proceed.
154
+ // Clear the callback first to prevent it from re-disabling the filesystem
155
+ // if recovery's ResumeImpl triggers WriteManifest before we re-enable.
156
+ TEST_SYNC_POINT("SeqnoGoesBackwardsDuringErrorRecovery:0");
157
+ SyncPoint::GetInstance()->ClearCallBack(
158
+ "VersionSet::LogAndApply:WriteManifest");
159
+ fault_fs_->SetFilesystemActive(true);
160
+ TEST_SYNC_POINT("SeqnoGoesBackwardsDuringErrorRecovery:1");
161
+
162
+ // Wait for recovery to complete
163
+ TEST_SYNC_POINT("SeqnoGoesBackwardsDuringErrorRecovery:2");
164
+ SyncPoint::GetInstance()->DisableProcessing();
165
+
166
+ // Write some more data after recovery
167
+ for (int i = 20; i < 30; i++) {
168
+ Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
169
+ ASSERT_NE(txn, nullptr);
170
+ ASSERT_OK(txn->SetName("txn_after_" + std::to_string(i)));
171
+ ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
172
+ ASSERT_OK(txn->Prepare());
173
+ ASSERT_OK(txn->Commit());
174
+ delete txn;
175
+ }
176
+
177
+ // Close and reopen - this would fail with "sequence number going backwards"
178
+ // before the fix.
179
+ Close();
180
+
181
+ Status reopen_s = Open();
182
+ ASSERT_OK(reopen_s);
183
+
184
+ // Verify data integrity
185
+ ReadOptions read_opts;
186
+ for (int i = 0; i < 20; i++) {
187
+ std::string value;
188
+ ASSERT_OK(db_->Get(read_opts, "key" + std::to_string(i), &value));
189
+ ASSERT_EQ(value, "value" + std::to_string(i));
190
+ }
191
+
192
+ Close();
193
+ }
194
+
195
+ // Test that verifies the sequence number discrepancy is resolved by checking
196
+ // that LastSequence >= LastAllocatedSequence after recovery completes.
197
+ TEST_F(WritePreparedTransactionSeqnoTest, SeqnoDiscrepancyDuringErrorRecovery) {
198
+ ASSERT_OK(Open());
199
+
200
+ WriteOptions write_opts;
201
+ TransactionOptions txn_opts;
202
+
203
+ // Write initial data and flush
204
+ for (int i = 0; i < 5; i++) {
205
+ Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
206
+ ASSERT_NE(txn, nullptr);
207
+ ASSERT_OK(txn->SetName("init_txn" + std::to_string(i)));
208
+ ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
209
+ ASSERT_OK(txn->Prepare());
210
+ ASSERT_OK(txn->Commit());
211
+ delete txn;
212
+ }
213
+ ASSERT_OK(db_->Flush(FlushOptions()));
214
+
215
+ // Write more transactions with two_write_queues to potentially create a gap
216
+ // between allocated and published sequence numbers. These must be written
217
+ // before installing the error injection callback, since the small write
218
+ // buffer (4KB) could trigger an automatic flush during these writes.
219
+ for (int i = 5; i < 10; i++) {
220
+ Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
221
+ ASSERT_NE(txn, nullptr);
222
+ ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
223
+ ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
224
+ ASSERT_OK(txn->Prepare());
225
+ ASSERT_OK(txn->Commit());
226
+ delete txn;
227
+ }
228
+
229
+ // Track sequence numbers at key points
230
+ std::atomic<uint64_t> last_seq_after_recovery{0};
231
+ std::atomic<uint64_t> last_allocated_seq_after_recovery{0};
232
+ std::atomic<bool> captured_seqs_after{false};
233
+
234
+ IOStatus error_to_inject = IOStatus::IOError("Injected error");
235
+ error_to_inject.SetRetryable(true);
236
+
237
+ // Set up sync point dependency chain for deterministic synchronization
238
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
239
+ {{"RecoverFromRetryableBGIOError:BeforeStart",
240
+ "SeqnoDiscrepancyDuringErrorRecovery:0"},
241
+ {"SeqnoDiscrepancyDuringErrorRecovery:1",
242
+ "RecoverFromRetryableBGIOError:BeforeWait1"},
243
+ {"RecoverFromRetryableBGIOError:RecoverSuccess",
244
+ "SeqnoDiscrepancyDuringErrorRecovery:2"}});
245
+
246
+ SyncPoint::GetInstance()->SetCallBack(
247
+ "VersionSet::LogAndApply:WriteManifest",
248
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_to_inject); });
249
+
250
+ // Capture sequence numbers after recovery completes to verify the fix
251
+ SyncPoint::GetInstance()->SetCallBack(
252
+ "RecoverFromRetryableBGIOError:RecoverSuccess", [&](void*) {
253
+ DBImpl* db_impl = dbimpl();
254
+ if (db_impl) {
255
+ VersionSet* vs = db_impl->GetVersionSet();
256
+ if (vs) {
257
+ last_seq_after_recovery.store(vs->LastSequence());
258
+ last_allocated_seq_after_recovery.store(
259
+ vs->LastAllocatedSequence());
260
+ captured_seqs_after.store(true);
261
+ }
262
+ }
263
+ });
264
+
265
+ SyncPoint::GetInstance()->EnableProcessing();
266
+
267
+ // Trigger a flush that will fail
268
+ Status flush_s = db_->Flush(FlushOptions());
269
+ ASSERT_NOK(flush_s);
270
+
271
+ // Wait for recovery to start, re-enable filesystem, let it proceed.
272
+ // Clear the callback first to prevent it from re-disabling the filesystem
273
+ // if recovery's ResumeImpl triggers WriteManifest before we re-enable.
274
+ TEST_SYNC_POINT("SeqnoDiscrepancyDuringErrorRecovery:0");
275
+ SyncPoint::GetInstance()->ClearCallBack(
276
+ "VersionSet::LogAndApply:WriteManifest");
277
+ fault_fs_->SetFilesystemActive(true);
278
+ TEST_SYNC_POINT("SeqnoDiscrepancyDuringErrorRecovery:1");
279
+
280
+ // Wait for recovery to complete
281
+ TEST_SYNC_POINT("SeqnoDiscrepancyDuringErrorRecovery:2");
282
+ SyncPoint::GetInstance()->DisableProcessing();
283
+
284
+ // Verify that sequences were captured and are in sync after recovery
285
+ ASSERT_TRUE(captured_seqs_after.load());
286
+ ASSERT_GE(last_seq_after_recovery.load(),
287
+ last_allocated_seq_after_recovery.load())
288
+ << "LastSequence should be >= LastAllocatedSequence after recovery";
289
+
290
+ // Close and reopen should succeed without corruption
291
+ Close();
292
+ ASSERT_OK(Open());
293
+
294
+ // Verify data integrity
295
+ ReadOptions read_opts;
296
+ for (int i = 0; i < 10; i++) {
297
+ std::string value;
298
+ ASSERT_OK(db_->Get(read_opts, "key" + std::to_string(i), &value));
299
+ ASSERT_EQ(value, "value" + std::to_string(i));
300
+ }
301
+
302
+ Close();
303
+ }
304
+
305
+ // Test that verifies SyncLastSequenceWithAllocated is called during ResumeImpl
306
+ // by checking sequence numbers before and after the sync point.
307
+ TEST_F(WritePreparedTransactionSeqnoTest, ConcurrentWritesDuringErrorRecovery) {
308
+ ASSERT_OK(Open());
309
+
310
+ WriteOptions write_opts;
311
+ TransactionOptions txn_opts;
312
+
313
+ // Write initial data and flush
314
+ for (int i = 0; i < 5; i++) {
315
+ Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
316
+ ASSERT_NE(txn, nullptr);
317
+ ASSERT_OK(txn->SetName("init_txn" + std::to_string(i)));
318
+ ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
319
+ ASSERT_OK(txn->Prepare());
320
+ ASSERT_OK(txn->Commit());
321
+ delete txn;
322
+ }
323
+ ASSERT_OK(db_->Flush(FlushOptions()));
324
+
325
+ // Write more transactions. These must be written before installing the error
326
+ // injection callback, since the small write buffer (4KB) could trigger an
327
+ // automatic flush during these writes.
328
+ for (int i = 5; i < 10; i++) {
329
+ Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
330
+ ASSERT_NE(txn, nullptr);
331
+ ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
332
+ ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
333
+ ASSERT_OK(txn->Prepare());
334
+ ASSERT_OK(txn->Commit());
335
+ delete txn;
336
+ }
337
+
338
+ // Track sequence numbers at key points during recovery
339
+ std::atomic<uint64_t> seq_before_resume{0};
340
+ std::atomic<uint64_t> alloc_seq_before_resume{0};
341
+ std::atomic<uint64_t> seq_after_resume{0};
342
+ std::atomic<uint64_t> alloc_seq_after_resume{0};
343
+
344
+ IOStatus error_to_inject = IOStatus::IOError("Injected error");
345
+ error_to_inject.SetRetryable(true);
346
+
347
+ // Set up sync point dependency chain for deterministic synchronization
348
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
349
+ {{"RecoverFromRetryableBGIOError:BeforeStart",
350
+ "ConcurrentWritesDuringErrorRecovery:0"},
351
+ {"ConcurrentWritesDuringErrorRecovery:1",
352
+ "RecoverFromRetryableBGIOError:BeforeWait1"},
353
+ {"RecoverFromRetryableBGIOError:RecoverSuccess",
354
+ "ConcurrentWritesDuringErrorRecovery:2"}});
355
+
356
+ SyncPoint::GetInstance()->SetCallBack(
357
+ "VersionSet::LogAndApply:WriteManifest",
358
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_to_inject); });
359
+
360
+ // Capture sequences right before ResumeImpl runs the sync
361
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::ResumeImpl:Start", [&](void*) {
362
+ DBImpl* db_impl = dbimpl();
363
+ if (db_impl) {
364
+ VersionSet* vs = db_impl->GetVersionSet();
365
+ if (vs) {
366
+ seq_before_resume.store(vs->LastSequence());
367
+ alloc_seq_before_resume.store(vs->LastAllocatedSequence());
368
+ }
369
+ }
370
+ });
371
+
372
+ // Capture sequences right after ResumeImpl syncs them
373
+ SyncPoint::GetInstance()->SetCallBack(
374
+ "DBImpl::ResumeImpl:AfterSyncSeq", [&](void*) {
375
+ DBImpl* db_impl = dbimpl();
376
+ if (db_impl) {
377
+ VersionSet* vs = db_impl->GetVersionSet();
378
+ if (vs) {
379
+ seq_after_resume.store(vs->LastSequence());
380
+ alloc_seq_after_resume.store(vs->LastAllocatedSequence());
381
+ }
382
+ }
383
+ });
384
+
385
+ SyncPoint::GetInstance()->EnableProcessing();
386
+
387
+ // Trigger a flush that will fail
388
+ Status flush_s = db_->Flush(FlushOptions());
389
+ ASSERT_NOK(flush_s);
390
+
391
+ // Wait for recovery to start, re-enable filesystem, let it proceed.
392
+ // Clear the callback first to prevent it from re-disabling the filesystem
393
+ // if recovery's ResumeImpl triggers WriteManifest before we re-enable.
394
+ TEST_SYNC_POINT("ConcurrentWritesDuringErrorRecovery:0");
395
+ SyncPoint::GetInstance()->ClearCallBack(
396
+ "VersionSet::LogAndApply:WriteManifest");
397
+ fault_fs_->SetFilesystemActive(true);
398
+ TEST_SYNC_POINT("ConcurrentWritesDuringErrorRecovery:1");
399
+
400
+ // Wait for recovery to complete
401
+ TEST_SYNC_POINT("ConcurrentWritesDuringErrorRecovery:2");
402
+ SyncPoint::GetInstance()->DisableProcessing();
403
+
404
+ // Verify that the AfterSyncSeq callback fired and sequences are in sync
405
+ ASSERT_GT(seq_after_resume.load(), 0u)
406
+ << "DBImpl::ResumeImpl:AfterSyncSeq callback should have fired";
407
+ ASSERT_EQ(seq_after_resume.load(), alloc_seq_after_resume.load())
408
+ << "Fix should have synced sequences";
409
+
410
+ // Close and reopen
411
+ Close();
412
+ ASSERT_OK(Open());
413
+
414
+ // Verify data integrity
415
+ ReadOptions read_opts;
416
+ for (int i = 0; i < 10; i++) {
417
+ std::string value;
418
+ ASSERT_OK(db_->Get(read_opts, "key" + std::to_string(i), &value));
419
+ ASSERT_EQ(value, "value" + std::to_string(i));
420
+ }
421
+
422
+ Close();
423
+ }
424
+
425
+ } // namespace ROCKSDB_NAMESPACE
426
+
427
+ int main(int argc, char** argv) {
428
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
429
+ ::testing::InitGoogleTest(&argc, argv);
430
+ return RUN_ALL_TESTS();
431
+ }
@@ -196,7 +196,7 @@ TEST(PreparedHeap, Concurrent) {
196
196
  TEST(WriteBatchWithIndex, SubBatchCnt) {
197
197
  ColumnFamilyOptions cf_options;
198
198
  std::string cf_name = "two";
199
- DB* db;
199
+ std::unique_ptr<DB> db;
200
200
  Options options;
201
201
  options.create_if_missing = true;
202
202
  const std::string dbname = test::PerThreadDBPath("transaction_testdb");
@@ -285,7 +285,6 @@ TEST(WriteBatchWithIndex, SubBatchCnt) {
285
285
  }
286
286
 
287
287
  delete cf_handle;
288
- delete db;
289
288
  }
290
289
 
291
290
  TEST(CommitEntry64b, BasicTest) {
@@ -36,6 +36,97 @@ class WritePreparedTxnDB;
36
36
  // committed data from uncommitted data. Uncommitted data could be after the
37
37
  // Prepare phase in 2PC (WritePreparedTxn) or before that
38
38
  // (WriteUnpreparedTxnImpl).
39
+ //
40
+ // == Concrete example: WritePrepared 2PC transaction ==
41
+ //
42
+ // User code:
43
+ //
44
+ // Transaction* txn = db->BeginTransaction(write_opts, txn_opts);
45
+ // txn->SetName("txn1");
46
+ // txn->Put("key1", "value1"); // buffered in WriteBatch; nothing written yet
47
+ // txn->Prepare(); // Phase 1
48
+ // txn->Commit(); // Phase 2
49
+ // -- Phase 1: Prepare (PrepareInternal) --
50
+ //
51
+ // The Prepare call (write_prepared_txn.cc PrepareInternal) calls:
52
+ //
53
+ // db_impl_->WriteImpl(write_options, GetWriteBatch(),
54
+ // ..., !DISABLE_MEMTABLE, ...);
55
+ //
56
+ // !DISABLE_MEMTABLE is false — memtable is enabled. This is the defining
57
+ // characteristic of "WritePrepared": the actual data (Put("key1", "value1"))
58
+ // is written to the memtable at Prepare time.
59
+ //
60
+ // Because disable_memtable == false, the routing check at
61
+ // db_impl_write.cc:502 is not taken. The write goes through the main write
62
+ // queue (write_thread_), which handles both WAL and memtable:
63
+ //
64
+ // Destination | What gets written | Sequence
65
+ // ------------|--------------------------------------------|-----------
66
+ // WAL | Put(key1, value1) + EndPrepare(txn1) | prepare_seq
67
+ // Memtable | Put(key1, value1) | prepare_seq
68
+ //
69
+ // The data is now durable (WAL) and in the memtable, but not yet visible
70
+ // to readers. Readers use GetLastPublishedSequence() which consults a
71
+ // commit map — since prepare_seq is in the PreparedHeap but not yet in the
72
+ // CommitCache, readers know this data is uncommitted and skip it.
73
+ //
74
+ // -- Phase 2: Commit (CommitInternal) --
75
+ //
76
+ // The Commit call (write_prepared_txn.cc CommitInternal) calls:
77
+ //
78
+ // db_impl_->WriteImpl(write_options_, working_batch,
79
+ // ..., disable_memtable, ...);
80
+ //
81
+ // In the typical case (do_one_write == true, i.e., the commit-time batch
82
+ // is empty or has no data), disable_memtable is true. Now the routing
83
+ // check at db_impl_write.cc:502 is taken:
84
+ //
85
+ // if (two_write_queues_ && disable_memtable) {
86
+ // return WriteImplWALOnly(&nonmem_write_thread_, ...);
87
+ // }
88
+ //
89
+ // The commit goes through the second write queue (nonmem_write_thread_),
90
+ // WAL only:
91
+ //
92
+ // Destination | What gets written | Sequence
93
+ // ------------|---------------------|-----------
94
+ // WAL | Commit(txn1) marker | commit_seq
95
+ // Memtable | Nothing | —
96
+ //
97
+ // The PreReleaseCallback (WritePreparedCommitEntryPreReleaseCallback)
98
+ // updates the CommitCache to record that prepare_seq was committed at
99
+ // commit_seq. After this, readers consulting the commit map will see that
100
+ // the data at prepare_seq is committed and therefore visible.
101
+ //
102
+ // -- Why two queues help --
103
+ //
104
+ // The Commit phase doesn't touch the memtable — it only writes a small
105
+ // marker to WAL and updates an in-memory commit map. By routing this
106
+ // through a separate queue, Commit writes don't have to wait behind other
107
+ // transactions' Prepare writes (which do the expensive memtable insertion
108
+ // on the main queue). This is the optimization mentioned in the options
109
+ // comment about MySQL 2PC where commits are serial.
110
+ //
111
+ // -- Sequence number flow --
112
+ //
113
+ // Columns, left to right:
114
+ // last_sequence_ | last_allocated_seq | last_published_seq
115
+ // ---------------|--------------------|-------------------
116
+ // Before Prepare: 9 | 9 | 9
117
+ //
118
+ // Prepare (main queue):
119
+ // FetchAdd alloc seq 9 | 10 | 9
120
+ // Write WAL + memtable
121
+ // SetLastSequence 10 | 10 | 9
122
+ // (published_seq not advanced yet — data is uncommitted)
123
+ //
124
+ // Commit (2nd queue):
125
+ // FetchAdd alloc seq 10 | 11 | 9
126
+ // Write WAL only
127
+ // Update CommitCache
128
+ // SetLastPublishedSeq 10 | 11 | 11
129
+ //
39
130
  class WritePreparedTxn : public PessimisticTransaction {
40
131
  public:
41
132
  WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options,