@nxtedition/rocksdb 15.4.0 → 15.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (402)
  1. package/binding.cc +24 -19
  2. package/cache.js +1 -1
  3. package/chained-batch.js +12 -3
  4. package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
  5. package/deps/rocksdb/rocksdb/BUCK +42 -0
  6. package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
  7. package/deps/rocksdb/rocksdb/Makefile +59 -32
  8. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
  9. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
  10. package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
  11. package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
  12. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
  13. package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
  15. package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
  18. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
  19. package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  25. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
  26. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
  28. package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
  29. package/deps/rocksdb/rocksdb/db/builder.h +7 -0
  30. package/deps/rocksdb/rocksdb/db/c.cc +373 -57
  31. package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
  33. package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
  34. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
  52. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
  53. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  54. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
  55. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
  56. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
  57. package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
  58. package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
  59. package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
  60. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
  61. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
  62. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
  63. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
  64. package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
  65. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
  66. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
  67. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
  68. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
  79. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
  80. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
  81. package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
  82. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
  83. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
  84. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
  85. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
  86. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
  87. package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
  88. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
  89. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
  90. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
  91. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
  92. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
  93. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
  94. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
  95. package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
  96. package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
  97. package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
  98. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
  99. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
  100. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
  101. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
  102. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
  103. package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
  104. package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
  105. package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
  106. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
  107. package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
  108. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
  109. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
  110. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
  111. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
  112. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
  113. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
  114. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
  115. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
  116. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
  117. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  118. package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
  119. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
  120. package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
  121. package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
  122. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
  123. package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
  124. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
  125. package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
  126. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
  127. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
  128. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
  129. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
  130. package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
  131. package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
  132. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
  133. package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
  134. package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
  135. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
  136. package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
  137. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  138. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
  139. package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
  140. package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
  141. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
  142. package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
  143. package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
  144. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
  145. package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
  146. package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
  147. package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
  148. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
  149. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
  150. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
  151. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
  152. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
  153. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
  159. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
  160. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
  161. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
  162. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
  163. package/deps/rocksdb/rocksdb/env/env.cc +1 -0
  164. package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
  165. package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
  166. package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
  167. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  168. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  169. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
  170. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
  171. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
  172. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
  173. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  174. package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
  175. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
  176. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
  177. package/deps/rocksdb/rocksdb/folly.mk +22 -5
  178. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
  179. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
  180. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
  181. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
  182. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
  183. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
  184. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
  185. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
  186. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
  187. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
  188. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
  189. package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
  190. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
  191. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
  192. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
  193. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
  194. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
  195. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
  196. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
  197. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
  198. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
  199. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
  200. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
  202. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
  203. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
  204. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
  205. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
  206. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
  207. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
  208. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
  209. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
  210. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
  211. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
  212. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  213. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
  214. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
  215. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
  216. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
  217. package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
  218. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
  219. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
  220. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
  221. package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
  222. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
  223. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
  224. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
  225. package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
  226. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  227. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
  228. package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
  229. package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
  230. package/deps/rocksdb/rocksdb/options/options.cc +5 -1
  231. package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
  232. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
  233. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
  234. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
  235. package/deps/rocksdb/rocksdb/port/lang.h +4 -0
  236. package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
  237. package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
  238. package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
  239. package/deps/rocksdb/rocksdb/src.mk +12 -0
  240. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
  241. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  242. package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
  243. package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
  244. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
  245. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
  246. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
  247. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
  248. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
  249. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
  250. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
  251. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
  252. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
  253. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
  254. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
  255. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
  256. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
  257. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
  258. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
  259. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
  260. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
  261. package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
  262. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
  263. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
  264. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
  265. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
  266. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
  267. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
  268. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
  269. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
  270. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
  271. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  272. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
  273. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
  274. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
  275. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
  276. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
  277. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
  278. package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
  279. package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
  280. package/deps/rocksdb/rocksdb/table/format.cc +27 -15
  281. package/deps/rocksdb/rocksdb/table/format.h +41 -15
  282. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
  283. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
  284. package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
  285. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
  286. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  287. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
  288. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
  289. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
  290. package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
  291. package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
  292. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
  293. package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
  294. package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
  295. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
  296. package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
  297. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
  298. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
  299. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
  300. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
  301. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
  302. package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
  303. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
  304. package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
  305. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
  306. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
  307. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
  308. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
  309. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
  310. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
  311. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
  312. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
  313. package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
  314. package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
  315. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
  316. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
  317. package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
  318. package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
  319. package/deps/rocksdb/rocksdb/util/coding.h +14 -27
  320. package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
  321. package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
  322. package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
  323. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
  324. package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
  325. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
  326. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
  327. package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
  328. package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
  329. package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
  330. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
  331. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
  332. package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
  333. package/deps/rocksdb/rocksdb/util/math.h +3 -1
  334. package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
  335. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
  336. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
  337. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
  338. package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
  339. package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
  340. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
  341. package/deps/rocksdb/rocksdb/util/status.cc +3 -1
  342. package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
  343. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
  344. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
  345. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
  346. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
  347. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
  348. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
  349. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
  350. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
  351. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
  352. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
  353. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
  354. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
  355. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
  356. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
  357. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
  358. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
  359. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
  360. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  361. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
  362. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
  363. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
  364. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
  365. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
  366. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
  367. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
  368. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
  369. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
  370. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
  371. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
  372. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
  373. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
  374. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
  375. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
  376. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
  377. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
  378. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
  379. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
  380. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
  381. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
  382. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
  383. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
  384. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
  385. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
  386. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
  387. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
  388. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
  389. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
  390. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
  391. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  392. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
  393. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
  394. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
  395. package/deps/rocksdb/rocksdb.gyp +7 -0
  396. package/index.js +11 -2
  397. package/iterator.js +15 -7
  398. package/package.json +1 -1
  399. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  400. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  401. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
  402. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
@@ -674,7 +674,7 @@ class PosixFileSystem : public FileSystem {
674
674
 
675
675
  IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
676
676
  uint64_t* size, IODebugContext* /*dbg*/) override {
677
- struct stat sbuf {};
677
+ struct stat sbuf{};
678
678
  if (stat(fname.c_str(), &sbuf) != 0) {
679
679
  *size = 0;
680
680
  return IOError("while stat a file for size", fname, errno);
@@ -981,7 +981,7 @@ class PosixFileSystem : public FileSystem {
981
981
  // file size. However this API only works on opened file.
982
982
  IOStatus GetFileSizeOnOpenedFile(const int fd, const std::string& name,
983
983
  uint64_t* size) {
984
- struct stat sb {};
984
+ struct stat sb{};
985
985
  *size = 0;
986
986
  // Get file information using fstat
987
987
  if (fstat(fd, &sb) == -1) {
@@ -1129,25 +1129,7 @@ class PosixFileSystem : public FileSystem {
1129
1129
  // Reset cqe data to catch any stray reuse of it
1130
1130
  static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
1131
1131
 
1132
- FSReadRequest req;
1133
- req.scratch = posix_handle->scratch;
1134
- req.offset = posix_handle->offset;
1135
- req.len = posix_handle->len;
1136
-
1137
- size_t finished_len = 0;
1138
- size_t bytes_read = 0;
1139
- bool read_again = false;
1140
- UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len,
1141
- true /*async_read*/, posix_handle->use_direct_io,
1142
- posix_handle->alignment, finished_len, &req, bytes_read,
1143
- read_again);
1144
- posix_handle->is_finished = true;
1145
- io_uring_cqe_seen(iu, cqe);
1146
- posix_handle->cb(req, posix_handle->cb_arg);
1147
-
1148
- (void)finished_len;
1149
- (void)bytes_read;
1150
- (void)read_again;
1132
+ FinalizeAsyncRead(iu, cqe, posix_handle);
1151
1133
 
1152
1134
  if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
1153
1135
  break;
@@ -1188,6 +1170,11 @@ class PosixFileSystem : public FileSystem {
1188
1170
  return IOStatus::IOError("");
1189
1171
  }
1190
1172
 
1173
+ // Mark this handle as being aborted. This is used when processing
1174
+ // completions to distinguish between aborted handles (expect 2
1175
+ // completions: original + cancel) and non-aborted handles (expect 1).
1176
+ posix_handle->is_being_aborted = true;
1177
+
1191
1178
  // Prepare the cancel request.
1192
1179
  struct io_uring_sqe* sqe;
1193
1180
  sqe = io_uring_get_sqe(iu);
@@ -1234,6 +1221,14 @@ class PosixFileSystem : public FileSystem {
1234
1221
  }
1235
1222
  posix_handle->req_count++;
1236
1223
 
1224
+ if (!posix_handle->is_being_aborted) {
1225
+ // This is a completion for a handle NOT being aborted.
1226
+ // It only has 1 outstanding request (the original read), so we
1227
+ // should finalize it now.
1228
+ FinalizeAsyncRead(iu, cqe, posix_handle);
1229
+ continue;
1230
+ }
1231
+
1237
1232
  // Reset cqe data to catch any stray reuse of it
1238
1233
  static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
1239
1234
  io_uring_cqe_seen(iu, cqe);
@@ -1247,16 +1242,23 @@ class PosixFileSystem : public FileSystem {
1247
1242
  // - And finally, if the request to cancel wasn't
1248
1243
  // found, the cancel request is completed with -ENOENT.
1249
1244
  //
1250
- // Every handle has to wait for 2 requests completion: original one and
1251
- // the cancel request which is tracked by PosixHandle::req_count.
1252
- if (posix_handle->req_count == 2 &&
1253
- static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
1245
+ // Every handle being aborted has to wait for 2 requests completion:
1246
+ // original one and the cancel request which is tracked by
1247
+ // PosixHandle::req_count.
1248
+ // Note: We must mark is_finished and invoke the callback for ANY handle
1249
+ // that reaches req_count == 2, not just the one we're currently waiting
1250
+ // for (io_handles[i]). Otherwise, if completions arrive out of order,
1251
+ // we consume another handle's completions without marking it finished,
1252
+ // causing an infinite hang when we later wait for that handle.
1253
+ if (posix_handle->req_count == 2) {
1254
1254
  posix_handle->is_finished = true;
1255
1255
  FSReadRequest req;
1256
1256
  req.status = IOStatus::Aborted();
1257
1257
  posix_handle->cb(req, posix_handle->cb_arg);
1258
1258
 
1259
- break;
1259
+ if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
1260
+ break;
1261
+ }
1260
1262
  }
1261
1263
  }
1262
1264
  }
@@ -1272,7 +1274,7 @@ class PosixFileSystem : public FileSystem {
1272
1274
  void SupportedOps(int64_t& supported_ops) override {
1273
1275
  supported_ops = 0;
1274
1276
  #if defined(ROCKSDB_IOURING_PRESENT)
1275
- if (IsIOUringEnabled()) {
1277
+ if (IsIOUringEnabled() && thread_local_async_read_io_urings_) {
1276
1278
  // Underlying FS supports async_io
1277
1279
  supported_ops |= (1 << FSSupportedOps::kAsyncIO);
1278
1280
  }
@@ -1338,9 +1340,8 @@ PosixFileSystem::PosixFileSystem()
1338
1340
  page_size_(getpagesize()),
1339
1341
  allow_non_owner_access_(true) {
1340
1342
  #if defined(ROCKSDB_IOURING_PRESENT)
1341
- // Test whether IOUring is supported, and if it does, create a managing
1342
- // object for thread local point so that in the future thread-local
1343
- // io_uring can be created.
1343
+ // Test whether IOUring is supported with the same flags that ReadAsync and
1344
+ // MultiRead will use at runtime.
1344
1345
  struct io_uring* new_io_uring = CreateIOUring();
1345
1346
  if (new_io_uring != nullptr) {
1346
1347
  thread_local_async_read_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
@@ -610,7 +610,7 @@ PosixRandomAccessFile::PosixRandomAccessFile(
610
610
  PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
611
611
 
612
612
  IOStatus PosixRandomAccessFile::GetFileSize(uint64_t* result) {
613
- struct stat sbuf {};
613
+ struct stat sbuf{};
614
614
  if (fstat(fd_, &sbuf) != 0) {
615
615
  *result = 0;
616
616
  return IOError("While fstat with fd " + std::to_string(fd_), filename_,
@@ -755,10 +755,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
755
755
  iu = static_cast<struct io_uring*>(
756
756
  thread_local_multi_read_io_urings_->Get());
757
757
  if (iu == nullptr) {
758
- unsigned int flags = 0;
759
- flags |= IORING_SETUP_SINGLE_ISSUER;
760
- flags |= IORING_SETUP_DEFER_TASKRUN;
761
- iu = CreateIOUring(flags);
758
+ iu = CreateIOUring();
762
759
  if (iu != nullptr) {
763
760
  thread_local_multi_read_io_urings_->Reset(iu);
764
761
  }
@@ -1090,10 +1087,7 @@ IOStatus PosixRandomAccessFile::ReadAsync(
1090
1087
  iu = static_cast<struct io_uring*>(
1091
1088
  thread_local_async_read_io_urings_->Get());
1092
1089
  if (iu == nullptr) {
1093
- unsigned int flags = 0;
1094
- flags |= IORING_SETUP_SINGLE_ISSUER;
1095
- flags |= IORING_SETUP_DEFER_TASKRUN;
1096
- iu = CreateIOUring(flags);
1090
+ iu = CreateIOUring();
1097
1091
  if (iu != nullptr) {
1098
1092
  thread_local_async_read_io_urings_->Reset(iu);
1099
1093
  }
@@ -1946,7 +1940,10 @@ IOStatus PosixDirectory::FsyncWithDirOptions(
1946
1940
  assert(fd_ >= 0); // Check use after close
1947
1941
  IOStatus s = IOStatus::OK();
1948
1942
  #ifndef OS_AIX
1949
- if (is_btrfs_) {
1943
+ bool test_is_btrfs = is_btrfs_;
1944
+ TEST_SYNC_POINT_CALLBACK("PosixDirectory::FsyncWithDirOptions:ForceBtrfs",
1945
+ &test_is_btrfs);
1946
+ if (test_is_btrfs) {
1950
1947
  // skip dir fsync for new file creation, which is not needed for btrfs
1951
1948
  if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) {
1952
1949
  return s;
@@ -1965,7 +1962,7 @@ IOStatus PosixDirectory::FsyncWithDirOptions(
1965
1962
  } else if (fsync(fd) < 0) {
1966
1963
  s = IOError("While fsync renaming file", new_name, errno);
1967
1964
  }
1968
- if (close(fd) < 0) {
1965
+ if (fd >= 0 && close(fd) < 0) {
1969
1966
  s = IOError("While closing file after fsync", new_name, errno);
1970
1967
  }
1971
1968
  return s;
@@ -127,6 +127,7 @@ struct Posix_IOHandle {
127
127
  use_direct_io(_use_direct_io),
128
128
  alignment(_alignment),
129
129
  is_finished(false),
130
+ is_being_aborted(false),
130
131
  req_count(0) {}
131
132
 
132
133
  struct iovec iov;
@@ -139,6 +140,10 @@ struct Posix_IOHandle {
139
140
  bool use_direct_io;
140
141
  size_t alignment;
141
142
  bool is_finished;
143
+ // is_being_aborted is set by AbortIO when a cancel request is submitted.
144
+ // Used to distinguish between aborted handles (expect 2 completions) and
145
+ // non-aborted handles (expect 1 completion) when processing completions.
146
+ bool is_being_aborted;
142
147
  // req_count is used by AbortIO API to keep track of number of requests.
143
148
  uint32_t req_count;
144
149
  };
@@ -197,6 +202,27 @@ inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name,
197
202
  (void)len;
198
203
  #endif
199
204
  }
205
+
206
+ // Finalize a completed async read request.
207
+ // Processes the CQE result, marks the handle as finished, and invokes the
208
+ // callback. This is shared between Poll and AbortIO (for non-aborted handles).
209
+ inline void FinalizeAsyncRead(struct io_uring* iu, struct io_uring_cqe* cqe,
210
+ Posix_IOHandle* posix_handle) {
211
+ FSReadRequest req;
212
+ req.scratch = posix_handle->scratch;
213
+ req.offset = posix_handle->offset;
214
+ req.len = posix_handle->len;
215
+
216
+ size_t finished_len = 0;
217
+ size_t bytes_read = 0;
218
+ bool read_again = false;
219
+ UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len, true /*async_read*/,
220
+ posix_handle->use_direct_io, posix_handle->alignment,
221
+ finished_len, &req, bytes_read, read_again);
222
+ posix_handle->is_finished = true;
223
+ io_uring_cqe_seen(iu, cqe);
224
+ posix_handle->cb(req, posix_handle->cb_arg);
225
+ }
200
226
  #endif
201
227
 
202
228
  #ifdef OS_LINUX
@@ -307,8 +333,11 @@ inline void DeleteIOUring(void* p) {
307
333
  delete iu;
308
334
  }
309
335
 
310
- inline struct io_uring* CreateIOUring(unsigned int flags = 0) {
336
+ inline struct io_uring* CreateIOUring() {
311
337
  struct io_uring* new_io_uring = new struct io_uring;
338
+ unsigned int flags = 0;
339
+ flags |= IORING_SETUP_SINGLE_ISSUER;
340
+ flags |= IORING_SETUP_DEFER_TASKRUN;
312
341
  int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, flags);
313
342
  if (ret) {
314
343
  delete new_io_uring;
@@ -3,11 +3,13 @@
3
3
  // COPYING file in the root directory) and Apache 2.0 License
4
4
  // (found in the LICENSE.Apache file in the root directory).
5
5
 
6
+ #include "test_util/sync_point.h"
6
7
  #include "test_util/testharness.h"
7
8
  #include "util/random.h"
8
9
 
9
10
  #ifdef ROCKSDB_LIB_IO_POSIX
10
11
  #include "env/io_posix.h"
12
+ #include "rocksdb/file_system.h"
11
13
 
12
14
  namespace ROCKSDB_NAMESPACE {
13
15
 
@@ -174,6 +176,47 @@ TEST_F(PosixWritableFileTest, SeekAfterExtend) {
174
176
  ASSERT_OK(fs->DeleteFile(path, IOOptions(), nullptr));
175
177
  }
176
178
 
179
+ #ifdef OS_LINUX
180
+ class PosixDirectoryTest : public testing::Test {};
181
+
182
+ TEST_F(PosixDirectoryTest, BtrfsFsyncFailedOpenDoesNotCloseInvalidFd) {
183
+ // When FsyncWithDirOptions is called with kFileRenamed on a btrfs filesystem
184
+ // and open() fails, close(-1) should not be called. Without the fix,
185
+ // close(-1) is called which is POSIX undefined behavior and overwrites the
186
+ // meaningful open error with a misleading "While closing file after fsync".
187
+ std::shared_ptr<FileSystem> fs = FileSystem::Default();
188
+ std::string dir_path =
189
+ test::PerThreadDBPath("PosixDirectoryTest_BtrfsFsyncFailedOpen");
190
+ ASSERT_OK(fs->CreateDirIfMissing(dir_path, IOOptions(), nullptr));
191
+
192
+ std::unique_ptr<FSDirectory> dir;
193
+ ASSERT_OK(fs->NewDirectory(dir_path, IOOptions(), &dir, nullptr));
194
+
195
+ // Force the btrfs code path via sync point
196
+ SyncPoint::GetInstance()->SetCallBack(
197
+ "PosixDirectory::FsyncWithDirOptions:ForceBtrfs",
198
+ [](void* arg) { *static_cast<bool*>(arg) = true; });
199
+ SyncPoint::GetInstance()->EnableProcessing();
200
+
201
+ // Call FsyncWithDirOptions with a non-existent file for rename sync.
202
+ // open() will fail since the file doesn't exist.
203
+ DirFsyncOptions opts(std::string(dir_path + "/nonexistent_file"));
204
+ IOStatus s = dir->FsyncWithDirOptions(IOOptions(), nullptr, opts);
205
+
206
+ // Should get an error about open failing, NOT about closing
207
+ ASSERT_TRUE(s.IsIOError());
208
+ // The error message should mention "open", not "closing"
209
+ ASSERT_TRUE(s.ToString().find("open") != std::string::npos);
210
+ ASSERT_TRUE(s.ToString().find("closing") == std::string::npos);
211
+
212
+ SyncPoint::GetInstance()->DisableProcessing();
213
+ SyncPoint::GetInstance()->ClearAllCallBacks();
214
+
215
+ ASSERT_OK(dir->Close(IOOptions(), nullptr));
216
+ ASSERT_OK(fs->DeleteDir(dir_path, IOOptions(), nullptr));
217
+ }
218
+ #endif
219
+
177
220
  } // namespace ROCKSDB_NAMESPACE
178
221
  #endif
179
222
 
@@ -351,7 +351,7 @@ void DeleteScheduler::BackgroundEmptyTrash() {
351
351
  auto iter = pending_files_in_buckets_.find(bucket.value());
352
352
  assert(iter != pending_files_in_buckets_.end());
353
353
  if (iter != pending_files_in_buckets_.end()) {
354
- pending_files_in_bucket = iter->second--;
354
+ pending_files_in_bucket = --iter->second;
355
355
  }
356
356
  }
357
357
  }
@@ -7,6 +7,7 @@
7
7
 
8
8
  #include <atomic>
9
9
  #include <cinttypes>
10
+ #include <future>
10
11
  #include <thread>
11
12
  #include <vector>
12
13
 
@@ -856,6 +857,113 @@ TEST_F(DeleteSchedulerTest,
856
857
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
857
858
  }
858
859
 
860
+ TEST_F(DeleteSchedulerTest, BucketSignalOnSingleFileCompletion) {
861
+ // Test that WaitForEmptyTrashBucket wakes up correctly when a single file
862
+ // in a bucket is deleted. This requires the pre-decrement fix: with
863
+ // post-decrement, the pending count is checked before it reaches zero,
864
+ // so the signal is never fired for the bucket, causing a hang.
865
+ //
866
+ // Strategy:
867
+ // 1. Block the background thread inside DeleteTrashFile for file 0 (mu_ is
868
+ // NOT held at this point, since it's unlocked before calling
869
+ // DeleteTrashFile).
870
+ // 2. Start WaitForEmptyTrashBucket(bucket0) on another thread. Since mu_ is
871
+ // free, it acquires mu_, sees bucket0's pending count == 1, enters
872
+ // cv_.Wait() (which releases mu_).
873
+ // 3. Unblock file 0's deletion. The background thread completes
874
+ // DeleteTrashFile, re-acquires mu_, decrements the bucket counter. With
875
+ // the post-decrement bug, pending_files_in_bucket gets 1 (old value),
876
+ // so cv_.SignalAll() is NOT called. The wait thread stays stuck.
877
+ // 4. Also block file 1 inside DeleteTrashFile to keep pending_files_ > 0,
878
+ // preventing the global pending_files_ == 0 check from triggering a
879
+ // signal.
880
+ // 5. Assert the wait thread times out (bug) or completes (fix).
881
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / s
882
+ NewDeleteScheduler();
883
+
884
+ std::atomic<int> delete_calls{0};
885
+ std::atomic<bool> unblock_file0{false};
886
+ std::atomic<bool> unblock_file1{false};
887
+
888
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
889
+ "DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* /*arg*/) {
890
+ int n = delete_calls.fetch_add(1) + 1;
891
+ if (n == 1) {
892
+ // Block file 0's deletion until we've set up the wait thread
893
+ while (!unblock_file0.load()) {
894
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
895
+ }
896
+ } else if (n == 2) {
897
+ // Block file 1's deletion to keep pending_files_ > 0
898
+ while (!unblock_file1.load()) {
899
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
900
+ }
901
+ }
902
+ });
903
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
904
+
905
+ // Bucket 0: single file
906
+ std::optional<int32_t> bucket0 = delete_scheduler_->NewTrashBucket();
907
+ ASSERT_TRUE(bucket0.has_value());
908
+ std::string file0 =
909
+ NewDummyFile("bucket_signal_file0.data", 1024, 0, /*track=*/false);
910
+ ASSERT_OK(delete_scheduler_->DeleteUnaccountedFile(file0, "",
911
+ /*force_bg=*/false,
912
+ bucket0));
913
+
914
+ // Bucket 1: single file (keeps pending_files_ > 0 when bucket 0 empties)
915
+ std::optional<int32_t> bucket1 = delete_scheduler_->NewTrashBucket();
916
+ ASSERT_TRUE(bucket1.has_value());
917
+ std::string file1 =
918
+ NewDummyFile("bucket_signal_file1.data", 1024, 0, /*track=*/false);
919
+ ASSERT_OK(delete_scheduler_->DeleteUnaccountedFile(file1, "",
920
+ /*force_bg=*/false,
921
+ bucket1));
922
+
923
+ // Wait for the background thread to reach the DeleteFile syncpoint for
924
+ // file 0. At this point mu_ is NOT held by the background thread.
925
+ while (delete_calls.load() < 1) {
926
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
927
+ }
928
+
929
+ // Start waiting for bucket 0 on another thread. Since the background thread
930
+ // doesn't hold mu_, this thread acquires mu_, sees iter->second == 1,
931
+ // and enters cv_.Wait() (releasing mu_).
932
+ std::atomic<bool> wait_done{false};
933
+ auto wait_future = std::async(std::launch::async, [&]() {
934
+ delete_scheduler_->WaitForEmptyTrashBucket(bucket0.value());
935
+ wait_done.store(true);
936
+ });
937
+
938
+ // Give the wait thread time to enter cv_.Wait(). We need it to be blocked
939
+ // in cv_.Wait() before unblocking the background thread.
940
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
941
+
942
+ // Unblock file 0's deletion. The background thread will:
943
+ // - Complete DeleteTrashFile, re-acquire mu_
944
+ // - Decrement pending_files_ (2 -> 1) and bucket0 counter (1 -> 0)
945
+ // - With bug (post-decrement): pending_files_in_bucket = 1, no signal fired
946
+ // - With fix (pre-decrement): pending_files_in_bucket = 0, signal fired
947
+ // Then it moves to file 1 and blocks at the second DeleteFile syncpoint.
948
+ unblock_file0.store(true);
949
+
950
+ // Check if WaitForEmptyTrashBucket returns within 5 seconds.
951
+ // With the bug: no signal fired, wait thread stays stuck -> timeout
952
+ // With the fix: signal fired, wait thread wakes up -> completes quickly
953
+ auto status = wait_future.wait_for(std::chrono::seconds(5));
954
+ ASSERT_EQ(status, std::future_status::ready)
955
+ << "WaitForEmptyTrashBucket(bucket0) timed out - bucket signal not fired";
956
+
957
+ // Unblock file 1 and wait for bucket 1
958
+ unblock_file1.store(true);
959
+ delete_scheduler_->WaitForEmptyTrashBucket(bucket1.value());
960
+
961
+ ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
962
+ ASSERT_GE(delete_calls.load(), 2);
963
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
964
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
965
+ }
966
+
859
967
  } // namespace ROCKSDB_NAMESPACE
860
968
 
861
969
  int main(int argc, char** argv) {
@@ -160,6 +160,18 @@ Status FilePrefetchBuffer::ReadAsync(BufferInfo* buf, const IOOptions& opts,
160
160
  RecordTick(stats_, PREFETCH_BYTES, read_len);
161
161
  }
162
162
  buf->async_read_in_progress_ = true;
163
+ } else if (s.IsNotSupported()) {
164
+ // Async IO is not available (e.g., io_uring failed to initialize).
165
+ // Fall back to synchronous read so the buffer is populated inline
166
+ // and callers proceed transparently.
167
+ s = reader->Read(opts, start_offset, read_len, &result,
168
+ buf->buffer_.BufferStart(), /*aligned_buf=*/nullptr);
169
+ if (s.ok()) {
170
+ buf->buffer_.Size(buf->CurrentSize() + result.size());
171
+ if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
172
+ RecordTick(stats_, PREFETCH_BYTES, read_len);
173
+ }
174
+ }
163
175
  }
164
176
  return s;
165
177
  }
@@ -351,7 +363,7 @@ void FilePrefetchBuffer::ClearOutdatedData(uint64_t offset, size_t length) {
351
363
  assert(IsBufferQueueEmpty() || buf->IsOffsetInBuffer(offset));
352
364
  }
353
365
 
354
- void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
366
+ Status FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
355
367
  BufferInfo* buf = GetFirstBuffer();
356
368
 
357
369
  if (buf->async_read_in_progress_ && fs_ != nullptr) {
@@ -362,7 +374,16 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
362
374
  std::vector<void*> handles;
363
375
  handles.emplace_back(buf->io_handle_);
364
376
  StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
365
- fs_->Poll(handles, 1).PermitUncheckedError();
377
+ IOStatus io_s = fs_->Poll(handles, 1);
378
+ // Allow tests to inject Poll errors
379
+ TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::PollIfNeeded:IOStatus",
380
+ &io_s);
381
+ if (!io_s.ok()) {
382
+ // On Poll failure, clean up the handle and abort.
383
+ // DestroyAndClearIOHandle also sets async_read_in_progress_ to false.
384
+ DestroyAndClearIOHandle(buf);
385
+ return io_s;
386
+ }
366
387
  }
367
388
 
368
389
  // Reset and Release io_handle after the Poll API as request has been
@@ -373,6 +394,7 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
373
394
  // Always call outdated data after Poll as Buffers might be out of sync w.r.t
374
395
  // offset and length.
375
396
  ClearOutdatedData(offset, length);
397
+ return Status::OK();
376
398
  }
377
399
 
378
400
  // ReadAheadSizeTuning API calls readaheadsize_cb_
@@ -511,7 +533,10 @@ Status FilePrefetchBuffer::HandleOverlappingAsyncData(
511
533
  // by Seek, but the next access is at another offset.
512
534
  if (buf->async_read_in_progress_ &&
513
535
  buf->IsOffsetInBufferWithAsyncProgress(offset)) {
514
- PollIfNeeded(offset, length);
536
+ Status poll_status = PollIfNeeded(offset, length);
537
+ if (!poll_status.ok()) {
538
+ return poll_status;
539
+ }
515
540
  }
516
541
 
517
542
  if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) {
@@ -646,7 +671,10 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
646
671
  return s;
647
672
  }
648
673
  } else {
649
- PollIfNeeded(tmp_offset, tmp_length);
674
+ Status poll_status = PollIfNeeded(tmp_offset, tmp_length);
675
+ if (!poll_status.ok()) {
676
+ return poll_status;
677
+ }
650
678
  }
651
679
 
652
680
  AllocateBufferIfEmpty();
@@ -93,8 +93,8 @@ struct BufferInfo {
93
93
  //
94
94
  // For example - if end offset of previous buffer was 100 and because of
95
95
  // readahead_size optimization, end_offset was trimmed to 60. Then for next
96
- // prefetch call, start_offset should be intialized to 100 i.e start_offset =
97
- // buf->initial_end_offset_.
96
+ // prefetch call, start_offset should be initialized to 100 i.e start_offset
97
+ // = buf->initial_end_offset_.
98
98
  uint64_t initial_end_offset_ = 0;
99
99
 
100
100
  bool IsDataBlockInBuffer(uint64_t offset, size_t length) {
@@ -155,7 +155,7 @@ enum class FilePrefetchBufferUsage {
155
155
  // When reusing the file system allocated buffer, overlap_buf_ is used if the
156
156
  // main buffer only contains part of the requested data. It is returned to
157
157
  // the caller after the remaining data is fetched.
158
- // If num_buffers_ > 1, then the data is prefetched asynchronosuly in the
158
+ // If num_buffers_ > 1, then the data is prefetched asynchronously in the
159
159
  // buffers whenever the data is consumed from the buffers and that buffer is
160
160
  // freed.
161
161
  // If num_buffers > 1, then requested data can be overlapping between 2 buffers.
@@ -431,7 +431,7 @@ class FilePrefetchBuffer {
431
431
  void ClearOutdatedData(uint64_t offset, size_t len);
432
432
 
433
433
  // It calls Poll API to check for any pending asynchronous request.
434
- void PollIfNeeded(uint64_t offset, size_t len);
434
+ Status PollIfNeeded(uint64_t offset, size_t len);
435
435
 
436
436
  Status PrefetchInternal(const IOOptions& opts, RandomAccessFileReader* reader,
437
437
  uint64_t offset, size_t length, size_t readahead_size,
@@ -178,7 +178,8 @@ IOStatus GenerateOneFileChecksum(
178
178
  std::string* file_checksum_func_name,
179
179
  size_t verify_checksums_readahead_size, bool /*allow_mmap_reads*/,
180
180
  std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
181
- const ReadOptions& read_options, Statistics* stats, SystemClock* clock) {
181
+ const ReadOptions& read_options, Statistics* stats, SystemClock* clock,
182
+ const FileOptions& file_options) {
182
183
  if (checksum_factory == nullptr) {
183
184
  return IOStatus::InvalidArgument("Checksum factory is invalid");
184
185
  }
@@ -218,7 +219,12 @@ IOStatus GenerateOneFileChecksum(
218
219
  std::unique_ptr<RandomAccessFileReader> reader;
219
220
  {
220
221
  std::unique_ptr<FSRandomAccessFile> r_file;
221
- io_s = fs->NewRandomAccessFile(file_path, FileOptions(), &r_file, nullptr);
222
+ FileOptions fopts = file_options;
223
+ if (fopts.file_checksum.empty()) {
224
+ // No expected checksum is known — this is a from-scratch computation.
225
+ fopts.file_checksum_func_name = kNoFileChecksumFuncName;
226
+ }
227
+ io_s = fs->NewRandomAccessFile(file_path, fopts, &r_file, nullptr);
222
228
  if (!io_s.ok()) {
223
229
  return io_s;
224
230
  }
@@ -83,7 +83,8 @@ IOStatus GenerateOneFileChecksum(
83
83
  std::string* file_checksum_func_name,
84
84
  size_t verify_checksums_readahead_size, bool allow_mmap_reads,
85
85
  std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
86
- const ReadOptions& read_options, Statistics* stats, SystemClock* clock);
86
+ const ReadOptions& read_options, Statistics* stats, SystemClock* clock,
87
+ const FileOptions& file_options);
87
88
 
88
89
  inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
89
90
  SystemClock* clock, IOOptions& opts,