@nxtedition/rocksdb 15.4.0 → 15.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (402) hide show
  1. package/binding.cc +24 -19
  2. package/cache.js +1 -1
  3. package/chained-batch.js +12 -3
  4. package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
  5. package/deps/rocksdb/rocksdb/BUCK +42 -0
  6. package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
  7. package/deps/rocksdb/rocksdb/Makefile +59 -32
  8. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
  9. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
  10. package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
  11. package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
  12. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
  13. package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
  15. package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
  18. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
  19. package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  25. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
  26. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
  28. package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
  29. package/deps/rocksdb/rocksdb/db/builder.h +7 -0
  30. package/deps/rocksdb/rocksdb/db/c.cc +373 -57
  31. package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
  33. package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
  34. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
  52. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
  53. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  54. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
  55. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
  56. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
  57. package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
  58. package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
  59. package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
  60. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
  61. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
  62. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
  63. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
  64. package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
  65. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
  66. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
  67. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
  68. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
  79. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
  80. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
  81. package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
  82. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
  83. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
  84. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
  85. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
  86. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
  87. package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
  88. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
  89. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
  90. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
  91. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
  92. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
  93. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
  94. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
  95. package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
  96. package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
  97. package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
  98. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
  99. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
  100. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
  101. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
  102. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
  103. package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
  104. package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
  105. package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
  106. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
  107. package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
  108. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
  109. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
  110. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
  111. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
  112. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
  113. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
  114. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
  115. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
  116. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
  117. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  118. package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
  119. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
  120. package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
  121. package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
  122. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
  123. package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
  124. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
  125. package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
  126. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
  127. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
  128. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
  129. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
  130. package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
  131. package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
  132. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
  133. package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
  134. package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
  135. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
  136. package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
  137. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  138. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
  139. package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
  140. package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
  141. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
  142. package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
  143. package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
  144. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
  145. package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
  146. package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
  147. package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
  148. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
  149. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
  150. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
  151. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
  152. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
  153. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
  159. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
  160. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
  161. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
  162. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
  163. package/deps/rocksdb/rocksdb/env/env.cc +1 -0
  164. package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
  165. package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
  166. package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
  167. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  168. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  169. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
  170. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
  171. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
  172. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
  173. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  174. package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
  175. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
  176. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
  177. package/deps/rocksdb/rocksdb/folly.mk +22 -5
  178. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
  179. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
  180. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
  181. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
  182. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
  183. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
  184. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
  185. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
  186. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
  187. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
  188. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
  189. package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
  190. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
  191. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
  192. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
  193. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
  194. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
  195. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
  196. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
  197. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
  198. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
  199. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
  200. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
  202. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
  203. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
  204. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
  205. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
  206. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
  207. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
  208. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
  209. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
  210. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
  211. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
  212. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  213. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
  214. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
  215. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
  216. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
  217. package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
  218. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
  219. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
  220. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
  221. package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
  222. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
  223. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
  224. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
  225. package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
  226. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  227. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
  228. package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
  229. package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
  230. package/deps/rocksdb/rocksdb/options/options.cc +5 -1
  231. package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
  232. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
  233. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
  234. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
  235. package/deps/rocksdb/rocksdb/port/lang.h +4 -0
  236. package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
  237. package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
  238. package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
  239. package/deps/rocksdb/rocksdb/src.mk +12 -0
  240. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
  241. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  242. package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
  243. package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
  244. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
  245. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
  246. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
  247. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
  248. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
  249. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
  250. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
  251. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
  252. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
  253. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
  254. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
  255. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
  256. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
  257. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
  258. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
  259. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
  260. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
  261. package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
  262. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
  263. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
  264. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
  265. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
  266. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
  267. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
  268. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
  269. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
  270. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
  271. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  272. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
  273. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
  274. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
  275. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
  276. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
  277. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
  278. package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
  279. package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
  280. package/deps/rocksdb/rocksdb/table/format.cc +27 -15
  281. package/deps/rocksdb/rocksdb/table/format.h +41 -15
  282. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
  283. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
  284. package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
  285. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
  286. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  287. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
  288. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
  289. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
  290. package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
  291. package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
  292. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
  293. package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
  294. package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
  295. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
  296. package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
  297. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
  298. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
  299. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
  300. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
  301. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
  302. package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
  303. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
  304. package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
  305. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
  306. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
  307. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
  308. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
  309. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
  310. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
  311. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
  312. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
  313. package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
  314. package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
  315. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
  316. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
  317. package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
  318. package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
  319. package/deps/rocksdb/rocksdb/util/coding.h +14 -27
  320. package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
  321. package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
  322. package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
  323. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
  324. package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
  325. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
  326. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
  327. package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
  328. package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
  329. package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
  330. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
  331. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
  332. package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
  333. package/deps/rocksdb/rocksdb/util/math.h +3 -1
  334. package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
  335. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
  336. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
  337. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
  338. package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
  339. package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
  340. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
  341. package/deps/rocksdb/rocksdb/util/status.cc +3 -1
  342. package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
  343. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
  344. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
  345. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
  346. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
  347. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
  348. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
  349. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
  350. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
  351. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
  352. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
  353. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
  354. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
  355. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
  356. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
  357. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
  358. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
  359. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
  360. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  361. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
  362. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
  363. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
  364. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
  365. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
  366. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
  367. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
  368. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
  369. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
  370. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
  371. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
  372. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
  373. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
  374. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
  375. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
  376. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
  377. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
  378. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
  379. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
  380. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
  381. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
  382. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
  383. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
  384. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
  385. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
  386. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
  387. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
  388. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
  389. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
  390. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
  391. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  392. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
  393. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
  394. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
  395. package/deps/rocksdb/rocksdb.gyp +7 -0
  396. package/index.js +11 -2
  397. package/iterator.js +15 -7
  398. package/package.json +1 -1
  399. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  400. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  401. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
  402. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
@@ -0,0 +1,358 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // This source code is licensed under both the GPLv2 (found in the
3
+ // COPYING file in the root directory) and Apache 2.0 License
4
+ // (found in the LICENSE.Apache file in the root directory).
5
+
6
+ #pragma once
7
+
8
+ #include <atomic>
9
+ #include <functional>
10
+ #include <memory>
11
+ #include <unordered_map>
12
+ #include <unordered_set>
13
+ #include <vector>
14
+
15
+ #include "rocksdb/options.h"
16
+ #include "rocksdb/rocksdb_namespace.h"
17
+ #include "rocksdb/status.h"
18
+
19
+ namespace ROCKSDB_NAMESPACE {
20
+
21
+ class FileSystem;
22
+ class Statistics;
23
+
24
+ // Forward declaration for internal implementation
25
+ struct IODispatcherImplData;
26
+ struct PendingPrefetchRequest;
27
+
28
+ // Options for configuring IODispatcher behavior
29
+ struct IODispatcherOptions {
30
+ // Maximum memory (in bytes) for prefetching across all ReadSets.
31
+ // When this limit is reached, blocks that do not fit are queued until
32
+ // memory is released (see Example 2 below). 0 (default) means unlimited.
33
+ size_t max_prefetch_memory_bytes = 0;
34
+
35
+ // Optional statistics for tracking memory limiter metrics
36
+ Statistics* statistics = nullptr;
37
+ };
38
+
39
+ /*
40
+ * IODispatcher is a class that allows users to submit groups of IO jobs to be
41
+ * dispatched asynchronously (or synchronously). Upon submission, the
42
+ * IODispatcher returns a ReadSet which acts as an ownership object of those
43
+ * IOs. Users read from their readset when they require the data, and either
44
+ * poll for completion of the block, or read synchronously if the block is not
45
+ * in cache at that point.
46
+ *
47
+ * ReadSets have RAII semantics, meaning on destruction they will cancel any
48
+ * ongoing IO, and release the underlying pinned blocks.
49
+ *
50
+ * IODispatcher's main goal is to act as a control plane for all readers using
51
+ * the dispatcher, allowing for rate limiting and smarter dispatching policies
52
+ * in the future.
53
+ *
54
+ * Example 1: Basic Usage
55
+ * ----------------------
56
+ * // Submitting an IO job and reading blocks:
57
+ * //
58
+ * // std::shared_ptr<IOJob> job = std::make_shared<IOJob>();
59
+ * // job->table = table_reader; // Provided BlockBasedTable*
60
+ * // job->job_options.io_coalesce_threshold = 32 * 1024;
61
+ * // job->job_options.read_options = read_options; // Provided ReadOptions
62
+ * //
63
+ * // // Populate the job with block handles (e.g., from an index/iterator)
64
+ * // job->block_handles.push_back(handle1);
65
+ * // job->block_handles.push_back(handle2);
66
+ * // job->block_handles.push_back(handle3);
67
+ * //
68
+ * // std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
69
+ * // std::shared_ptr<ReadSet> read_set;
70
+ * // Status s = dispatcher->SubmitJob(job, &read_set);
71
+ * // if (!s.ok()) {
72
+ * // // Handle submit error
73
+ * // }
74
+ * //
75
+ * // // Read by index
76
+ * // for (size_t i = 0; i < job->block_handles.size(); ++i) {
77
+ * // CachableEntry<Block> block_entry;
78
+ * // Status rs = read_set->ReadIndex(i, &block_entry);
79
+ * // if (!rs.ok()) {
80
+ * // // Handle read error
81
+ * // continue;
82
+ * // }
83
+ * // // Use block_entry (block contents are pinned here)
84
+ * // }
85
+ * //
86
+ * // // Or read by byte offset
87
+ * // {
88
+ * // size_t offset =
89
+ * // static_cast<size_t>(job->block_handles.front().offset());
90
+ * // CachableEntry<Block> block_entry;
91
+ * // Status rs = read_set->ReadOffset(offset, &block_entry);
92
+ * // if (rs.ok()) {
93
+ * // // Use block_entry
94
+ * // }
95
+ * // }
96
+ * //
97
+ * // // Stats
98
+ * // uint64_t cache_hits = read_set->GetNumCacheHits();
99
+ * // uint64_t async_reads = read_set->GetNumAsyncReads();
100
+ * // uint64_t sync_reads = read_set->GetNumSyncReads();
101
+ *
102
+ * Example 2: Memory-Limited Prefetching
103
+ * -------------------------------------
104
+ * // Configure a memory budget for prefetching to prevent unbounded memory use.
105
+ * // When the budget is exceeded, IODispatcher uses "partial prefetch":
106
+ * // - Dispatches as many blocks as fit in available memory (earlier first)
107
+ * // - Queues remaining blocks for later dispatch when memory is released
108
+ * // - Never blocks on SubmitJob - remaining blocks are read on-demand
109
+ * //
110
+ * // IODispatcherOptions opts;
111
+ * // opts.max_prefetch_memory_bytes = 64 * 1024 * 1024; // 64MB budget
112
+ * // opts.statistics = db_options.statistics.get(); // Optional metrics
113
+ * //
114
+ * // std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
115
+ * //
116
+ * // // Submit a job that needs more memory than available
117
+ * // // Partial prefetch will dispatch what fits immediately
118
+ * // std::shared_ptr<ReadSet> read_set;
119
+ * // Status s = dispatcher->SubmitJob(job, &read_set); // Never blocks
120
+ * //
121
+ * // // Read blocks in order - earlier blocks are more likely to be prefetched
122
+ * // for (size_t i = 0; i < job->block_handles.size(); ++i) {
123
+ * // CachableEntry<Block> block;
124
+ * // Status rs = read_set->ReadIndex(i, &block);
125
+ * // // Use block...
126
+ * //
127
+ * // // Release block when done to free memory for pending prefetches
128
+ * // read_set->ReleaseBlock(i); // Triggers dispatch of queued blocks
129
+ * // }
130
+ * //
131
+ * // Memory limiting statistics (when statistics is configured):
132
+ * // - PREFETCH_MEMORY_BYTES_GRANTED: Total bytes acquired for prefetching
133
+ * // - PREFETCH_MEMORY_BYTES_RELEASED: Total bytes released after use
134
+ * // - PREFETCH_MEMORY_REQUESTS_BLOCKED: Number of blocks that couldn't be
135
+ * // prefetched immediately due to memory pressure
136
+
137
+ */
138
+
139
+ class BlockHandle;
140
+ struct ReadOptions;
141
+ struct AsyncIOState;
142
+
143
+ template <typename T>
144
+ class CachableEntry;
145
+ class Block;
146
+ class BlockBasedTable;
147
+
148
+ struct JobOptions {
149
+ uint64_t io_coalesce_threshold = 16 * 1024;
150
+ ReadOptions read_options;
151
+ };
152
+
153
+ class IOJob {
154
+ public:
155
+ std::vector<BlockHandle> block_handles;
156
+
157
+ // Table reader for accessing block cache and index
158
+ BlockBasedTable* table = nullptr;
159
+
160
+ // Job execution options
161
+ JobOptions job_options;
162
+ };
163
+
164
+ /*
165
+ * ReadSet represents a set of blocks that may be in cache, being read
166
+ * asynchronously, or need to be read synchronously. ReadIndex() and
167
+ * ReadOffset() transparently handle all three cases.
168
+ */
169
+ class ReadSet {
170
+ public:
171
+ ReadSet() = default;
172
+ ~ReadSet();
173
+
174
+ ReadSet(const ReadSet&) = delete;
175
+ ReadSet& operator=(const ReadSet&) = delete;
176
+ ReadSet(ReadSet&&) noexcept = delete;
177
+ ReadSet& operator=(ReadSet&&) noexcept = delete;
178
+
179
+ // Read a block by index
180
+ // - If the block is in cache, returns it immediately
181
+ // - If the block is being read asynchronously, polls for completion and
182
+ // returns it
183
+ // - If the block needs to be read, performs a synchronous read and returns it
184
+ //
185
+ // block_index: Index into the original IOJob's block_handles vector
186
+ // out: Output parameter for the pinned block entry
187
+ //
188
+ // Returns: Status::OK() on success, error status otherwise
189
+ Status ReadIndex(size_t block_index, CachableEntry<Block>* out);
190
+ // Read a block by offset
191
+ // - If the block is in cache, returns it immediately
192
+ // - If the block is being read asynchronously, polls for completion and
193
+ // returns it
194
+ // - If the block needs to be read, performs a synchronous read and returns it
195
+ //
196
+ // offset: Byte offset into the SST file of the block.
197
+ //
198
+ // out: Output parameter for the pinned block entry
199
+ Status ReadOffset(size_t offset, CachableEntry<Block>* out);
200
+
201
+ // Release a block by index, unpinning it from cache.
202
+ // After this call, ReadIndex() for this block will return an error.
203
+ // This is useful for eager memory reclamation when blocks are no longer
204
+ // needed.
205
+ void ReleaseBlock(size_t block_index);
206
+
207
+ // Check if a block at the given index is still available (not released).
208
+ // Returns true if the block can be read, false otherwise.
209
+ bool IsBlockAvailable(size_t block_index) const;
210
+
211
+ // Statistics accessors
212
+ uint64_t GetNumSyncReads() const { return num_sync_reads_; }
213
+ uint64_t GetNumAsyncReads() const { return num_async_reads_; }
214
+ uint64_t GetNumCacheHits() const { return num_cache_hits_; }
215
+
216
+ private:
217
+ friend class IODispatcherImpl;
218
+
219
+ // Job data
220
+ std::shared_ptr<IOJob> job_;
221
+
222
+ // FileSystem for calling AbortIO in destructor
223
+ std::shared_ptr<FileSystem> fs_;
224
+
225
+ // Storage for pinned blocks (one per block handle in the job)
226
+ std::vector<CachableEntry<Block>> pinned_blocks_;
227
+
228
+ // Sorted index for binary search in ReadOffset.
229
+ // sorted_block_indices_[i] is the original index of the i-th smallest block
230
+ // by offset. Built once during SubmitJob for O(log n) ReadOffset lookups.
231
+ std::vector<size_t> sorted_block_indices_;
232
+
233
+ // Map from block index to async IO state for blocks being read
234
+ // asynchronously. Multiple block indices may map to the same async state when
235
+ // blocks are coalesced into a single IO request.
236
+ std::unordered_map<size_t, std::shared_ptr<AsyncIOState>> async_io_map_;
237
+
238
+ // For memory release notifications back to dispatcher (weak ref to avoid
239
+ // cycles)
240
+ std::weak_ptr<IODispatcherImplData> dispatcher_data_;
241
+
242
+ // Size of each block (parallel to pinned_blocks_) for memory accounting
243
+ std::vector<size_t> block_sizes_;
244
+
245
+ // Statistics counters
246
+ std::atomic<uint64_t> num_sync_reads_ = 0;
247
+ std::atomic<uint64_t> num_async_reads_ = 0;
248
+ std::atomic<uint64_t> num_cache_hits_ = 0;
249
+
250
+ // Poll and process a specific async IO request
251
+ Status PollAndProcessAsyncIO(
252
+ const std::shared_ptr<AsyncIOState>& async_state);
253
+
254
+ // Perform synchronous read for a specific block
255
+ Status SyncRead(size_t block_index);
256
+
257
+ // Remove a block from pending prefetch (called by ReadIndex/ReleaseBlock)
258
+ void RemoveFromPending(size_t block_index);
259
+
260
+ // Atomic flags indicating if block is pending prefetch (lock-free check)
261
+ std::unique_ptr<std::atomic<bool>[]> pending_prefetch_flags_;
262
+ size_t pending_prefetch_flags_size_ = 0;
263
+
264
+ // Reference to pending request (for removal notification)
265
+ std::shared_ptr<PendingPrefetchRequest> pending_request_;
266
+ };
267
+
268
+ /*
269
+ * IODispatcher handles IO operations synchronously or asynchronously based
270
+ * on JobOptions. When async is true, it uses ReadAsync; when false, it uses
271
+ * standard synchronous reads.
272
+ * */
273
+ class IODispatcher {
274
+ protected:
275
+ IODispatcher() = default;
276
+
277
+ public:
278
+ virtual ~IODispatcher() {}
279
+
280
+ IODispatcher(const IODispatcher&) = delete;
281
+ IODispatcher& operator=(const IODispatcher&) = delete;
282
+ IODispatcher(IODispatcher&&) = delete;
283
+ IODispatcher& operator=(IODispatcher&&) = delete;
284
+
285
+ // Submit a job for IO processing
286
+ // job: The IO job to submit
287
+ // read_set: Output parameter that will be populated with the ReadSet on
288
+ // success. Returns: Status::OK() on success, error status otherwise.
289
+ virtual Status SubmitJob(const std::shared_ptr<IOJob>& job,
290
+ std::shared_ptr<ReadSet>* read_set) = 0;
291
+ };
292
+
293
+ // Create IODispatcher with default options (no memory limit)
294
+ IODispatcher* NewIODispatcher();
295
+
296
+ // Create IODispatcher with custom options
297
+ IODispatcher* NewIODispatcher(const IODispatcherOptions& options);
298
+
299
+ // TrackingIODispatcher wraps another IODispatcher and tracks all ReadSets
300
+ // created. This is useful for testing to verify IO statistics.
301
+ class TrackingIODispatcher : public IODispatcher {
302
+ public:
303
+ TrackingIODispatcher() : impl_(NewIODispatcher()) {}
304
+ explicit TrackingIODispatcher(IODispatcher* impl) : impl_(impl) {}
305
+
306
+ Status SubmitJob(const std::shared_ptr<IOJob>& job,
307
+ std::shared_ptr<ReadSet>* read_set) override {
308
+ Status s = impl_->SubmitJob(job, read_set);
309
+ if (s.ok() && read_set && *read_set) {
310
+ read_sets_.push_back(*read_set);
311
+ }
312
+ return s;
313
+ }
314
+
315
+ // Get all ReadSets created by this dispatcher
316
+ const std::vector<std::shared_ptr<ReadSet>>& GetReadSets() const {
317
+ return read_sets_;
318
+ }
319
+
320
+ // Get aggregated statistics from all ReadSets
321
+ uint64_t GetTotalSyncReads() const {
322
+ uint64_t total = 0;
323
+ for (const auto& rs : read_sets_) {
324
+ total += rs->GetNumSyncReads();
325
+ }
326
+ return total;
327
+ }
328
+
329
+ uint64_t GetTotalAsyncReads() const {
330
+ uint64_t total = 0;
331
+ for (const auto& rs : read_sets_) {
332
+ total += rs->GetNumAsyncReads();
333
+ }
334
+ return total;
335
+ }
336
+
337
+ uint64_t GetTotalCacheHits() const {
338
+ uint64_t total = 0;
339
+ for (const auto& rs : read_sets_) {
340
+ total += rs->GetNumCacheHits();
341
+ }
342
+ return total;
343
+ }
344
+
345
+ // Get total IO operations (sum of all types)
346
+ uint64_t GetTotalIOOperations() const {
347
+ return GetTotalSyncReads() + GetTotalAsyncReads() + GetTotalCacheHits();
348
+ }
349
+
350
+ // Clear tracked ReadSets
351
+ void ClearReadSets() { read_sets_.clear(); }
352
+
353
+ private:
354
+ std::unique_ptr<IODispatcher> impl_;
355
+ std::vector<std::shared_ptr<ReadSet>> read_sets_;
356
+ };
357
+
358
+ } // namespace ROCKSDB_NAMESPACE
@@ -38,6 +38,10 @@ struct FileIOByTemperature {
38
38
  uint64_t cold_file_bytes_read;
39
39
  // the number of bytes read to Temperature::kIce file
40
40
  uint64_t ice_file_bytes_read;
41
+ // the number of bytes read to Temperature::kUnknown file not in last level
42
+ uint64_t unknown_non_last_level_bytes_read;
43
+ // the number of bytes read to Temperature::kUnknown file in last level
44
+ uint64_t unknown_last_level_bytes_read;
41
45
  // total number of reads to Temperature::kHot file
42
46
  uint64_t hot_file_read_count;
43
47
  // total number of reads to Temperature::kWarm file
@@ -48,6 +52,11 @@ struct FileIOByTemperature {
48
52
  uint64_t cold_file_read_count;
49
53
  // total number of reads to Temperature::kIce file
50
54
  uint64_t ice_file_read_count;
55
+ // total number of reads to Temperature::kUnknown file not in last level
56
+ uint64_t unknown_non_last_level_read_count;
57
+ // total number of reads to Temperature::kUnknown file in last level
58
+ uint64_t unknown_last_level_read_count;
59
+
51
60
  // reset all the statistics to 0.
52
61
  void Reset() {
53
62
  hot_file_bytes_read = 0;
@@ -55,11 +64,15 @@ struct FileIOByTemperature {
55
64
  cool_file_bytes_read = 0;
56
65
  cold_file_bytes_read = 0;
57
66
  ice_file_bytes_read = 0;
67
+ unknown_non_last_level_bytes_read = 0;
68
+ unknown_last_level_bytes_read = 0;
58
69
  hot_file_read_count = 0;
59
70
  warm_file_read_count = 0;
60
71
  cool_file_read_count = 0;
61
72
  cold_file_read_count = 0;
62
73
  ice_file_read_count = 0;
74
+ unknown_non_last_level_read_count = 0;
75
+ unknown_last_level_read_count = 0;
63
76
  }
64
77
  };
65
78
 
@@ -201,6 +201,7 @@ enum class BackgroundErrorReason {
201
201
  kManifestWrite,
202
202
  kFlushNoWAL,
203
203
  kManifestWriteNoWAL,
204
+ kAsyncFileOpen,
204
205
  };
205
206
 
206
207
  struct WriteStallInfo {
@@ -488,6 +489,9 @@ struct CompactionJobInfo {
488
489
  // Information about blob files deleted during compaction in Integrated
489
490
  // BlobDB.
490
491
  std::vector<BlobFileGarbageInfo> blob_file_garbage_infos;
492
+
493
+ // Whether this compaction was aborted via AbortAllCompactions()
494
+ bool aborted = false;
491
495
  };
492
496
 
493
497
  struct MemTableInfo {
@@ -550,6 +554,34 @@ struct IOErrorInfo {
550
554
  uint64_t offset;
551
555
  };
552
556
 
557
+ // EXPERIMENTAL — under active development, fields may change.
558
+ // Point-in-time snapshot of background job pressure for one DB: how busy
559
+ // compaction and flush are, and how close the DB is to write-stalling.
560
+ struct BackgroundJobPressure {
561
+ // Compaction scheduling (LOW + BOTTOM priority combined)
562
+ int compaction_scheduled = 0;
563
+ int compaction_running = 0;
564
+
565
+ // Per-priority compaction breakdown
566
+ int compaction_low_scheduled = 0;
567
+ int compaction_low_running = 0;
568
+ int compaction_bottom_scheduled = 0;
569
+ int compaction_bottom_running = 0;
570
+
571
+ // Flush scheduling
572
+ int flush_scheduled = 0;
573
+ int flush_running = 0;
574
+
575
+ // How close the DB is to a write stall, as a percentage (0 = healthy,
576
+ // 100 = at stall threshold). Can exceed 100 when already stalling.
577
+ // Max across all column families based on write-stall triggers.
578
+ int write_stall_proximity_pct = 0;
579
+ // Whether RocksDB has activated compaction speedup due to write pressure
580
+ bool compaction_speedup_active = false;
581
+
582
+ bool operator==(const BackgroundJobPressure&) const = default;
583
+ };
584
+
553
585
  // EventListener class contains a set of callback functions that will
554
586
  // be called when specific RocksDB event happens such as flush. It can
555
587
  // be used as a building block for developing custom features such as
@@ -866,6 +898,17 @@ class EventListener : public Customizable {
866
898
  // happens. ShouldBeNotifiedOnFileIO should be set to true to get a callback.
867
899
  virtual void OnIOError(const IOErrorInfo& /*info*/) {}
868
900
 
901
+ // EXPERIMENTAL
902
+ // Called after a flush or compaction background job completes, providing a
903
+ // snapshot of current background job scheduling pressure and write-stall
904
+ // proximity. Fires on the background thread that completed the job, without
905
+ // holding db_mutex_. This callback fires on every completion, even if
906
+ // pressure values have not changed from the previous call.
907
+ // Implementations should not run for an extended period of time before
908
+ // returning, as this blocks RocksDB background work.
909
+ virtual void OnBackgroundJobPressureChanged(
910
+ DB* /*db*/, const BackgroundJobPressure& /*pressure*/) {}
911
+
869
912
  ~EventListener() override {}
870
913
  };
871
914
 
@@ -211,6 +211,26 @@ class MemTableRep {
211
211
  return Status::NotSupported("GetAndValidate() not implemented.");
212
212
  }
213
213
 
214
+ // Batch lookup of multiple sorted keys. For each key, finds the first
215
+ // matching entry and calls callback_func with callback_args[i] and the
216
+ // entry. Continues calling for subsequent entries until callback_func
217
+ // returns false.
218
+ //
219
+ // Keys must be memtable-encoded and in non-decreasing order. Implementations
220
+ // may exploit the sorted key order for more efficient lookups.
221
+ //
222
+ // When detect_key_out_of_order is true, validates key ordering during
223
+ // traversal and returns Corruption if out-of-order keys are found.
224
+ // When key_validation_callback is non-null, calls it on each visited node.
225
+ //
226
+ // Default implementation calls Get() per key via an iterator.
227
+ virtual Status MultiGet(
228
+ size_t num_keys, const char* const* keys, void** callback_args,
229
+ bool (*callback_func)(void* arg, const char* entry),
230
+ bool allow_data_in_errors = false, bool detect_key_out_of_order = false,
231
+ const std::function<Status(const char*, bool)>& key_validation_callback =
232
+ nullptr);
233
+
214
234
  virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
215
235
  const Slice& /*end_key*/) {
216
236
  return 0;
@@ -58,6 +58,7 @@ class InternalKeyComparator;
58
58
  class WalFilter;
59
59
  class FileSystem;
60
60
  class UserDefinedIndexFactory;
61
+ class IODispatcher;
61
62
 
62
63
  struct Options;
63
64
  struct DbPath;
@@ -304,9 +305,6 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
304
305
  // Dynamically changeable through SetOptions() API
305
306
  uint64_t max_bytes_for_level_base = 256 * 1048576;
306
307
 
307
- // Deprecated.
308
- uint64_t snap_refresh_nanos = 0;
309
-
310
308
  // Disable automatic compactions. Manual compactions can still
311
309
  // be issued on this column family
312
310
  //
@@ -785,6 +783,25 @@ struct DBOptions {
785
783
  // Default: 16
786
784
  int max_file_opening_threads = 16;
787
785
 
786
+ // If true, SST files are opened and validated asynchronously in the
787
+ // background after DB::Open returns. This reduces DB open time for
788
+ // databases with many SST files and high latency file systems. Mostly useful
789
+ // when max_open_files = -1, as max_open_files != -1 usually has fast open
790
+ // times. See also `max_file_opening_threads` and
791
+ // `skip_stats_update_on_db_open` to improve file open latency.
792
+ //
793
+ // Note: This option is currently not compatible with FIFO compaction and
794
+ // requires skip_stats_update_on_db_open=true.
795
+ //
796
+ // Errors will no longer show up in DB::Open, but instead can show up as
797
+ // either background errors and/or operations that access the file (e.g.
798
+ // reads, compactions).
799
+ //
800
+ // When false (default), SST files are opened and validated during DB::Open.
801
+ //
802
+ // Default: false
803
+ bool open_files_async = false;
804
+
788
805
  // Once write-ahead logs exceed this size, we will start forcing the flush of
789
806
  // column families whose memtables are backed by the oldest live WAL file
790
807
  // (i.e. the ones that are causing all the space amplification). If set to 0
@@ -977,6 +994,13 @@ struct DBOptions {
977
994
  // manifest write (e.g. completed DB compaction or flush).
978
995
  uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
979
996
 
997
+ // If true, on DB close, read back the entire MANIFEST file and validate
998
+ // CRC checksums and logical record content. If corruption is detected,
999
+ // a fresh MANIFEST is written from in-memory state before closing.
1000
+ //
1001
+ // This option is mutable with SetDBOptions().
1002
+ bool verify_manifest_content_on_close = false;
1003
+
980
1004
  // This option mostly replaces max_manifest_file_size to control an auto-tuned
981
1005
  // balance of manifest write amplification and space amplification. A new
982
1006
  // manifest file is created with the "compacted" contents of the old one when
@@ -1362,17 +1386,6 @@ struct DBOptions {
1362
1386
  // Default: false
1363
1387
  bool skip_stats_update_on_db_open = false;
1364
1388
 
1365
- // This option is deprecated and marked as no-op. Kept for backward
1366
- // compatibility until usage is fully removed.
1367
- // File size check will be performed through a thread
1368
- // pool during DB Open, when max_open_files is set to -1.
1369
- // Therefore, the concern of DB Open slowness is eliminated.
1370
- // Note that when max_open_files is not set to -1, only a subset of files will
1371
- // be opened and checked during DB Open.
1372
- //
1373
- // Default: false
1374
- bool skip_checking_sst_file_sizes_on_db_open = false;
1375
-
1376
1389
  // Recovery mode to control the consistency while replaying WAL
1377
1390
  // Default: kPointInTimeRecovery
1378
1391
  WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
@@ -1405,9 +1418,30 @@ struct DBOptions {
1405
1418
  // WAL logs will be kept, so that if crash happened before flush, we still
1406
1419
  // have logs to recover from.
1407
1420
  //
1421
+ // Note: when `enforce_write_buffer_manager_during_recovery` is also enabled,
1422
+ // flushes may still occur during recovery to respect the
1423
+ // WriteBufferManager's global memory limit, even if this option is true.
1424
+ // Once any such WBM-triggered flush happens, all remaining memtables will
1425
+ // also be flushed at the end of recovery (similar to the behavior when this
1426
+ // option is false).
1427
+ //
1408
1428
  // DEFAULT: false
1409
1429
  bool avoid_flush_during_recovery = false;
1410
1430
 
1431
+ // If true and a WriteBufferManager is configured, RocksDB will check
1432
+ // WriteBufferManager::ShouldFlush() during WAL recovery and schedule
1433
+ // flushes when needed. This prevents OOM when multiple RocksDB instances
1434
+ // share a WriteBufferManager and one instance is recovering from WAL.
1435
+ //
1436
+ // When triggered, all column families with non-empty memtables are scheduled
1437
+ // for flush, which may produce smaller L0 files in some column families.
1438
+ // This also overrides `avoid_flush_during_recovery`: once a WBM-triggered
1439
+ // flush occurs mid-recovery, all remaining non-empty memtables will be
1440
+ // flushed at the end of recovery as well.
1441
+ //
1442
+ // DEFAULT: true
1443
+ bool enforce_write_buffer_manager_during_recovery = true;
1444
+
1411
1445
  // By default RocksDB will flush all memtables on DB close if there are
1412
1446
  // unpersisted data (i.e. with WAL disabled). The flush can be skipped to speed up
1413
1447
  // DB close. Unpersisted data WILL BE LOST.
@@ -1847,11 +1881,13 @@ class MultiScanArgs {
1847
1881
  io_coalesce_threshold = other.io_coalesce_threshold;
1848
1882
  max_prefetch_size = other.max_prefetch_size;
1849
1883
  use_async_io = other.use_async_io;
1884
+ io_dispatcher = other.io_dispatcher;
1850
1885
  }
1851
1886
  MultiScanArgs(MultiScanArgs&& other) noexcept
1852
1887
  : io_coalesce_threshold(other.io_coalesce_threshold),
1853
1888
  max_prefetch_size(other.max_prefetch_size),
1854
1889
  use_async_io(other.use_async_io),
1890
+ io_dispatcher(std::move(other.io_dispatcher)),
1855
1891
  comp_(other.comp_),
1856
1892
  original_ranges_(std::move(other.original_ranges_)) {}
1857
1893
 
@@ -1861,6 +1897,7 @@ class MultiScanArgs {
1861
1897
  io_coalesce_threshold = other.io_coalesce_threshold;
1862
1898
  max_prefetch_size = other.max_prefetch_size;
1863
1899
  use_async_io = other.use_async_io;
1900
+ io_dispatcher = other.io_dispatcher;
1864
1901
  return *this;
1865
1902
  }
1866
1903
 
@@ -1871,6 +1908,7 @@ class MultiScanArgs {
1871
1908
  io_coalesce_threshold = other.io_coalesce_threshold;
1872
1909
  max_prefetch_size = other.max_prefetch_size;
1873
1910
  use_async_io = other.use_async_io;
1911
+ io_dispatcher = std::move(other.io_dispatcher);
1874
1912
  }
1875
1913
  return *this;
1876
1914
  }
@@ -1918,6 +1956,7 @@ class MultiScanArgs {
1918
1956
  io_coalesce_threshold = other.io_coalesce_threshold;
1919
1957
  max_prefetch_size = other.max_prefetch_size;
1920
1958
  use_async_io = other.use_async_io;
1959
+ io_dispatcher = other.io_dispatcher;
1921
1960
  }
1922
1961
 
1923
1962
  uint64_t io_coalesce_threshold = 16 << 10; // 16KB by default
@@ -1939,6 +1978,12 @@ class MultiScanArgs {
1939
1978
  // When false, it will use synchronous MultiRead().
1940
1979
  bool use_async_io = false;
1941
1980
 
1981
+ // Optional IODispatcher for multi-scan operations.
1982
+ // If nullptr (default), a new IODispatcher is created internally.
1983
+ // Users can provide their own IODispatcher for custom IO scheduling
1984
+ // or for testing/monitoring purposes (e.g., to check IO statistics).
1985
+ std::shared_ptr<IODispatcher> io_dispatcher = nullptr;
1986
+
1942
1987
  private:
1943
1988
  // The comparator used for ordering ranges
1944
1989
  const Comparator* comp_;
@@ -2108,10 +2153,6 @@ struct ReadOptions {
2108
2153
  // that were inserted into the database after the creation of the iterator.
2109
2154
  bool tailing = false;
2110
2155
 
2111
- // This options is not used anymore. It was to turn on a functionality that
2112
- // has been removed. DEPRECATED
2113
- bool managed = false;
2114
-
2115
2156
  // Enable a total order seek regardless of index format (e.g. hash index)
2116
2157
  // used in the table. Some table format (e.g. plain table) may not support
2117
2158
  // this option.
@@ -2241,9 +2282,10 @@ struct ReadOptions {
2241
2282
  // block based table index. The table_factory used for the column family
2242
2283
  // must support building/reading this index.
2243
2284
  //
2244
- // Currently, only forward scans are supported. For forward scans, only Seek()
2245
- // is supported. SeekToFirst() is not supported. If the caller wishes to scan
2246
- // from start to end, the native index must be used.
2285
+ // Forward scans (SeekToFirst, Seek, Next) and point lookups (Get) are
2286
+ // supported. Reverse operations (SeekToLast, SeekForPrev, Prev) are not
2287
+ // yet supported and will return NotSupported when this is set. Leave this
2288
+ // null to use the native index for reverse operations.
2247
2289
  const UserDefinedIndexFactory* table_index_factory = nullptr;
2248
2290
 
2249
2291
  // *** END options only relevant to iterators or scans ***