@nxtedition/rocksdb 15.4.1 → 16.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (401) hide show
  1. package/binding.cc +70 -23
  2. package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
  3. package/deps/rocksdb/rocksdb/BUCK +42 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
  5. package/deps/rocksdb/rocksdb/Makefile +59 -32
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
  8. package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
  9. package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
  10. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
  11. package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
  12. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
  14. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
  17. package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
  24. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
  25. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
  26. package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
  27. package/deps/rocksdb/rocksdb/db/builder.h +7 -0
  28. package/deps/rocksdb/rocksdb/db/c.cc +373 -57
  29. package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
  30. package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
  31. package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
  32. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
  51. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  52. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
  53. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
  54. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
  55. package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
  56. package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
  57. package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
  58. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
  59. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
  60. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
  61. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
  62. package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
  63. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
  64. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
  65. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
  66. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
  79. package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
  80. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
  81. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
  82. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
  83. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
  84. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
  85. package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
  86. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
  87. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
  88. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
  89. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
  90. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
  91. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
  92. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
  93. package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
  94. package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
  95. package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
  96. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
  97. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
  98. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
  99. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
  100. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
  101. package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
  102. package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
  103. package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
  104. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
  105. package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
  106. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
  107. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
  108. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
  109. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
  110. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
  111. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
  112. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
  113. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
  114. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
  115. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  116. package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
  117. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
  118. package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
  119. package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
  120. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
  121. package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
  122. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
  123. package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
  124. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
  125. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
  126. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
  127. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
  128. package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
  129. package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
  130. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
  131. package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
  132. package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
  133. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
  134. package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
  135. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  136. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
  137. package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
  138. package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
  139. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
  140. package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
  141. package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
  142. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
  143. package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
  144. package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
  145. package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
  146. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
  147. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
  148. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
  149. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
  150. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
  151. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
  152. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
  153. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
  159. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
  160. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
  161. package/deps/rocksdb/rocksdb/env/env.cc +1 -0
  162. package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
  163. package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
  164. package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
  165. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  166. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  167. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
  168. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
  169. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
  170. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
  171. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  172. package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
  173. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
  174. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
  175. package/deps/rocksdb/rocksdb/folly.mk +22 -5
  176. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
  177. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
  178. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
  179. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
  180. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
  181. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
  182. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
  183. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
  184. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
  185. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
  186. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
  187. package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
  188. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
  189. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
  190. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
  191. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
  192. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
  193. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
  194. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
  195. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
  196. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
  197. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
  198. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
  199. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
  200. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
  202. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
  203. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
  204. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
  205. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
  206. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
  207. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
  208. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
  209. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
  210. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  211. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
  212. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
  213. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
  214. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
  215. package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
  216. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
  217. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
  218. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
  219. package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
  220. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
  221. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
  222. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
  223. package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
  224. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  225. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
  226. package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
  227. package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
  228. package/deps/rocksdb/rocksdb/options/options.cc +5 -1
  229. package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
  230. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
  231. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
  232. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
  233. package/deps/rocksdb/rocksdb/port/lang.h +4 -0
  234. package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
  235. package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
  236. package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
  237. package/deps/rocksdb/rocksdb/src.mk +12 -0
  238. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
  239. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  240. package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
  241. package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
  242. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
  243. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
  244. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
  245. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
  246. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
  247. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
  248. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
  249. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
  250. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
  251. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
  252. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
  253. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
  254. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
  255. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
  256. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
  257. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
  258. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
  259. package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
  260. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
  261. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
  262. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
  263. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
  264. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
  265. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
  266. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
  267. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
  268. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
  269. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  270. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
  271. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
  272. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
  273. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
  274. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
  275. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
  276. package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
  277. package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
  278. package/deps/rocksdb/rocksdb/table/format.cc +27 -15
  279. package/deps/rocksdb/rocksdb/table/format.h +41 -15
  280. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
  281. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
  282. package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
  283. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
  284. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  285. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
  286. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
  287. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
  288. package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
  289. package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
  290. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
  291. package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
  292. package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
  293. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
  294. package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
  295. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
  296. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
  297. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
  298. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
  299. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
  300. package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
  301. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
  302. package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
  303. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
  304. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
  305. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
  306. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
  307. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
  308. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
  309. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
  310. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
  311. package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
  312. package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
  313. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
  314. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
  315. package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
  316. package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
  317. package/deps/rocksdb/rocksdb/util/coding.h +14 -27
  318. package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
  319. package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
  320. package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
  321. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
  322. package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
  323. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
  324. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
  325. package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
  326. package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
  327. package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
  328. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
  329. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
  330. package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
  331. package/deps/rocksdb/rocksdb/util/math.h +3 -1
  332. package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
  333. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
  334. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
  335. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
  336. package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
  337. package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
  338. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
  339. package/deps/rocksdb/rocksdb/util/status.cc +3 -1
  340. package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
  341. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
  342. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
  343. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
  344. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
  345. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
  346. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
  347. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
  348. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
  349. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
  350. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
  351. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
  352. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
  353. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
  354. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
  355. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
  356. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
  357. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
  358. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  359. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
  360. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
  361. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
  362. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
  363. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
  364. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
  365. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
  366. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
  367. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
  368. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
  369. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
  370. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
  371. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
  372. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
  373. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
  374. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
  375. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
  376. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
  377. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
  378. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
  379. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
  380. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
  381. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
  382. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
  383. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
  384. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
  385. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
  386. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
  387. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
  388. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
  389. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  390. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
  391. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
  392. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
  393. package/deps/rocksdb/rocksdb.gyp +7 -0
  394. package/index.js +70 -10
  395. package/iterator.js +25 -3
  396. package/max_rev_operator.h +9 -5
  397. package/package.json +1 -1
  398. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  399. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  400. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
  401. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
@@ -0,0 +1,1099 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // This source code is licensed under both the GPLv2 (found in the
3
+ // COPYING file in the root directory) and Apache 2.0 License
4
+ // (found in the LICENSE.Apache file in the root directory).
5
+
6
+ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
7
+ // This source code is licensed under both the GPLv2 (found in the
8
+ // COPYING file in the root directory) and Apache 2.0 License
9
+ // (found in the LICENSE.Apache file in the root directory).
10
+ //
11
+ // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
12
+ // Use of this source code is governed by a BSD-style license that can be
13
+ // found in the LICENSE file. See the AUTHORS file for names of contributors.
14
+
15
+ #include "util/io_dispatcher_imp.h"
16
+
17
+ #include <deque>
18
+ #include <memory>
19
+ #include <unordered_map>
20
+ #include <unordered_set>
21
+ #include <vector>
22
+
23
+ #include "file/random_access_file_reader.h"
24
+ #include "monitoring/statistics_impl.h"
25
+ #include "port/port.h"
26
+ #include "rocksdb/file_system.h"
27
+ #include "rocksdb/io_dispatcher.h"
28
+ #include "rocksdb/options.h"
29
+ #include "rocksdb/status.h"
30
+ #include "table/block_based/block_based_table_reader.h"
31
+ #include "table/block_based/cachable_entry.h"
32
+ #include "table/block_based/reader_common.h"
33
+ #include "table/format.h"
34
+ #include "test_util/sync_point.h"
35
+ #include "util/mutexlock.h"
36
+
37
+ namespace ROCKSDB_NAMESPACE {
38
+
39
// Minimal callback interface through which a ReadSet hands prefetch memory
// budget back to the dispatcher. It is declared ahead of the ReadSet method
// definitions so that they can call ReleaseMemory() on it.
struct IODispatcherImplData {
  virtual ~IODispatcherImplData() = default;

  // Returns `bytes` of budget to the dispatcher.
  virtual void ReleaseMemory(size_t bytes) = 0;
};
46
+
47
+ // Helper function to create and pin a block from a buffer
48
+ // Used by both ReadSet::PollAndProcessAsyncIO and IODispatcherImpl::Impl
49
+ static Status CreateAndPinBlockFromBuffer(
50
+ const std::shared_ptr<IOJob>& job, const BlockHandle& block,
51
+ uint64_t buffer_start_offset, const Slice& buffer_data,
52
+ CachableEntry<Block>& pinned_block_entry) {
53
+ auto* rep = job->table->get_rep();
54
+
55
+ // Get decompressor
56
+ UnownedPtr<Decompressor> decompressor = rep->decompressor.get();
57
+ CachableEntry<DecompressorDict> cached_dict;
58
+
59
+ if (rep->uncompression_dict_reader) {
60
+ Status s = rep->uncompression_dict_reader->GetOrReadUncompressionDictionary(
61
+ nullptr, job->job_options.read_options, nullptr, nullptr, &cached_dict);
62
+ if (!s.ok()) {
63
+ return s;
64
+ }
65
+ if (cached_dict.GetValue()) {
66
+ decompressor = cached_dict.GetValue()->decompressor_.get();
67
+ }
68
+ }
69
+
70
+ // Create block from buffer data
71
+ const auto block_size_with_trailer =
72
+ BlockBasedTable::BlockSizeWithTrailer(block);
73
+ const auto block_offset_in_buffer = block.offset() - buffer_start_offset;
74
+
75
+ CacheAllocationPtr data = AllocateBlock(
76
+ block_size_with_trailer, GetMemoryAllocator(rep->table_options));
77
+ memcpy(data.get(), buffer_data.data() + block_offset_in_buffer,
78
+ block_size_with_trailer);
79
+ BlockContents tmp_contents(std::move(data), block.size());
80
+
81
+ #ifndef NDEBUG
82
+ tmp_contents.has_trailer = rep->footer.GetBlockTrailerSize() > 0;
83
+ #endif
84
+
85
+ return job->table->CreateAndPinBlockInCache<Block_kData>(
86
+ job->job_options.read_options, block, decompressor, &tmp_contents,
87
+ &pinned_block_entry.As<Block_kData>());
88
+ }
89
+
90
+ // State for async IO operations (implementation detail)
91
+ struct AsyncIOState {
92
+ AsyncIOState() : offset(static_cast<uint64_t>(-1)) {}
93
+ ~AsyncIOState() { read_req.status.PermitUncheckedError(); }
94
+
95
+ AsyncIOState(const AsyncIOState&) = delete;
96
+ AsyncIOState& operator=(const AsyncIOState&) = delete;
97
+ AsyncIOState(AsyncIOState&&) = default;
98
+ AsyncIOState& operator=(AsyncIOState&&) = default;
99
+
100
+ std::unique_ptr<char[]> buf;
101
+ AlignedBuf aligned_buf;
102
+ void* io_handle = nullptr;
103
+ IOHandleDeleter del_fn;
104
+ uint64_t offset;
105
+ std::vector<size_t> block_indices;
106
+ std::vector<BlockHandle> blocks;
107
+ FSReadRequest read_req;
108
+ };
109
+
110
+ // ReadSet destructor - clean up IO handles
111
+ // Must call AbortIO before deleting handles to avoid use-after-free when
112
+ // io_uring completions arrive for deleted handles.
113
+ ReadSet::~ReadSet() {
114
+ // Release memory for any blocks still pinned
115
+ // Note: block_sizes_[i] is only set for async IO reads where memory
116
+ // limiting applies. For sync reads, block_sizes_ remains 0, so this
117
+ // loop is effectively a no-op for sync reads.
118
+ if (auto dispatcher_data = dispatcher_data_.lock()) {
119
+ for (size_t i = 0; i < block_sizes_.size(); ++i) {
120
+ if (block_sizes_[i] > 0 && pinned_blocks_[i].GetValue()) {
121
+ dispatcher_data->ReleaseMemory(block_sizes_[i]);
122
+ }
123
+ }
124
+ }
125
+
126
+ if (async_io_map_.empty()) {
127
+ return;
128
+ }
129
+
130
+ // Collect unique pending IO handles (multiple block indices may share the
131
+ // same async_state due to coalescing)
132
+ std::vector<void*> pending_handles;
133
+ std::unordered_set<void*> seen_handles;
134
+ for (auto& pair : async_io_map_) {
135
+ auto& async_state = pair.second;
136
+ if (async_state->io_handle != nullptr &&
137
+ seen_handles.find(async_state->io_handle) == seen_handles.end()) {
138
+ pending_handles.push_back(async_state->io_handle);
139
+ seen_handles.insert(async_state->io_handle);
140
+ }
141
+ }
142
+
143
+ // Abort all pending IO operations before deleting handles
144
+ if (!pending_handles.empty() && fs_) {
145
+ // AbortIO cancels pending requests and waits for completions
146
+ IOStatus s = fs_->AbortIO(pending_handles);
147
+ (void)s; // Ignore errors in destructor
148
+ }
149
+
150
+ // Now safe to delete the handles
151
+ for (auto& pair : async_io_map_) {
152
+ auto& async_state = pair.second;
153
+ if (async_state->io_handle != nullptr && async_state->del_fn != nullptr) {
154
+ async_state->del_fn(async_state->io_handle);
155
+ async_state->io_handle = nullptr;
156
+ }
157
+ }
158
+ }
159
+
160
+ // Main Read() method - transparently handles cache, async IO, and sync reads
161
+ Status ReadSet::ReadIndex(size_t block_index, CachableEntry<Block>* out) {
162
+ // Bounds check
163
+ if (block_index >= pinned_blocks_.size()) {
164
+ return Status::InvalidArgument("Block index out of range");
165
+ }
166
+
167
+ // Case 1: Block is already available (from cache or sync read during
168
+ // SubmitJob)
169
+ if (pinned_blocks_[block_index].GetValue()) {
170
+ *out = std::move(pinned_blocks_[block_index]);
171
+ // Release memory accounting for prefetched blocks. After moving the value
172
+ // out, ReleaseBlock() and the destructor check pinned_blocks_.GetValue()
173
+ // which will be null, so they won't release memory again.
174
+ if (block_index < block_sizes_.size() && block_sizes_[block_index] > 0) {
175
+ if (auto dispatcher_data = dispatcher_data_.lock()) {
176
+ dispatcher_data->ReleaseMemory(block_sizes_[block_index]);
177
+ }
178
+ block_sizes_[block_index] = 0;
179
+ }
180
+ // Note: Statistics for this block were already counted during SubmitJob
181
+ // (either as cache hit or sync read)
182
+ return Status::OK();
183
+ }
184
+
185
+ // Case 2: Block has async IO in progress - poll and process
186
+ if (job_->job_options.read_options.async_io) {
187
+ auto it = async_io_map_.find(block_index);
188
+ if (it != async_io_map_.end()) {
189
+ // Get the number of blocks in this coalesced async request BEFORE polling
190
+ // (since PollAndProcessAsyncIO will remove entries from the map)
191
+ size_t num_blocks_in_request = it->second->block_indices.size();
192
+
193
+ if (Status s = PollAndProcessAsyncIO(it->second); !s.ok()) {
194
+ return s;
195
+ }
196
+ // Count all blocks that were read in this async request
197
+ num_async_reads_ += num_blocks_in_request;
198
+
199
+ // After polling, the block should be in pinned_blocks_
200
+ if (pinned_blocks_[block_index].GetValue()) {
201
+ *out = std::move(pinned_blocks_[block_index]);
202
+ // Release memory accounting (same as case 1 above)
203
+ if (block_index < block_sizes_.size() &&
204
+ block_sizes_[block_index] > 0) {
205
+ if (auto dispatcher_data = dispatcher_data_.lock()) {
206
+ dispatcher_data->ReleaseMemory(block_sizes_[block_index]);
207
+ }
208
+ block_sizes_[block_index] = 0;
209
+ }
210
+ return Status::OK();
211
+ }
212
+
213
+ return Status::IOError("Failed to process async IO result");
214
+ }
215
+ }
216
+
217
+ // Case 3: Block needs synchronous read (pending or never-dispatched blocks).
218
+ // No ReleaseMemory() needed here because blocks reaching this path never had
219
+ // TryAcquireMemory() called — they were either pending prefetch or skipped
220
+ // during SubmitJob. block_sizes_[block_index] may be > 0 (set during
221
+ // SubmitJob for all uncached blocks) but that does not imply memory was
222
+ // acquired.
223
+ RemoveFromPending(block_index);
224
+
225
+ Status s = SyncRead(block_index);
226
+ if (s.ok()) {
227
+ *out = std::move(pinned_blocks_[block_index]);
228
+ num_sync_reads_++;
229
+ }
230
+ return s;
231
+ }
232
+
233
+ Status ReadSet::ReadOffset(size_t offset, CachableEntry<Block>* out) {
234
+ if (sorted_block_indices_.empty()) {
235
+ return Status::InvalidArgument("ReadSet not initialized");
236
+ }
237
+
238
+ // Use binary search on the sorted index to find the block containing offset.
239
+ // sorted_block_indices_ contains original indices sorted by block offset.
240
+ const auto& block_handles = job_->block_handles;
241
+
242
+ // Binary search for the first block whose offset is > offset, then back up
243
+ auto it = std::upper_bound(sorted_block_indices_.begin(),
244
+ sorted_block_indices_.end(), offset,
245
+ [&block_handles](size_t off, size_t idx) {
246
+ return off < block_handles[idx].offset();
247
+ });
248
+
249
+ // If it == begin(), offset is before all blocks
250
+ if (it == sorted_block_indices_.begin()) {
251
+ return Status::InvalidArgument("Offset not found in any block");
252
+ }
253
+
254
+ // Back up to the candidate block (largest offset <= our offset)
255
+ --it;
256
+ size_t candidate_idx = *it;
257
+ const auto& handle = block_handles[candidate_idx];
258
+
259
+ // Check if offset falls within this block
260
+ if (offset >= handle.offset() && offset < (handle.offset() + handle.size())) {
261
+ return ReadIndex(candidate_idx, out);
262
+ }
263
+
264
+ return Status::InvalidArgument("Offset not found in any block");
265
+ }
266
+
267
+ void ReadSet::ReleaseBlock(size_t block_index) {
268
+ if (block_index >= pinned_blocks_.size()) {
269
+ return;
270
+ }
271
+
272
+ // Remove from pending if applicable
273
+ RemoveFromPending(block_index);
274
+
275
+ // Release memory BEFORE unpinning
276
+ // Note: block_sizes_[idx] is only set for async IO reads where memory
277
+ // limiting applies. For sync reads, block_sizes_ remains 0, so this
278
+ // check implicitly skips ReleaseMemory for sync reads.
279
+ if (pinned_blocks_[block_index].GetValue() &&
280
+ block_index < block_sizes_.size() && block_sizes_[block_index] > 0) {
281
+ if (auto dispatcher_data = dispatcher_data_.lock()) {
282
+ dispatcher_data->ReleaseMemory(block_sizes_[block_index]);
283
+ }
284
+ block_sizes_[block_index] = 0; // Prevent double-release
285
+ }
286
+
287
+ // Unpin the block from cache
288
+ pinned_blocks_[block_index].Reset();
289
+ // Clean up any pending async IO for this block
290
+ async_io_map_.erase(block_index);
291
+ }
292
+
293
+ bool ReadSet::IsBlockAvailable(size_t block_index) const {
294
+ if (block_index >= pinned_blocks_.size()) {
295
+ return false;
296
+ }
297
+ // Block is available if it hasn't been released (still has a value or
298
+ // has pending async IO)
299
+ return pinned_blocks_[block_index].GetValue() != nullptr ||
300
+ async_io_map_.find(block_index) != async_io_map_.end();
301
+ }
302
+
303
+ // Poll and process async IO for a specific block
304
+ Status ReadSet::PollAndProcessAsyncIO(
305
+ const std::shared_ptr<AsyncIOState>& async_state) {
306
+ auto* rep = job_->table->get_rep();
307
+
308
+ // Poll for IO completion using FileSystem Poll API
309
+ std::vector<void*> io_handles = {async_state->io_handle};
310
+ IOStatus io_s = rep->ioptions.env->GetFileSystem()->Poll(io_handles, 1);
311
+ if (!io_s.ok()) {
312
+ return io_s;
313
+ }
314
+
315
+ // Check for read errors
316
+ if (!async_state->read_req.status.ok()) {
317
+ return async_state->read_req.status;
318
+ }
319
+
320
+ // Use the result slice from the callback which has been correctly set
321
+ // with any necessary alignment adjustments for direct IO
322
+ const Slice& buffer_data = async_state->read_req.result;
323
+
324
+ // Process all blocks in this async request
325
+ for (size_t i = 0; i < async_state->block_indices.size(); ++i) {
326
+ const size_t idx = async_state->block_indices[i];
327
+ const auto& block_handle = async_state->blocks[i];
328
+
329
+ Status s =
330
+ CreateAndPinBlockFromBuffer(job_, block_handle, async_state->offset,
331
+ buffer_data, pinned_blocks_[idx]);
332
+ if (!s.ok()) {
333
+ return s;
334
+ }
335
+ }
336
+
337
+ // Clean up IO handle
338
+ if (async_state->io_handle != nullptr && async_state->del_fn != nullptr) {
339
+ async_state->del_fn(async_state->io_handle);
340
+ async_state->io_handle = nullptr;
341
+ }
342
+
343
+ // Remove from map - all blocks in this request have been processed
344
+ // Store indices in a temporary vector to avoid iterator invalidation
345
+ std::vector<size_t> indices_to_remove = async_state->block_indices;
346
+ for (const auto idx : indices_to_remove) {
347
+ async_io_map_.erase(idx);
348
+ }
349
+
350
+ return Status::OK();
351
+ }
352
+
353
+ // Perform synchronous read for a specific block
354
+ // This performs a direct synchronous read from disk when the block is not in
355
+ // cache
356
+ Status ReadSet::SyncRead(size_t block_index) {
357
+ const auto& block_handle = job_->block_handles[block_index];
358
+ auto* rep = job_->table->get_rep();
359
+
360
+ // Get dictionary-aware decompressor if available
361
+ UnownedPtr<Decompressor> decompressor = rep->decompressor.get();
362
+ CachableEntry<DecompressorDict> cached_dict;
363
+ if (rep->uncompression_dict_reader) {
364
+ Status s = rep->uncompression_dict_reader->GetOrReadUncompressionDictionary(
365
+ nullptr, job_->job_options.read_options, nullptr, nullptr,
366
+ &cached_dict);
367
+ if (!s.ok()) {
368
+ return s;
369
+ }
370
+ if (cached_dict.GetValue()) {
371
+ decompressor = cached_dict.GetValue()->decompressor_.get();
372
+ }
373
+ }
374
+
375
+ return job_->table->RetrieveBlock<Block_kData>(
376
+ /*prefetch_buffer=*/nullptr, job_->job_options.read_options, block_handle,
377
+ decompressor, &pinned_blocks_[block_index].As<Block_kData>(),
378
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr,
379
+ /*for_compaction=*/false, /*use_cache=*/true,
380
+ /*async_read=*/false, /*use_block_cache_for_lookup=*/true);
381
+ }
382
+
383
// A set of blocks that were coalesced ahead of time so that they can be
// prefetched together.
struct CoalescedPrefetchGroup {
  // Job-relative indices of the blocks in this group, sorted by offset.
  std::vector<size_t> block_indices;
  // Number of bytes this group's IO accounts for.
  size_t total_bytes = 0;
};
388
+
389
+ // State for a pending memory request waiting to be granted
390
+ // Groups are pre-coalesced at queue time for efficient dispatch
391
+ struct PendingPrefetchRequest {
392
+ std::weak_ptr<ReadSet> read_set;
393
+ std::shared_ptr<IOJob> job;
394
+
395
+ // Pre-coalesced groups ready for dispatch (ordered by first block index)
396
+ std::deque<CoalescedPrefetchGroup> coalesced_groups;
397
+
398
+ // Individual block indices still pending (for RemoveFromPending lookup)
399
+ std::unordered_set<size_t> block_indices_to_prefetch;
400
+
401
+ std::atomic<size_t> pending_bytes_{0}; // Track remaining bytes
402
+ mutable port::Mutex groups_mutex_; // Protects groups and set modifications
403
+ };
404
+
405
+ // Remove a block from pending prefetch (called when block is read or released)
406
+ void ReadSet::RemoveFromPending(size_t block_index) {
407
+ if (!pending_prefetch_flags_ || block_index >= pending_prefetch_flags_size_) {
408
+ return;
409
+ }
410
+
411
+ // Atomic exchange - returns true only if it was previously true
412
+ if (!pending_prefetch_flags_[block_index].exchange(false)) {
413
+ return; // Already removed or never pending
414
+ }
415
+
416
+ if (pending_request_) {
417
+ MutexLock lock(&pending_request_->groups_mutex_);
418
+ pending_request_->block_indices_to_prefetch.erase(block_index);
419
+ pending_request_->pending_bytes_ -= block_sizes_[block_index];
420
+ }
421
+ }
422
+
423
+ // IODispatcherImpl::Impl inherits from IODispatcherImplData
424
+ struct IODispatcherImpl::Impl : public IODispatcherImplData,
425
+ public std::enable_shared_from_this<Impl> {
426
+ explicit Impl(const IODispatcherOptions& options);
427
+ ~Impl() override;
428
+
429
+ // Non-copyable and non-movable
430
+ Impl(const Impl&) = delete;
431
+ Impl& operator=(const Impl&) = delete;
432
+ Impl(Impl&&) = delete;
433
+ Impl& operator=(Impl&&) = delete;
434
+
435
+ Status SubmitJob(const std::shared_ptr<IOJob>& job,
436
+ std::shared_ptr<ReadSet>* read_set);
437
+
438
+ // Memory management methods - non-blocking
439
+ bool TryAcquireMemory(size_t bytes);
440
+ void ReleaseMemory(size_t bytes) override;
441
+
442
+ // Memory limiting state
443
+ size_t max_prefetch_memory_bytes_ = 0;
444
+ std::atomic<size_t> memory_used_{0}; // Atomic for lock-free accounting
445
+ std::atomic<bool> has_pending_requests_{false}; // Fast-path check
446
+ port::Mutex memory_mutex_; // Only for pending_prefetch_queue_ access
447
+ std::deque<std::shared_ptr<PendingPrefetchRequest>> pending_prefetch_queue_;
448
+ Statistics* statistics_ = nullptr;
449
+
450
+ private:
451
+ void PrepareIORequests(
452
+ const std::shared_ptr<IOJob>& job,
453
+ const std::vector<size_t>& block_indices_to_read,
454
+ const std::vector<BlockHandle>& block_handles,
455
+ std::vector<FSReadRequest>* read_reqs,
456
+ std::vector<std::vector<size_t>>* coalesced_block_indices);
457
+
458
+ // Surface actual async IO errors to caller, but allow fallback for
459
+ // unsupported cases. Returns block indices that need sync fallback.
460
+ std::vector<size_t> ExecuteAsyncIO(
461
+ const std::shared_ptr<IOJob>& job,
462
+ const std::shared_ptr<ReadSet>& read_set,
463
+ std::vector<FSReadRequest>& read_reqs,
464
+ const std::vector<std::vector<size_t>>& coalesced_block_indices,
465
+ Status* out_status);
466
+
467
+ Status ExecuteSyncIO(
468
+ const std::shared_ptr<IOJob>& job,
469
+ const std::shared_ptr<ReadSet>& read_set,
470
+ std::vector<FSReadRequest>& read_reqs,
471
+ const std::vector<std::vector<size_t>>& coalesced_block_indices);
472
+
473
+ // Try to dispatch pending prefetch requests when memory becomes available
474
+ void TryDispatchPendingPrefetches();
475
+
476
+ // Dispatch prefetch for a specific ReadSet (called when memory is available)
477
+ void DispatchPrefetch(const std::shared_ptr<ReadSet>& read_set,
478
+ const std::shared_ptr<IOJob>& job,
479
+ const std::vector<size_t>& block_indices);
480
+
481
+ // Pre-coalesce blocks into groups, respecting max_group_bytes size limit.
482
+ // Returns groups ordered by first block index (earlier blocks first).
483
+ std::vector<CoalescedPrefetchGroup> PreCoalesceBlocks(
484
+ const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& rs,
485
+ const std::vector<size_t>& block_indices, size_t max_group_bytes);
486
+ };
487
+
488
+ IODispatcherImpl::Impl::Impl(const IODispatcherOptions& options)
489
+ : max_prefetch_memory_bytes_(options.max_prefetch_memory_bytes),
490
+ statistics_(options.statistics) {}
491
+
492
+ IODispatcherImpl::Impl::~Impl() {}
493
+
494
+ bool IODispatcherImpl::Impl::TryAcquireMemory(size_t bytes) {
495
+ if (max_prefetch_memory_bytes_ == 0) {
496
+ return true; // No limit configured
497
+ }
498
+
499
+ // Lock-free memory acquisition using compare-exchange
500
+ size_t current = memory_used_.load(std::memory_order_relaxed);
501
+ while (true) {
502
+ if (current + bytes > max_prefetch_memory_bytes_) {
503
+ // Not enough memory - caller should queue for later
504
+ RecordTick(statistics_, PREFETCH_MEMORY_REQUESTS_BLOCKED);
505
+ return false;
506
+ }
507
+ if (memory_used_.compare_exchange_weak(current, current + bytes,
508
+ std::memory_order_release,
509
+ std::memory_order_relaxed)) {
510
+ RecordTick(statistics_, PREFETCH_MEMORY_BYTES_GRANTED, bytes);
511
+ return true;
512
+ }
513
+ // current is updated by compare_exchange_weak on failure, retry
514
+ }
515
+ }
516
+
517
+ void IODispatcherImpl::Impl::ReleaseMemory(size_t bytes) {
518
+ if (max_prefetch_memory_bytes_ == 0) {
519
+ return; // No limit configured
520
+ }
521
+
522
+ // Lock-free memory release using atomic fetch_sub
523
+ size_t old_val = memory_used_.fetch_sub(bytes, std::memory_order_release);
524
+ assert(old_val >= bytes);
525
+ (void)old_val; // Suppress unused warning in release builds
526
+ RecordTick(statistics_, PREFETCH_MEMORY_BYTES_RELEASED, bytes);
527
+
528
+ // Fast-path: skip dispatch attempt if no pending requests
529
+ // This avoids mutex contention in the common single-threaded iterator case
530
+ if (!has_pending_requests_.load(std::memory_order_acquire)) {
531
+ return;
532
+ }
533
+
534
+ // Try to dispatch pending prefetches now that memory is available
535
+ TryDispatchPendingPrefetches();
536
+ }
537
+
538
+ void IODispatcherImpl::Impl::TryDispatchPendingPrefetches() {
539
+ // Process pending prefetch requests - dispatch entire coalesced groups
540
+ while (true) {
541
+ std::shared_ptr<PendingPrefetchRequest> pending;
542
+
543
+ {
544
+ MutexLock lock(&memory_mutex_);
545
+ if (pending_prefetch_queue_.empty()) {
546
+ has_pending_requests_.store(false, std::memory_order_release);
547
+ return;
548
+ }
549
+
550
+ // Get the next pending request
551
+ pending = std::move(pending_prefetch_queue_.front());
552
+ pending_prefetch_queue_.pop_front();
553
+ }
554
+
555
+ // Check if the ReadSet is still alive
556
+ auto read_set = pending->read_set.lock();
557
+ if (!read_set) {
558
+ continue; // ReadSet was destroyed, skip this request
559
+ }
560
+
561
+ // Try to acquire memory for coalesced groups (entire groups at a time)
562
+ std::vector<size_t> blocks_to_dispatch;
563
+ bool has_remaining_groups = false;
564
+
565
+ {
566
+ MutexLock lock(&pending->groups_mutex_);
567
+
568
+ while (!pending->coalesced_groups.empty()) {
569
+ auto& group = pending->coalesced_groups.front();
570
+
571
+ // Filter out blocks that were already read (not in pending set anymore)
572
+ std::vector<size_t> remaining_blocks;
573
+ size_t remaining_bytes = 0;
574
+ for (size_t idx : group.block_indices) {
575
+ if (pending->block_indices_to_prefetch.count(idx) > 0) {
576
+ remaining_blocks.push_back(idx);
577
+ remaining_bytes += read_set->block_sizes_[idx];
578
+ }
579
+ }
580
+
581
+ // Skip empty groups (all blocks were already read)
582
+ if (remaining_blocks.empty()) {
583
+ pending->coalesced_groups.pop_front();
584
+ continue;
585
+ }
586
+
587
+ // Try to acquire memory for remaining blocks only
588
+ if (TryAcquireMemory(remaining_bytes)) {
589
+ // Add all remaining blocks from this group to dispatch
590
+ for (size_t idx : remaining_blocks) {
591
+ blocks_to_dispatch.push_back(idx);
592
+ pending->block_indices_to_prefetch.erase(idx);
593
+ }
594
+ pending->pending_bytes_ -= remaining_bytes;
595
+ pending->coalesced_groups.pop_front();
596
+ } else {
597
+ // Not enough memory for this group - update with remaining blocks
598
+ group.block_indices = std::move(remaining_blocks);
599
+ group.total_bytes = remaining_bytes;
600
+ has_remaining_groups = true;
601
+ break;
602
+ }
603
+ }
604
+ }
605
+
606
+ // Save job before potential move of pending
607
+ auto job = pending->job;
608
+
609
+ // Requeue if groups remain
610
+ if (has_remaining_groups) {
611
+ MutexLock lock(&memory_mutex_);
612
+ pending_prefetch_queue_.push_front(std::move(pending));
613
+ } else {
614
+ // All groups dispatched, clear pending state
615
+ read_set->pending_request_.reset();
616
+ }
617
+
618
+ // Clear pending flags for dispatched blocks
619
+ if (read_set->pending_prefetch_flags_) {
620
+ for (size_t idx : blocks_to_dispatch) {
621
+ if (idx < read_set->pending_prefetch_flags_size_) {
622
+ read_set->pending_prefetch_flags_[idx].store(false);
623
+ }
624
+ }
625
+ }
626
+
627
+ // Dispatch acquired blocks
628
+ if (!blocks_to_dispatch.empty()) {
629
+ DispatchPrefetch(read_set, job, blocks_to_dispatch);
630
+ }
631
+
632
+ // If we dispatched nothing, stop (no memory available for any group)
633
+ if (blocks_to_dispatch.empty()) {
634
+ return;
635
+ }
636
+ }
637
+ }
638
+
639
+ void IODispatcherImpl::Impl::DispatchPrefetch(
640
+ const std::shared_ptr<ReadSet>& read_set, const std::shared_ptr<IOJob>& job,
641
+ const std::vector<size_t>& block_indices) {
642
+ // Sync point for testing partial prefetch - passes number of blocks being
643
+ // dispatched
644
+ TEST_SYNC_POINT_CALLBACK("IODispatcherImpl::DispatchPrefetch:BlockCount",
645
+ const_cast<std::vector<size_t>*>(&block_indices));
646
+
647
+ // Prepare and execute IO for the given blocks
648
+ std::vector<FSReadRequest> read_reqs;
649
+ std::vector<std::vector<size_t>> coalesced_block_indices;
650
+ PrepareIORequests(job, block_indices, job->block_handles, &read_reqs,
651
+ &coalesced_block_indices);
652
+
653
+ if (job->job_options.read_options.async_io) {
654
+ Status async_status;
655
+ std::vector<size_t> fallback_indices = ExecuteAsyncIO(
656
+ job, read_set, read_reqs, coalesced_block_indices, &async_status);
657
+
658
+ // For blocks where async is not supported, do sync IO
659
+ if (!fallback_indices.empty()) {
660
+ std::vector<FSReadRequest> sync_read_reqs;
661
+ std::vector<std::vector<size_t>> sync_coalesced_indices;
662
+ PrepareIORequests(job, fallback_indices, job->block_handles,
663
+ &sync_read_reqs, &sync_coalesced_indices);
664
+ // Prefetch errors are ignored - user will get the error when reading
665
+ Status s =
666
+ ExecuteSyncIO(job, read_set, sync_read_reqs, sync_coalesced_indices);
667
+ s.PermitUncheckedError();
668
+ read_set->num_sync_reads_ += fallback_indices.size();
669
+ }
670
+ // Async errors are also ignored - user will get the error when reading
671
+ async_status.PermitUncheckedError();
672
+ } else {
673
+ // Prefetch errors are ignored - user will get the error when reading
674
+ Status s = ExecuteSyncIO(job, read_set, read_reqs, coalesced_block_indices);
675
+ s.PermitUncheckedError();
676
+ read_set->num_sync_reads_ += block_indices.size();
677
+ }
678
+ }
679
+
680
+ Status IODispatcherImpl::Impl::SubmitJob(const std::shared_ptr<IOJob>& job,
681
+ std::shared_ptr<ReadSet>* read_set) {
682
+ if (!read_set) {
683
+ return Status::InvalidArgument("read_set output parameter is null");
684
+ }
685
+
686
+ auto rs = std::make_shared<ReadSet>();
687
+
688
+ // Initialize ReadSet
689
+ rs->job_ = job;
690
+ rs->fs_ = job->table->get_rep()->ioptions.env->GetFileSystem();
691
+ rs->pinned_blocks_.resize(job->block_handles.size());
692
+ rs->block_sizes_.resize(job->block_handles.size(), 0);
693
+
694
+ // Build sorted index for O(log n) ReadOffset lookups via binary search.
695
+ // sorted_block_indices_[i] = original index of i-th smallest block by offset.
696
+ rs->sorted_block_indices_.resize(job->block_handles.size());
697
+ for (size_t i = 0; i < job->block_handles.size(); ++i) {
698
+ rs->sorted_block_indices_[i] = i;
699
+ }
700
+ std::sort(rs->sorted_block_indices_.begin(), rs->sorted_block_indices_.end(),
701
+ [&job](size_t a, size_t b) {
702
+ return job->block_handles[a].offset() <
703
+ job->block_handles[b].offset();
704
+ });
705
+
706
+ // Step 1: Check cache and pin cached blocks
707
+ std::vector<size_t> block_indices_to_read;
708
+
709
+ for (size_t i = 0; i < job->block_handles.size(); ++i) {
710
+ const auto& data_block_handle = job->block_handles[i];
711
+
712
+ // Lookup and pin block in cache
713
+ Status s = job->table->LookupAndPinBlocksInCache<Block_kData>(
714
+ job->job_options.read_options, data_block_handle,
715
+ &(rs->pinned_blocks_)[i].As<Block_kData>());
716
+
717
+ if (!s.ok()) {
718
+ continue;
719
+ }
720
+
721
+ if (!(rs->pinned_blocks_)[i].GetValue()) {
722
+ // Block not in cache - needs to be read from disk
723
+ block_indices_to_read.emplace_back(i);
724
+ }
725
+ }
726
+
727
+ // Step 2: Prepare IO requests for blocks not in cache
728
+ if (block_indices_to_read.empty()) {
729
+ // All blocks found in cache - count them as cache hits
730
+ rs->num_cache_hits_ = job->block_handles.size();
731
+ *read_set = std::move(rs);
732
+ return Status::OK();
733
+ }
734
+
735
+ // Count cache hits (blocks that were found in cache during lookup above)
736
+ rs->num_cache_hits_ =
737
+ job->block_handles.size() - block_indices_to_read.size();
738
+
739
+ // Calculate block sizes for uncached blocks
740
+ for (const auto& idx : block_indices_to_read) {
741
+ size_t block_size =
742
+ BlockBasedTable::BlockSizeWithTrailer(job->block_handles[idx]);
743
+ rs->block_sizes_[idx] = block_size;
744
+ }
745
+
746
+ // Store dispatcher reference for release callbacks
747
+ rs->dispatcher_data_ = shared_from_this();
748
+
749
+ // Pre-coalesce blocks into groups, respecting memory budget per group
750
+ // This ensures we dispatch meaningful IO sizes, not tiny single-block IOs
751
+ // Both memory-limited and non-memory-limited paths use the same coalescing
752
+ auto coalesced_groups = PreCoalesceBlocks(job, rs, block_indices_to_read,
753
+ max_prefetch_memory_bytes_);
754
+
755
+ std::vector<size_t> blocks_to_dispatch;
756
+ std::deque<CoalescedPrefetchGroup> groups_to_queue;
757
+
758
+ // Try to acquire memory for entire coalesced groups
759
+ for (auto& group : coalesced_groups) {
760
+ if (TryAcquireMemory(group.total_bytes)) {
761
+ // Add all blocks from this group to dispatch
762
+ for (size_t idx : group.block_indices) {
763
+ blocks_to_dispatch.push_back(idx);
764
+ }
765
+ } else {
766
+ // Queue this group for later
767
+ groups_to_queue.push_back(std::move(group));
768
+ }
769
+ }
770
+
771
+ // Dispatch acquired blocks immediately
772
+ if (!blocks_to_dispatch.empty()) {
773
+ DispatchPrefetch(rs, job, blocks_to_dispatch);
774
+ }
775
+
776
+ // Queue remaining groups for later (only applies when memory limiting)
777
+ if (!groups_to_queue.empty()) {
778
+ auto pending = std::make_shared<PendingPrefetchRequest>();
779
+ pending->read_set = rs;
780
+ pending->job = job;
781
+
782
+ size_t pending_bytes = 0;
783
+ for (const auto& group : groups_to_queue) {
784
+ for (size_t idx : group.block_indices) {
785
+ pending->block_indices_to_prefetch.insert(idx);
786
+ }
787
+ pending_bytes += group.total_bytes;
788
+ }
789
+ pending->coalesced_groups = std::move(groups_to_queue);
790
+ pending->pending_bytes_ = pending_bytes;
791
+
792
+ // Set up pending flags for queued blocks only
793
+ size_t num_blocks = job->block_handles.size();
794
+ rs->pending_prefetch_flags_ =
795
+ std::make_unique<std::atomic<bool>[]>(num_blocks);
796
+ rs->pending_prefetch_flags_size_ = num_blocks;
797
+ for (size_t idx : pending->block_indices_to_prefetch) {
798
+ rs->pending_prefetch_flags_[idx].store(true);
799
+ }
800
+ rs->pending_request_ = pending;
801
+
802
+ {
803
+ MutexLock lock(&memory_mutex_);
804
+ pending_prefetch_queue_.push_back(std::move(pending));
805
+ has_pending_requests_.store(true, std::memory_order_release);
806
+ }
807
+ }
808
+
809
+ *read_set = std::move(rs);
810
+ return Status::OK();
811
+ }
812
+
813
+ void IODispatcherImpl::Impl::PrepareIORequests(
814
+ const std::shared_ptr<IOJob>& job,
815
+ const std::vector<size_t>& block_indices_to_read,
816
+ const std::vector<BlockHandle>& block_handles,
817
+ std::vector<FSReadRequest>* read_reqs,
818
+ std::vector<std::vector<size_t>>* coalesced_block_indices) {
819
+ // This is necessary because block handles may not be in sorted order
820
+ std::vector<size_t> sorted_block_indices = block_indices_to_read;
821
+ std::sort(sorted_block_indices.begin(), sorted_block_indices.end(),
822
+ [&block_handles](size_t a, size_t b) {
823
+ return block_handles[a].offset() < block_handles[b].offset();
824
+ });
825
+
826
+ assert(coalesced_block_indices->empty());
827
+ coalesced_block_indices->resize(1);
828
+
829
+ for (const auto& block_idx : sorted_block_indices) {
830
+ if (!coalesced_block_indices->back().empty()) {
831
+ // Check if we can coalesce with previous block
832
+ const auto& last_block_handle =
833
+ block_handles[coalesced_block_indices->back().back()];
834
+ uint64_t last_block_end =
835
+ last_block_handle.offset() +
836
+ BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
837
+ uint64_t current_start = block_handles[block_idx].offset();
838
+
839
+ if (current_start >
840
+ last_block_end + job->job_options.io_coalesce_threshold) {
841
+ // Gap too large - start new IO request
842
+ coalesced_block_indices->emplace_back();
843
+ }
844
+ }
845
+ coalesced_block_indices->back().emplace_back(block_idx);
846
+ }
847
+
848
+ // Create FSReadRequest for each coalesced group
849
+ assert(read_reqs->empty());
850
+ read_reqs->reserve(coalesced_block_indices->size());
851
+
852
+ for (const auto& block_indices : *coalesced_block_indices) {
853
+ assert(!block_indices.empty());
854
+
855
+ // Find the min and max offsets in this coalesced group
856
+ // Since blocks are now sorted, first has min offset and last has max
857
+ const auto& first_block_handle = block_handles[block_indices[0]];
858
+ const auto& last_block_handle = block_handles[block_indices.back()];
859
+
860
+ const auto start_offset = first_block_handle.offset();
861
+ const auto end_offset =
862
+ last_block_handle.offset() +
863
+ BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
864
+
865
+ assert(end_offset > start_offset);
866
+
867
+ read_reqs->emplace_back();
868
+ read_reqs->back().offset = start_offset;
869
+ read_reqs->back().len = end_offset - start_offset;
870
+ read_reqs->back().scratch = nullptr;
871
+ }
872
+ }
873
+
874
+ std::vector<CoalescedPrefetchGroup> IODispatcherImpl::Impl::PreCoalesceBlocks(
875
+ const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& rs,
876
+ const std::vector<size_t>& block_indices, size_t max_group_bytes) {
877
+ std::vector<CoalescedPrefetchGroup> groups;
878
+
879
+ if (block_indices.empty()) {
880
+ return groups;
881
+ }
882
+
883
+ const auto& block_handles = job->block_handles;
884
+ const uint64_t coalesce_threshold = job->job_options.io_coalesce_threshold;
885
+
886
+ // Sort block indices by offset for coalescing
887
+ std::vector<size_t> sorted_indices = block_indices;
888
+ std::sort(sorted_indices.begin(), sorted_indices.end(),
889
+ [&block_handles](size_t a, size_t b) {
890
+ return block_handles[a].offset() < block_handles[b].offset();
891
+ });
892
+
893
+ // Build coalesced groups respecting max_group_bytes
894
+ groups.emplace_back();
895
+
896
+ for (size_t idx : sorted_indices) {
897
+ size_t block_size = rs->block_sizes_[idx];
898
+
899
+ // Skip blocks that are individually larger than the memory budget
900
+ // These will be read synchronously when needed (via ReadIndex fallback)
901
+ if (max_group_bytes > 0 && block_size > max_group_bytes) {
902
+ continue;
903
+ }
904
+
905
+ // Check if we need to start a new group
906
+ bool start_new_group = false;
907
+
908
+ if (!groups.back().block_indices.empty()) {
909
+ // Check gap with previous block
910
+ size_t last_idx = groups.back().block_indices.back();
911
+ const auto& last_handle = block_handles[last_idx];
912
+ uint64_t last_end = last_handle.offset() +
913
+ BlockBasedTable::BlockSizeWithTrailer(last_handle);
914
+ uint64_t current_start = block_handles[idx].offset();
915
+
916
+ if (current_start > last_end + coalesce_threshold) {
917
+ start_new_group = true; // Gap too large
918
+ } else if (max_group_bytes > 0 &&
919
+ groups.back().total_bytes + block_size > max_group_bytes) {
920
+ start_new_group = true; // Would exceed size limit
921
+ }
922
+ }
923
+
924
+ if (start_new_group) {
925
+ groups.emplace_back();
926
+ }
927
+
928
+ groups.back().block_indices.push_back(idx);
929
+ groups.back().total_bytes += block_size;
930
+ }
931
+
932
+ return groups;
933
+ }
934
+
935
+ std::vector<size_t> IODispatcherImpl::Impl::ExecuteAsyncIO(
936
+ const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
937
+ std::vector<FSReadRequest>& read_reqs,
938
+ const std::vector<std::vector<size_t>>& coalesced_block_indices,
939
+ Status* out_status) {
940
+ std::vector<size_t> fallback_block_indices;
941
+ *out_status = Status::OK();
942
+
943
+ // Get file and IO options
944
+ auto* rep = job->table->get_rep();
945
+ IOOptions io_opts;
946
+ Status s =
947
+ rep->file->PrepareIOOptions(job->job_options.read_options, io_opts);
948
+ if (!s.ok()) {
949
+ *out_status = s;
950
+ return fallback_block_indices;
951
+ }
952
+
953
+ const bool direct_io = rep->file->use_direct_io();
954
+
955
+ // Submit async read requests and store them in the ReadSet
956
+ for (size_t i = 0; i < read_reqs.size(); ++i) {
957
+ auto async_state = std::make_shared<AsyncIOState>();
958
+
959
+ async_state->offset = read_reqs[i].offset;
960
+ async_state->block_indices = coalesced_block_indices[i];
961
+ async_state->read_req = std::move(read_reqs[i]);
962
+
963
+ for (const auto idx : coalesced_block_indices[i]) {
964
+ async_state->blocks.emplace_back(job->block_handles[idx]);
965
+ }
966
+
967
+ if (direct_io) {
968
+ async_state->read_req.scratch = nullptr;
969
+ } else {
970
+ async_state->buf.reset(new char[async_state->read_req.len]);
971
+ async_state->read_req.scratch = async_state->buf.get();
972
+ }
973
+
974
+ // Callback for async read completion
975
+ // Store the result slice and status back into async_state so we can access
976
+ // them after Poll() completes.
977
+ auto cb = [](const FSReadRequest& req, void* cb_arg) {
978
+ auto* state = static_cast<AsyncIOState*>(cb_arg);
979
+ state->read_req.result = req.result;
980
+ state->read_req.status = req.status;
981
+ };
982
+
983
+ s = rep->file->ReadAsync(async_state->read_req, io_opts, cb,
984
+ async_state.get(), &async_state->io_handle,
985
+ &async_state->del_fn,
986
+ direct_io ? &async_state->aligned_buf : nullptr);
987
+
988
+ if (!s.ok()) {
989
+ // Actual error - surface to caller
990
+ *out_status = s;
991
+ return fallback_block_indices;
992
+ }
993
+
994
+ if (async_state->io_handle == nullptr) {
995
+ // Async IO not supported - add to fallback list for sync IO
996
+ for (const auto idx : coalesced_block_indices[i]) {
997
+ fallback_block_indices.push_back(idx);
998
+ }
999
+ continue;
1000
+ }
1001
+
1002
+ // Add async state to map for all blocks in this request
1003
+ for (const auto idx : async_state->block_indices) {
1004
+ read_set->async_io_map_[idx] = async_state;
1005
+ }
1006
+ }
1007
+
1008
+ return fallback_block_indices;
1009
+ }
1010
+
1011
+ Status IODispatcherImpl::Impl::ExecuteSyncIO(
1012
+ const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
1013
+ std::vector<FSReadRequest>& read_reqs,
1014
+ const std::vector<std::vector<size_t>>& coalesced_block_indices) {
1015
+ // Get file and IO options
1016
+ auto* rep = job->table->get_rep();
1017
+ IOOptions io_opts;
1018
+ if (Status s =
1019
+ rep->file->PrepareIOOptions(job->job_options.read_options, io_opts);
1020
+ !s.ok()) {
1021
+ return s;
1022
+ }
1023
+
1024
+ const bool direct_io = rep->file->use_direct_io();
1025
+
1026
+ // Setup scratch buffers for MultiRead
1027
+ std::unique_ptr<char[]> buf;
1028
+
1029
+ if (direct_io) {
1030
+ for (auto& read_req : read_reqs) {
1031
+ read_req.scratch = nullptr;
1032
+ }
1033
+ } else {
1034
+ // Allocate a single contiguous buffer for all requests
1035
+ size_t total_len = 0;
1036
+ for (const auto& req : read_reqs) {
1037
+ total_len += req.len;
1038
+ }
1039
+ buf.reset(new char[total_len]);
1040
+ size_t offset = 0;
1041
+ for (auto& read_req : read_reqs) {
1042
+ read_req.scratch = buf.get() + offset;
1043
+ offset += read_req.len;
1044
+ }
1045
+ }
1046
+
1047
+ // Execute MultiRead
1048
+ AlignedBuf aligned_buf;
1049
+ if (Status s =
1050
+ rep->file->MultiRead(io_opts, read_reqs.data(), read_reqs.size(),
1051
+ direct_io ? &aligned_buf : nullptr);
1052
+ !s.ok()) {
1053
+ return s;
1054
+ }
1055
+
1056
+ for (const auto& rq : read_reqs) {
1057
+ if (!rq.status.ok()) {
1058
+ return rq.status;
1059
+ }
1060
+ }
1061
+
1062
+ // Process all blocks from the MultiRead results
1063
+ for (size_t i = 0; i < coalesced_block_indices.size(); ++i) {
1064
+ const auto& read_req = read_reqs[i];
1065
+ for (const auto& block_idx : coalesced_block_indices[i]) {
1066
+ const auto& block_handle = job->block_handles[block_idx];
1067
+
1068
+ Status create_status = CreateAndPinBlockFromBuffer(
1069
+ job, block_handle, read_req.offset, read_req.result,
1070
+ read_set->pinned_blocks_[block_idx]);
1071
+ if (!create_status.ok()) {
1072
+ return create_status;
1073
+ }
1074
+ }
1075
+ }
1076
+
1077
+ return Status::OK();
1078
+ }
1079
+
1080
+ IODispatcherImpl::IODispatcherImpl()
1081
+ : impl_(std::make_shared<Impl>(IODispatcherOptions())) {}
1082
+
1083
+ IODispatcherImpl::IODispatcherImpl(const IODispatcherOptions& options)
1084
+ : impl_(std::make_shared<Impl>(options)) {}
1085
+
1086
+ IODispatcherImpl::~IODispatcherImpl() = default;
1087
+
1088
+ Status IODispatcherImpl::SubmitJob(const std::shared_ptr<IOJob>& job,
1089
+ std::shared_ptr<ReadSet>* read_set) {
1090
+ return impl_->SubmitJob(job, read_set);
1091
+ }
1092
+
1093
+ IODispatcher* NewIODispatcher() { return new IODispatcherImpl(); }
1094
+
1095
+ IODispatcher* NewIODispatcher(const IODispatcherOptions& options) {
1096
+ return new IODispatcherImpl(options);
1097
+ }
1098
+
1099
+ } // namespace ROCKSDB_NAMESPACE