@nxtedition/rocksdb 15.4.0 → 15.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (402) hide show
  1. package/binding.cc +24 -19
  2. package/cache.js +1 -1
  3. package/chained-batch.js +12 -3
  4. package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
  5. package/deps/rocksdb/rocksdb/BUCK +42 -0
  6. package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
  7. package/deps/rocksdb/rocksdb/Makefile +59 -32
  8. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
  9. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
  10. package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
  11. package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
  12. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
  13. package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
  15. package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
  18. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
  19. package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  25. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
  26. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
  28. package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
  29. package/deps/rocksdb/rocksdb/db/builder.h +7 -0
  30. package/deps/rocksdb/rocksdb/db/c.cc +373 -57
  31. package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
  33. package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
  34. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
  52. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
  53. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  54. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
  55. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
  56. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
  57. package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
  58. package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
  59. package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
  60. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
  61. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
  62. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
  63. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
  64. package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
  65. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
  66. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
  67. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
  68. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
  79. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
  80. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
  81. package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
  82. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
  83. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
  84. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
  85. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
  86. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
  87. package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
  88. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
  89. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
  90. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
  91. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
  92. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
  93. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
  94. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
  95. package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
  96. package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
  97. package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
  98. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
  99. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
  100. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
  101. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
  102. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
  103. package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
  104. package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
  105. package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
  106. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
  107. package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
  108. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
  109. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
  110. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
  111. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
  112. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
  113. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
  114. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
  115. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
  116. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
  117. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  118. package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
  119. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
  120. package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
  121. package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
  122. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
  123. package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
  124. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
  125. package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
  126. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
  127. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
  128. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
  129. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
  130. package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
  131. package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
  132. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
  133. package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
  134. package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
  135. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
  136. package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
  137. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  138. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
  139. package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
  140. package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
  141. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
  142. package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
  143. package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
  144. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
  145. package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
  146. package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
  147. package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
  148. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
  149. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
  150. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
  151. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
  152. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
  153. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
  159. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
  160. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
  161. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
  162. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
  163. package/deps/rocksdb/rocksdb/env/env.cc +1 -0
  164. package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
  165. package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
  166. package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
  167. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  168. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  169. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
  170. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
  171. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
  172. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
  173. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  174. package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
  175. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
  176. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
  177. package/deps/rocksdb/rocksdb/folly.mk +22 -5
  178. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
  179. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
  180. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
  181. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
  182. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
  183. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
  184. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
  185. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
  186. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
  187. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
  188. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
  189. package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
  190. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
  191. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
  192. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
  193. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
  194. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
  195. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
  196. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
  197. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
  198. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
  199. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
  200. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
  202. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
  203. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
  204. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
  205. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
  206. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
  207. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
  208. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
  209. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
  210. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
  211. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
  212. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  213. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
  214. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
  215. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
  216. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
  217. package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
  218. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
  219. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
  220. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
  221. package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
  222. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
  223. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
  224. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
  225. package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
  226. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  227. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
  228. package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
  229. package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
  230. package/deps/rocksdb/rocksdb/options/options.cc +5 -1
  231. package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
  232. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
  233. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
  234. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
  235. package/deps/rocksdb/rocksdb/port/lang.h +4 -0
  236. package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
  237. package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
  238. package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
  239. package/deps/rocksdb/rocksdb/src.mk +12 -0
  240. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
  241. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  242. package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
  243. package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
  244. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
  245. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
  246. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
  247. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
  248. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
  249. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
  250. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
  251. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
  252. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
  253. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
  254. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
  255. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
  256. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
  257. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
  258. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
  259. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
  260. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
  261. package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
  262. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
  263. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
  264. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
  265. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
  266. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
  267. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
  268. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
  269. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
  270. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
  271. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  272. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
  273. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
  274. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
  275. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
  276. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
  277. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
  278. package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
  279. package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
  280. package/deps/rocksdb/rocksdb/table/format.cc +27 -15
  281. package/deps/rocksdb/rocksdb/table/format.h +41 -15
  282. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
  283. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
  284. package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
  285. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
  286. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  287. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
  288. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
  289. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
  290. package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
  291. package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
  292. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
  293. package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
  294. package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
  295. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
  296. package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
  297. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
  298. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
  299. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
  300. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
  301. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
  302. package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
  303. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
  304. package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
  305. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
  306. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
  307. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
  308. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
  309. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
  310. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
  311. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
  312. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
  313. package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
  314. package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
  315. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
  316. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
  317. package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
  318. package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
  319. package/deps/rocksdb/rocksdb/util/coding.h +14 -27
  320. package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
  321. package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
  322. package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
  323. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
  324. package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
  325. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
  326. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
  327. package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
  328. package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
  329. package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
  330. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
  331. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
  332. package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
  333. package/deps/rocksdb/rocksdb/util/math.h +3 -1
  334. package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
  335. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
  336. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
  337. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
  338. package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
  339. package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
  340. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
  341. package/deps/rocksdb/rocksdb/util/status.cc +3 -1
  342. package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
  343. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
  344. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
  345. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
  346. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
  347. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
  348. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
  349. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
  350. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
  351. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
  352. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
  353. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
  354. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
  355. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
  356. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
  357. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
  358. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
  359. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
  360. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  361. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
  362. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
  363. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
  364. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
  365. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
  366. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
  367. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
  368. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
  369. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
  370. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
  371. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
  372. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
  373. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
  374. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
  375. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
  376. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
  377. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
  378. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
  379. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
  380. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
  381. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
  382. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
  383. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
  384. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
  385. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
  386. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
  387. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
  388. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
  389. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
  390. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
  391. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  392. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
  393. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
  394. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
  395. package/deps/rocksdb/rocksdb.gyp +7 -0
  396. package/index.js +11 -2
  397. package/iterator.js +15 -7
  398. package/package.json +1 -1
  399. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  400. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  401. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
  402. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
@@ -0,0 +1,1919 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // This source code is licensed under both the GPLv2 (found in the
3
+ // COPYING file in the root directory) and Apache 2.0 License
4
+ // (found in the LICENSE.Apache file in the root directory).
5
+
6
+ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
7
+ // This source code is licensed under both the GPLv2 (found in the
8
+ // COPYING file in the root directory) and Apache 2.0 License
9
+ // (found in the LICENSE.Apache file in the root directory).
10
+
11
+ #include "rocksdb/io_dispatcher.h"
12
+
13
+ #include <memory>
14
+ #include <mutex>
15
+ #include <thread>
16
+
17
+ #include "db/db_test_util.h"
18
+ #include "db/dbformat.h"
19
+ #include "file/writable_file_writer.h"
20
+ #include "rocksdb/cache.h"
21
+ #include "rocksdb/env.h"
22
+ #include "rocksdb/options.h"
23
+ #include "rocksdb/table.h"
24
+ #include "table/block_based/block_based_table_builder.h"
25
+ #include "table/block_based/block_based_table_factory.h"
26
+ #include "table/block_based/block_based_table_reader.h"
27
+ #include "test_util/sync_point.h"
28
+
29
// Enable io_uring support for this test
extern "C" bool RocksDbIOUringEnable() { return true; }

// Compile-time flag: true only when the build defines
// ROCKSDB_IOURING_PRESENT. Tests use it to decide whether to request async IO.
static constexpr bool kIOUringPresent =
#ifdef ROCKSDB_IOURING_PRESENT
    true;
#else
    false;
#endif
38
+
39
+ namespace ROCKSDB_NAMESPACE {
40
+
41
// One read call observed by the tracking file system.
struct ReadOp {
  // Which read entry point produced this record.
  enum Type { kMultiRead, kReadAsync };

  Type type;

  // (offset, len) pairs of the call: every request of a MultiRead, or the
  // single request of a ReadAsync.
  std::vector<std::pair<uint64_t, size_t>> requests;
};
49
+
50
+ // Forward declaration
51
+ class ReadTrackingFS;
52
+
53
+ // Wrapper around FSRandomAccessFile that tracks read operations
54
+ class ReadTrackingRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
55
+ public:
56
+ ReadTrackingRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
57
+ ReadTrackingFS* fs)
58
+ : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
59
+
60
+ IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
61
+ const IOOptions& options, IODebugContext* dbg) override;
62
+
63
+ IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
64
+ std::function<void(FSReadRequest&, void*)> cb,
65
+ void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
66
+ IODebugContext* dbg) override;
67
+
68
+ private:
69
+ ReadTrackingFS* fs_;
70
+ };
71
+
72
+ // FileSystem wrapper that tracks all read operations for verification
73
+ class ReadTrackingFS : public FileSystemWrapper {
74
+ public:
75
+ explicit ReadTrackingFS(const std::shared_ptr<FileSystem>& target)
76
+ : FileSystemWrapper(target) {}
77
+
78
+ static const char* kClassName() { return "ReadTrackingFS"; }
79
+ const char* Name() const override { return kClassName(); }
80
+
81
+ IOStatus NewRandomAccessFile(const std::string& fname,
82
+ const FileOptions& opts,
83
+ std::unique_ptr<FSRandomAccessFile>* result,
84
+ IODebugContext* dbg) override {
85
+ std::unique_ptr<FSRandomAccessFile> file;
86
+ IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
87
+ if (s.ok()) {
88
+ result->reset(new ReadTrackingRandomAccessFile(std::move(file), this));
89
+ }
90
+ return s;
91
+ }
92
+
93
+ // Record a MultiRead operation
94
+ void RecordMultiRead(const std::vector<std::pair<uint64_t, size_t>>& reqs) {
95
+ std::lock_guard<std::mutex> lock(mutex_);
96
+ ReadOp op;
97
+ op.type = ReadOp::kMultiRead;
98
+ op.requests = reqs;
99
+ read_ops_.push_back(std::move(op));
100
+ }
101
+
102
+ // Record a ReadAsync operation
103
+ void RecordReadAsync(uint64_t offset, size_t len) {
104
+ std::lock_guard<std::mutex> lock(mutex_);
105
+ ReadOp op;
106
+ op.type = ReadOp::kReadAsync;
107
+ op.requests.push_back({offset, len});
108
+ read_ops_.push_back(std::move(op));
109
+ }
110
+
111
+ // Get all recorded read operations
112
+ std::vector<ReadOp> GetReadOps() const {
113
+ std::lock_guard<std::mutex> lock(mutex_);
114
+ return read_ops_;
115
+ }
116
+
117
+ // Clear recorded read operations
118
+ void ClearReadOps() {
119
+ std::lock_guard<std::mutex> lock(mutex_);
120
+ read_ops_.clear();
121
+ }
122
+
123
+ // Get count of MultiRead operations
124
+ size_t GetMultiReadCount() const {
125
+ std::lock_guard<std::mutex> lock(mutex_);
126
+ size_t count = 0;
127
+ for (const auto& op : read_ops_) {
128
+ if (op.type == ReadOp::kMultiRead) {
129
+ count++;
130
+ }
131
+ }
132
+ return count;
133
+ }
134
+
135
+ // Get count of ReadAsync operations
136
+ size_t GetReadAsyncCount() const {
137
+ std::lock_guard<std::mutex> lock(mutex_);
138
+ size_t count = 0;
139
+ for (const auto& op : read_ops_) {
140
+ if (op.type == ReadOp::kReadAsync) {
141
+ count++;
142
+ }
143
+ }
144
+ return count;
145
+ }
146
+
147
+ private:
148
+ mutable std::mutex mutex_;
149
+ std::vector<ReadOp> read_ops_;
150
+ };
151
+
152
+ IOStatus ReadTrackingRandomAccessFile::MultiRead(FSReadRequest* reqs,
153
+ size_t num_reqs,
154
+ const IOOptions& options,
155
+ IODebugContext* dbg) {
156
+ // Record the read operation before executing it
157
+ std::vector<std::pair<uint64_t, size_t>> recorded_reqs;
158
+ recorded_reqs.reserve(num_reqs);
159
+ for (size_t i = 0; i < num_reqs; i++) {
160
+ recorded_reqs.push_back({reqs[i].offset, reqs[i].len});
161
+ }
162
+ fs_->RecordMultiRead(recorded_reqs);
163
+
164
+ // Delegate to underlying file
165
+ return target()->MultiRead(reqs, num_reqs, options, dbg);
166
+ }
167
+
168
+ IOStatus ReadTrackingRandomAccessFile::ReadAsync(
169
+ FSReadRequest& req, const IOOptions& opts,
170
+ std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
171
+ void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
172
+ // Record the read operation before executing it
173
+ fs_->RecordReadAsync(req.offset, req.len);
174
+
175
+ // Delegate to underlying file
176
+ return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg);
177
+ }
178
+
179
+ class IODispatcherTest : public DBTestBase {
180
+ public:
181
+ IODispatcherTest()
182
+ : DBTestBase("io_dispatcher_test", /*env_do_fsync=*/false) {}
183
+
184
+ ~IODispatcherTest() override {
185
+ // Close any open tables
186
+ for (auto& table : tables_) {
187
+ table.reset();
188
+ }
189
+ tables_.clear();
190
+ }
191
+
192
+ // Helper to collect block handles from a table
193
+ // We use TEST_GetDataBlockHandle to get handles for specific keys
194
+ // Since we know the keys we inserted, we can collect their block handles
195
+ Status CollectBlockHandles(BlockBasedTable* table, size_t num_keys,
196
+ std::vector<BlockHandle>* block_handles_out) {
197
+ block_handles_out->clear();
198
+
199
+ ReadOptions read_options;
200
+ std::unordered_set<uint64_t> seen_offsets;
201
+
202
+ // Iterate through all keys and get their block handles
203
+ // We collect unique block handles (same block might contain multiple keys)
204
+ IndexBlockIter iiter_on_stack;
205
+ BlockCacheLookupContext context{TableReaderCaller::kUserVerifyChecksum};
206
+ auto iiter = table->NewIndexIterator(read_options, false, &iiter_on_stack,
207
+ nullptr, &context);
208
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
209
+ if (iiter != &iiter_on_stack) {
210
+ iiter_unique_ptr.reset(iiter);
211
+ }
212
+
213
+ // Position the iterator at the first entry
214
+ iiter->SeekToFirst();
215
+
216
+ while (iiter->Valid()) {
217
+ auto handle = iiter->value().handle;
218
+ if (seen_offsets.find(handle.offset()) == seen_offsets.end()) {
219
+ block_handles_out->push_back(handle);
220
+ seen_offsets.insert(handle.offset());
221
+ if (block_handles_out->size() >= num_keys) {
222
+ break;
223
+ }
224
+ }
225
+ iiter->Next();
226
+ }
227
+
228
+ return Status::OK();
229
+ }
230
+
231
+ std::string test_dir_{};
232
+ Env* env_{};
233
+ std::shared_ptr<FileSystem> base_fs_;
234
+ std::shared_ptr<ReadTrackingFS> tracking_fs_;
235
+
236
+ std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
237
+
238
+ void SetUp() override {
239
+ SetupSyncPointsToMockDirectIO();
240
+ test_dir_ = test::PerThreadDBPath("block_based_table_reader_test");
241
+ env_ = Env::Default();
242
+ base_fs_ = FileSystem::Default();
243
+ tracking_fs_ = std::make_shared<ReadTrackingFS>(base_fs_);
244
+ ASSERT_OK(base_fs_->CreateDir(test_dir_, IOOptions(), nullptr));
245
+ }
246
+
247
+ void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
248
+
249
+ void NewFileWriter(const std::string& filename,
250
+ std::unique_ptr<WritableFileWriter>* writer) {
251
+ std::string path = Path(filename);
252
+ EnvOptions env_options;
253
+ FileOptions foptions;
254
+ std::unique_ptr<FSWritableFile> file;
255
+ ASSERT_OK(base_fs_->NewWritableFile(path, foptions, &file, nullptr));
256
+ writer->reset(new WritableFileWriter(std::move(file), path, env_options));
257
+ }
258
+
259
+ void NewFileReader(const std::string& filename, const FileOptions& opt,
260
+ std::unique_ptr<RandomAccessFileReader>* reader,
261
+ Statistics* stats = nullptr) {
262
+ std::string path = Path(filename);
263
+ std::unique_ptr<FSRandomAccessFile> f;
264
+ // Use tracking_fs_ to record read operations
265
+ ASSERT_OK(tracking_fs_->NewRandomAccessFile(path, opt, &f, nullptr));
266
+ reader->reset(new RandomAccessFileReader(std::move(f), path,
267
+ env_->GetSystemClock().get(),
268
+ /*io_tracer=*/nullptr,
269
+ /*stats=*/stats));
270
+ }
271
+
272
+ std::vector<std::shared_ptr<Statistics>> all_stats_;
273
+ std::vector<std::unique_ptr<BlockBasedTable>> tables_;
274
+
275
+ // Options must be stored as member variables to avoid use-after-scope
276
+ // The BlockBasedTable keeps references to these options
277
+ std::vector<std::unique_ptr<ImmutableOptions>> all_ioptions_;
278
+ std::vector<std::unique_ptr<EnvOptions>> all_env_options_;
279
+
280
+ // Helper to create an SST file and open it as a table
281
+ // Following pattern from table_test.cc TableConstructor
282
+ Status CreateAndOpenSST(int num_blocks,
283
+ std::unique_ptr<BlockBasedTable>* table,
284
+ std::vector<BlockHandle>* block_handles_out) {
285
+ // Create options - store in member variables to avoid use-after-scope
286
+ // The BlockBasedTable will keep references to these options
287
+ Options options{};
288
+ options.statistics = nullptr;
289
+ BlockBasedTableOptions table_options;
290
+ table_options.block_cache = NewLRUCache(8 * 1024 * 1024);
291
+ table_options.block_size = 16 * 1024;
292
+ table_options.no_block_cache = false;
293
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
294
+
295
+ // Store these in member variables so they outlive the function
296
+ auto ioptions = std::make_unique<ImmutableOptions>(options);
297
+ auto moptions = MutableCFOptions{options};
298
+ InternalKeyComparator internal_comparator(options.comparator);
299
+
300
+ // Create in-memory file using StringSink (like table_test.cc)
301
+ auto table_name = "test_table";
302
+ std::unique_ptr<WritableFileWriter> file_writer;
303
+ NewFileWriter(table_name, &file_writer);
304
+
305
+ // Create table builder
306
+ std::string column_family_name;
307
+ const ReadOptions read_options;
308
+ const WriteOptions write_options;
309
+ std::vector<std::unique_ptr<InternalTblPropCollFactory>>
310
+ int_tbl_prop_coll_factories;
311
+ TableBuilderOptions builder_options(
312
+ *ioptions, moptions, read_options, write_options, internal_comparator,
313
+ &int_tbl_prop_coll_factories, kNoCompression, options.compression_opts,
314
+ 0 /* column_family_id */, column_family_name, -1 /* level */,
315
+ kUnknownNewestKeyTime);
316
+
317
+ std::unique_ptr<TableBuilder> builder(
318
+ options.table_factory->NewTableBuilder(builder_options,
319
+ file_writer.get()));
320
+
321
+ Status s;
322
+ auto rnd = Random::GetTLSInstance();
323
+ // Add keys to the table
324
+ // 10k * 1Kib = ~10MiB
325
+ for (int i = 0; i < 10000; i++) {
326
+ std::string value = rnd->RandomString(2 << 10);
327
+ InternalKey ikey(Key(i), i, kTypeValue);
328
+ builder->Add(ikey.Encode(), value);
329
+ }
330
+ s = builder->Finish();
331
+ if (!s.ok()) {
332
+ return s;
333
+ }
334
+
335
+ uint64_t file_size = builder->FileSize();
336
+
337
+ IOOptions io_options;
338
+ s = file_writer->Flush(io_options);
339
+ if (!s.ok()) {
340
+ return s;
341
+ }
342
+
343
+ // Now open the file for reading using StringSource (like table_test.cc)
344
+ std::unique_ptr<RandomAccessFileReader> file;
345
+ FileOptions foptions;
346
+ foptions.use_direct_reads = false;
347
+
348
+ NewFileReader(table_name, foptions, &file, nullptr);
349
+
350
+ // Store EnvOptions and InternalKeyComparator to avoid use-after-scope
351
+ auto soptions = std::make_unique<EnvOptions>();
352
+ BlockCacheTracer block_cache_tracer;
353
+ std::unique_ptr<TableReader> table_reader;
354
+
355
+ auto ikc = InternalKeyComparator(options.comparator);
356
+ TableReaderOptions reader_options(*ioptions, moptions.prefix_extractor,
357
+ moptions.compression_manager.get(),
358
+ *soptions, ikc,
359
+ 0 /* block_protection_bytes_per_key */);
360
+
361
+ s = options.table_factory->NewTableReader(reader_options, std::move(file),
362
+ file_size, &table_reader);
363
+
364
+ if (!s.ok()) {
365
+ return s;
366
+ }
367
+
368
+ table->reset(static_cast<BlockBasedTable*>(table_reader.release()));
369
+
370
+ // Collect actual block handles from the table's index
371
+ // This is similar to how block_based_table_iterator.cc CollectBlockHandles
372
+ // works
373
+ s = CollectBlockHandles(table->get(), num_blocks, block_handles_out);
374
+ if (!s.ok()) {
375
+ return s;
376
+ }
377
+
378
+ // Store all options in member variables to keep them alive
379
+ all_ioptions_.push_back(std::move(ioptions));
380
+ all_env_options_.push_back(std::move(soptions));
381
+
382
+ return Status::OK();
383
+ }
384
+
385
+ static uint64_t cur_file_num_;
386
+ };
387
+
388
+ uint64_t IODispatcherTest::cur_file_num_ = 1;
389
+
390
+ TEST_F(IODispatcherTest, BasicSSTRead) {
391
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
392
+
393
+ std::unique_ptr<BlockBasedTable> table;
394
+ std::vector<BlockHandle> block_handles;
395
+ Status s = CreateAndOpenSST(50, &table, &block_handles);
396
+ ASSERT_OK(s);
397
+ ASSERT_NE(table, nullptr);
398
+ ASSERT_GT(block_handles.size(), 0);
399
+
400
+ auto job = std::make_shared<IOJob>();
401
+ job->block_handles = block_handles;
402
+ job->table = table.get();
403
+ ReadOptions read_options;
404
+ // Only use async IO when io_uring is available
405
+ job->job_options.read_options.async_io = kIOUringPresent;
406
+
407
+ std::shared_ptr<ReadSet> read_set;
408
+ s = dispatcher->SubmitJob(job, &read_set);
409
+ ASSERT_OK(s);
410
+ ASSERT_NE(read_set, nullptr);
411
+
412
+ // Read blocks using the new ReadSet API and verify they are valid
413
+ // ReadIndex will poll for async IO completion internally, no need to sleep
414
+ for (size_t i = 0; i < block_handles.size(); ++i) {
415
+ CachableEntry<Block> block;
416
+ Status read_status = read_set->ReadIndex(i, &block);
417
+ ASSERT_OK(read_status);
418
+ ASSERT_NE(block.GetValue(), nullptr);
419
+
420
+ // Verify the block has reasonable content
421
+ const Block* block_ptr = block.GetValue();
422
+ ASSERT_GT(block_ptr->size(), 0);
423
+ }
424
+
425
+ // Verify statistics - some blocks should have been read asynchronously
426
+ // Note: actual counts depend on cache behavior and IO completion
427
+ uint64_t total_reads = read_set->GetNumSyncReads() +
428
+ read_set->GetNumAsyncReads() +
429
+ read_set->GetNumCacheHits();
430
+ ASSERT_EQ(total_reads, block_handles.size());
431
+ }
432
+
433
+ TEST_F(IODispatcherTest, MultipleSSTFiles) {
434
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
435
+
436
+ std::vector<std::shared_ptr<ReadSet>> read_sets;
437
+ std::vector<std::vector<BlockHandle>> all_block_handles;
438
+
439
+ // Create and submit jobs for multiple SST files
440
+ for (int i = 0; i < 3; i++) {
441
+ std::unique_ptr<BlockBasedTable> table;
442
+ std::vector<BlockHandle> block_handles;
443
+
444
+ Status s = CreateAndOpenSST(30 + i * 10, &table, &block_handles);
445
+ ASSERT_OK(s);
446
+
447
+ auto job = std::make_shared<IOJob>();
448
+ job->block_handles = block_handles;
449
+ job->table = table.get();
450
+ tables_.push_back(std::move(table));
451
+
452
+ all_block_handles.push_back(block_handles);
453
+ std::shared_ptr<ReadSet> read_set;
454
+ s = dispatcher->SubmitJob(job, &read_set);
455
+ ASSERT_OK(s);
456
+ read_sets.push_back(read_set);
457
+ }
458
+
459
+ // Verify all ReadSets can read their blocks successfully
460
+ // ReadIndex will poll for async IO completion internally, no need to sleep
461
+ for (size_t i = 0; i < read_sets.size(); ++i) {
462
+ for (size_t j = 0; j < all_block_handles[i].size(); ++j) {
463
+ CachableEntry<Block> block;
464
+ Status read_status = read_sets[i]->ReadIndex(j, &block);
465
+ ASSERT_OK(read_status);
466
+ ASSERT_NE(block.GetValue(), nullptr);
467
+ }
468
+ }
469
+ }
470
+
471
+ TEST_F(IODispatcherTest, StatisticsTracking) {
472
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
473
+
474
+ std::unique_ptr<BlockBasedTable> table;
475
+ std::vector<BlockHandle> block_handles;
476
+ Status s = CreateAndOpenSST(30, &table, &block_handles);
477
+ ASSERT_OK(s);
478
+ ASSERT_NE(table, nullptr);
479
+ ASSERT_GT(block_handles.size(), 0);
480
+
481
+ auto job = std::make_shared<IOJob>();
482
+ job->block_handles = block_handles;
483
+ job->table = table.get();
484
+ // Only use async IO when io_uring is available
485
+ job->job_options.read_options.async_io = kIOUringPresent;
486
+
487
+ std::shared_ptr<ReadSet> read_set;
488
+ s = dispatcher->SubmitJob(job, &read_set);
489
+ ASSERT_OK(s);
490
+ ASSERT_NE(read_set, nullptr);
491
+
492
+ // Read all blocks - ReadIndex handles polling for async IO completion
493
+ for (size_t i = 0; i < block_handles.size(); ++i) {
494
+ CachableEntry<Block> block;
495
+ Status read_status = read_set->ReadIndex(i, &block);
496
+ ASSERT_OK(read_status);
497
+ ASSERT_NE(block.GetValue(), nullptr);
498
+ }
499
+
500
+ // Read the same blocks again - should all be cache hits now
501
+ std::shared_ptr<ReadSet> read_set2;
502
+ s = dispatcher->SubmitJob(job, &read_set2);
503
+ ASSERT_OK(s);
504
+
505
+ for (size_t i = 0; i < block_handles.size(); ++i) {
506
+ CachableEntry<Block> block;
507
+ Status read_status = read_set2->ReadIndex(i, &block);
508
+ ASSERT_OK(read_status);
509
+ ASSERT_NE(block.GetValue(), nullptr);
510
+ }
511
+
512
+ // After reading all blocks, verify statistics
513
+ uint64_t num_sync = read_set->GetNumSyncReads();
514
+ uint64_t num_async = read_set->GetNumAsyncReads();
515
+ uint64_t num_cache = read_set->GetNumCacheHits();
516
+
517
+ // Total reads should equal number of blocks
518
+ uint64_t total_reads = num_sync + num_async + num_cache;
519
+ ASSERT_EQ(total_reads, block_handles.size());
520
+ }
521
+ TEST_F(IODispatcherTest, AsyncAndSyncRead) {
522
+ // This test verifies the difference between async_io=true and async_io=false
523
+ // by checking the statistics after reading all blocks.
524
+ // Only test async_io=true when io_uring is available.
525
+ std::vector<bool> async_modes = {false};
526
+ if (kIOUringPresent) {
527
+ async_modes.push_back(true);
528
+ }
529
+
530
+ for (auto async : async_modes) {
531
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
532
+
533
+ std::unique_ptr<BlockBasedTable> table;
534
+ std::vector<BlockHandle> block_handles;
535
+ Status s = CreateAndOpenSST(40, &table, &block_handles);
536
+ ASSERT_OK(s);
537
+ ASSERT_NE(table, nullptr);
538
+ ASSERT_GT(block_handles.size(), 0);
539
+
540
+ auto job = std::make_shared<IOJob>();
541
+ job->block_handles = block_handles;
542
+ job->table = table.get();
543
+ ReadOptions read_options;
544
+ // Ensure we don't use cache for this test - we want fresh reads
545
+ read_options.fill_cache = false;
546
+ job->job_options.read_options.async_io = async;
547
+
548
+ std::shared_ptr<ReadSet> read_set;
549
+ s = dispatcher->SubmitJob(job, &read_set);
550
+ ASSERT_OK(s);
551
+ ASSERT_NE(read_set, nullptr);
552
+
553
+ // Read all blocks - ReadIndex handles polling for async IO internally
554
+ for (size_t i = 0; i < block_handles.size(); ++i) {
555
+ CachableEntry<Block> block;
556
+ Status read_status = read_set->ReadIndex(i, &block);
557
+ ASSERT_OK(read_status);
558
+ ASSERT_NE(block.GetValue(), nullptr);
559
+
560
+ // Verify the block has reasonable content
561
+ const Block* block_ptr = block.GetValue();
562
+ ASSERT_GT(block_ptr->size(), 0);
563
+ }
564
+
565
+ // Verify statistics
566
+ uint64_t num_sync = read_set->GetNumSyncReads();
567
+ uint64_t num_async = read_set->GetNumAsyncReads();
568
+ uint64_t num_cache = read_set->GetNumCacheHits();
569
+
570
+ // Total reads should equal number of blocks
571
+ uint64_t total_reads = num_sync + num_async + num_cache;
572
+ EXPECT_EQ(total_reads, block_handles.size());
573
+
574
+ // When async_io is false, we always expect sync reads
575
+ if (!async) {
576
+ EXPECT_GT(num_sync, 0) << "Expected sync reads when async_io=false";
577
+ EXPECT_EQ(num_async, 0) << "Expected no async reads when async_io=false";
578
+ }
579
+ // When async_io is true:
580
+ // - If io_uring is available, we expect async reads
581
+ // - If io_uring is NOT available, ReadAsync returns NotSupported and
582
+ // we fall back to sync reads. This is valid behavior.
583
+ // So we only verify that ALL blocks were read (checked above).
584
+ }
585
+ }
586
+
587
+ TEST_F(IODispatcherTest, VerifyBlockContent) {
588
+ // Test that blocks retrieved through ReadSet contain the correct data
589
+ // that was written to the SST file
590
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
591
+
592
+ std::unique_ptr<BlockBasedTable> table;
593
+ std::vector<BlockHandle> block_handles;
594
+ Status s = CreateAndOpenSST(50, &table, &block_handles);
595
+ ASSERT_OK(s);
596
+ ASSERT_NE(table, nullptr);
597
+ ASSERT_GT(block_handles.size(), 0);
598
+
599
+ auto job = std::make_shared<IOJob>();
600
+ job->block_handles = block_handles;
601
+ job->table = table.get();
602
+ ReadOptions read_options;
603
+ job->job_options.read_options.async_io = false;
604
+
605
+ std::shared_ptr<ReadSet> read_set;
606
+ s = dispatcher->SubmitJob(job, &read_set);
607
+ ASSERT_OK(s);
608
+ ASSERT_NE(read_set, nullptr);
609
+
610
+ // Read each block and verify its content
611
+ int t = 0;
612
+ for (size_t i = 0; i < block_handles.size(); ++i) {
613
+ CachableEntry<Block> block_entry;
614
+ Status read_status = read_set->ReadIndex(i, &block_entry);
615
+ ASSERT_OK(read_status);
616
+ ASSERT_NE(block_entry.GetValue(), nullptr);
617
+
618
+ Block* block = block_entry.GetValue();
619
+ ASSERT_GT(block->size(), 0);
620
+
621
+ // Create an iterator to walk through the block's keys
622
+ // We use InternalKeyComparator for data blocks
623
+ InternalKeyComparator internal_comparator(BytewiseComparator());
624
+ std::unique_ptr<DataBlockIter> iter(block->NewDataIterator(
625
+ internal_comparator.user_comparator(), kDisableGlobalSequenceNumber));
626
+
627
+ // Iterate through all keys in this block
628
+ size_t num_keys_in_block = 0;
629
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
630
+ num_keys_in_block++;
631
+
632
+ // Verify key is not empty
633
+ ASSERT_GT(iter->key().size(), 0)
634
+ << "Block " << i << " contains empty key";
635
+
636
+ // Verify value is not empty (we wrote 1KB values)
637
+ ASSERT_GT(iter->value().size(), 2 ^ 10)
638
+ << "Block " << i << " contains empty value";
639
+
640
+ // Parse the internal key
641
+ ParsedInternalKey parsed_key;
642
+ Status parse_status =
643
+ ParseInternalKey(iter->key(), &parsed_key, true /* log_err */);
644
+ ASSERT_OK(parse_status) << "Failed to parse internal key in block " << i;
645
+
646
+ // Verify the key matches the expected format from CreateAndOpenSST
647
+ // Keys are created with Key(i) which generates keys like "key000000"
648
+ std::string user_key = parsed_key.user_key.ToString();
649
+ auto check = Key(t);
650
+ t++;
651
+ ASSERT_TRUE(user_key.find("key") == 0)
652
+ << "Unexpected key format in block " << i << ": " << user_key;
653
+
654
+ ASSERT_EQ(check.c_str(), user_key);
655
+
656
+ // Verify value type is correct (should be kTypeValue)
657
+ ASSERT_EQ(parsed_key.type, kTypeValue)
658
+ << "Unexpected value type in block " << i;
659
+ }
660
+
661
+ // Verify iterator status after iteration
662
+ ASSERT_OK(iter->status()) << "Iterator error in block " << i;
663
+
664
+ // Each block should contain at least one key
665
+ ASSERT_GT(num_keys_in_block, 0) << "Block " << i << " contains no keys";
666
+ }
667
+ }
668
+
669
+ // We want to test here that even when we DONT read from the readset that all
670
+ // pinned blocks will be unpinned.
671
+ TEST_F(IODispatcherTest, ReadSetDestroysUnpinsBlocks) {
672
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
673
+
674
+ std::unique_ptr<BlockBasedTable> table;
675
+ std::vector<BlockHandle> block_handles;
676
+ Status s = CreateAndOpenSST(30, &table, &block_handles);
677
+ ASSERT_OK(s);
678
+ ASSERT_NE(table, nullptr);
679
+ ASSERT_EQ(block_handles.size(), 30);
680
+
681
+ auto job = std::make_shared<IOJob>();
682
+ job->block_handles = block_handles;
683
+ job->table = table.get();
684
+ ReadOptions read_options;
685
+ job->job_options.read_options.async_io =
686
+ false; // Use sync IO so blocks are pinned immediately
687
+
688
+ auto* rep = table->get_rep();
689
+ auto cache = rep->table_options.block_cache.get();
690
+ ASSERT_NE(cache, nullptr);
691
+
692
+ auto initial_pinned_usage = cache->GetPinnedUsage();
693
+ ASSERT_EQ(initial_pinned_usage, 0);
694
+
695
+ {
696
+ std::shared_ptr<ReadSet> read_set;
697
+ Status t = dispatcher->SubmitJob(job, &read_set);
698
+ ASSERT_OK(t);
699
+ ASSERT_NE(read_set, nullptr);
700
+
701
+ // With sync IO, blocks are already pinned in read_set->pinned_blocks_
702
+ // We do NOT call read_set->Read() - blocks should remain in pinned_blocks_
703
+
704
+ // At this point, blocks should be pinned in the ReadSet
705
+ auto pinned_usage_with_blocks = cache->GetPinnedUsage();
706
+ ASSERT_GT(pinned_usage_with_blocks, initial_pinned_usage)
707
+ << "Expected pinned usage to increase after SubmitJob, but "
708
+ << "initial=" << initial_pinned_usage
709
+ << " current=" << pinned_usage_with_blocks;
710
+
711
+ // ReadSet goes out of scope here, its destructor should unpin all blocks
712
+ }
713
+
714
+ // ReadSet destroyed - all blocks should be unpinned
715
+ auto final_pinned_usage = cache->GetPinnedUsage();
716
+ ASSERT_EQ(final_pinned_usage, initial_pinned_usage)
717
+ << "Expected pinned usage to return to initial value after ReadSet "
718
+ << "destruction, but initial=" << initial_pinned_usage
719
+ << " final=" << final_pinned_usage;
720
+ }
721
+
722
+ // Test that verifies the coalescing logic: adjacent blocks within the
723
+ // coalesce threshold should be combined into a single read request.
724
+ TEST_F(IODispatcherTest, VerifyCoalescing) {
725
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
726
+
727
+ std::unique_ptr<BlockBasedTable> table;
728
+ std::vector<BlockHandle> block_handles;
729
+ // Get many blocks so we can test coalescing behavior
730
+ Status s = CreateAndOpenSST(50, &table, &block_handles);
731
+ ASSERT_OK(s);
732
+ ASSERT_NE(table, nullptr);
733
+ ASSERT_GE(block_handles.size(), 20);
734
+
735
+ tracking_fs_->ClearReadOps();
736
+
737
+ // Test coalescing with sync reads (uses MultiRead)
738
+ {
739
+ auto job = std::make_shared<IOJob>();
740
+ // Use a subset of adjacent blocks
741
+ std::vector<BlockHandle> adjacent_blocks;
742
+ for (size_t i = 0; i < 10 && i < block_handles.size(); ++i) {
743
+ adjacent_blocks.push_back(block_handles[i]);
744
+ }
745
+ job->block_handles = adjacent_blocks;
746
+ job->table = table.get();
747
+ job->job_options.read_options.async_io = false;
748
+ // Set a large coalesce threshold so all adjacent blocks are combined
749
+ job->job_options.io_coalesce_threshold = 1024 * 1024; // 1MB
750
+
751
+ std::shared_ptr<ReadSet> read_set;
752
+ s = dispatcher->SubmitJob(job, &read_set);
753
+ ASSERT_OK(s);
754
+
755
+ for (size_t i = 0; i < adjacent_blocks.size(); ++i) {
756
+ CachableEntry<Block> block;
757
+ Status read_status = read_set->ReadIndex(i, &block);
758
+ ASSERT_OK(read_status);
759
+ ASSERT_NE(block.GetValue(), nullptr);
760
+ }
761
+
762
+ // With a large coalesce threshold and adjacent blocks, we expect
763
+ // all blocks to be coalesced into a single MultiRead request
764
+ auto read_ops = tracking_fs_->GetReadOps();
765
+ size_t multiread_count = 0;
766
+ size_t total_requests_in_multireads = 0;
767
+ for (const auto& op : read_ops) {
768
+ if (op.type == ReadOp::kMultiRead) {
769
+ multiread_count++;
770
+ total_requests_in_multireads += op.requests.size();
771
+ }
772
+ }
773
+
774
+ // Adjacent blocks should be coalesced into a single read request
775
+ // (assuming they're within the coalesce threshold)
776
+ EXPECT_EQ(multiread_count, 1)
777
+ << "Expected 1 MultiRead call with coalesced blocks";
778
+ EXPECT_EQ(total_requests_in_multireads, 1)
779
+ << "Expected all adjacent blocks to be coalesced into 1 request";
780
+ }
781
+
782
+ tracking_fs_->ClearReadOps();
783
+
784
+ // Test with zero coalesce threshold and non-adjacent blocks
785
+ // Non-adjacent blocks (with gaps) should NOT be coalesced with threshold=0
786
+ {
787
+ // Create new table to avoid cache hits
788
+ std::unique_ptr<BlockBasedTable> table2;
789
+ std::vector<BlockHandle> block_handles2;
790
+ s = CreateAndOpenSST(50, &table2, &block_handles2);
791
+ ASSERT_OK(s);
792
+ ASSERT_GE(block_handles2.size(), 20);
793
+
794
+ tracking_fs_->ClearReadOps();
795
+
796
+ auto job = std::make_shared<IOJob>();
797
+ // Skip every other block to create gaps between requested blocks
798
+ // This ensures there are gaps that won't be bridged with threshold=0
799
+ std::vector<BlockHandle> non_adjacent_blocks;
800
+ for (size_t i = 0;
801
+ i < block_handles2.size() && non_adjacent_blocks.size() < 5; i += 2) {
802
+ non_adjacent_blocks.push_back(block_handles2[i]);
803
+ }
804
+ job->block_handles = non_adjacent_blocks;
805
+ job->table = table2.get();
806
+ job->job_options.read_options.async_io = false;
807
+ // Set zero coalesce threshold - blocks with gaps should not be coalesced
808
+ job->job_options.io_coalesce_threshold = 0;
809
+
810
+ std::shared_ptr<ReadSet> read_set;
811
+ s = dispatcher->SubmitJob(job, &read_set);
812
+ ASSERT_OK(s);
813
+
814
+ for (size_t i = 0; i < non_adjacent_blocks.size(); ++i) {
815
+ CachableEntry<Block> block;
816
+ Status read_status = read_set->ReadIndex(i, &block);
817
+ ASSERT_OK(read_status);
818
+ ASSERT_NE(block.GetValue(), nullptr);
819
+ }
820
+
821
+ // With zero coalesce threshold and non-adjacent blocks (with gaps),
822
+ // each block should be a separate request
823
+ auto read_ops = tracking_fs_->GetReadOps();
824
+ size_t total_requests_in_multireads = 0;
825
+ for (const auto& op : read_ops) {
826
+ if (op.type == ReadOp::kMultiRead) {
827
+ total_requests_in_multireads += op.requests.size();
828
+ }
829
+ }
830
+
831
+ // Each non-adjacent block should be a separate request since there are
832
+ // gaps between them and threshold=0 means no gap tolerance
833
+ EXPECT_EQ(total_requests_in_multireads, non_adjacent_blocks.size())
834
+ << "Expected each non-adjacent block to be a separate request with "
835
+ "zero coalesce threshold";
836
+ }
837
+ }
838
+
839
+ // Test that verifies the read request offsets and lengths match the
840
+ // expected block handles.
841
+ TEST_F(IODispatcherTest, VerifyReadRequestDetails) {
842
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
843
+
844
+ std::unique_ptr<BlockBasedTable> table;
845
+ std::vector<BlockHandle> block_handles;
846
+ Status s = CreateAndOpenSST(10, &table, &block_handles);
847
+ ASSERT_OK(s);
848
+ ASSERT_NE(table, nullptr);
849
+ ASSERT_GE(block_handles.size(), 5);
850
+
851
+ tracking_fs_->ClearReadOps();
852
+
853
+ // Use just a few non-adjacent blocks to avoid coalescing
854
+ std::vector<BlockHandle> test_blocks;
855
+ // Pick every other block to ensure they're not adjacent
856
+ for (size_t i = 0; i < block_handles.size(); i += 2) {
857
+ test_blocks.push_back(block_handles[i]);
858
+ }
859
+
860
+ auto job = std::make_shared<IOJob>();
861
+ job->block_handles = test_blocks;
862
+ job->table = table.get();
863
+ job->job_options.read_options.async_io = false;
864
+ // Small coalesce threshold to minimize coalescing for this test
865
+ job->job_options.io_coalesce_threshold = 0;
866
+
867
+ std::shared_ptr<ReadSet> read_set;
868
+ s = dispatcher->SubmitJob(job, &read_set);
869
+ ASSERT_OK(s);
870
+
871
+ for (size_t i = 0; i < test_blocks.size(); ++i) {
872
+ CachableEntry<Block> block;
873
+ Status read_status = read_set->ReadIndex(i, &block);
874
+ ASSERT_OK(read_status);
875
+ }
876
+
877
+ // Verify the read requests match the block handles
878
+ auto read_ops = tracking_fs_->GetReadOps();
879
+ std::unordered_set<uint64_t> expected_offsets;
880
+ for (const auto& handle : test_blocks) {
881
+ expected_offsets.insert(handle.offset());
882
+ }
883
+
884
+ std::unordered_set<uint64_t> actual_offsets;
885
+ for (const auto& op : read_ops) {
886
+ if (op.type == ReadOp::kMultiRead) {
887
+ for (const auto& req : op.requests) {
888
+ actual_offsets.insert(req.first);
889
+ }
890
+ }
891
+ }
892
+
893
+ // Verify all expected offsets were read
894
+ for (const auto& expected : expected_offsets) {
895
+ EXPECT_TRUE(actual_offsets.count(expected) > 0)
896
+ << "Expected read at offset " << expected << " but it was not found";
897
+ }
898
+ }
899
+
900
+ // Test that memory limiting blocks when the limit is exceeded
901
+ TEST_F(IODispatcherTest, MemoryLimitBlocksWhenExceeded) {
902
+ // Create dispatcher with a small memory limit (1MB)
903
+ IODispatcherOptions opts;
904
+ opts.max_prefetch_memory_bytes = 1 * 1024 * 1024; // 1MB
905
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
906
+
907
+ std::unique_ptr<BlockBasedTable> table;
908
+ std::vector<BlockHandle> block_handles;
909
+ Status s = CreateAndOpenSST(50, &table, &block_handles);
910
+ ASSERT_OK(s);
911
+ ASSERT_GT(block_handles.size(), 0);
912
+
913
+ // Submit a job - should succeed immediately (non-blocking)
914
+ auto job = std::make_shared<IOJob>();
915
+ job->block_handles = block_handles;
916
+ job->table = table.get();
917
+ job->job_options.read_options.async_io = false;
918
+
919
+ std::shared_ptr<ReadSet> read_set;
920
+ s = dispatcher->SubmitJob(job, &read_set);
921
+ ASSERT_OK(s);
922
+ ASSERT_NE(read_set, nullptr);
923
+
924
+ // Read all blocks - they may be read synchronously if prefetch was deferred
925
+ for (size_t i = 0; i < block_handles.size(); ++i) {
926
+ CachableEntry<Block> block;
927
+ Status read_status = read_set->ReadIndex(i, &block);
928
+ ASSERT_OK(read_status);
929
+ ASSERT_NE(block.GetValue(), nullptr);
930
+ }
931
+ }
932
+
933
+ // Test that SubmitJob never blocks even when memory is exhausted
934
+ TEST_F(IODispatcherTest, SubmitJobNeverBlocks) {
935
+ // Create dispatcher with a tiny memory limit
936
+ IODispatcherOptions opts;
937
+ opts.max_prefetch_memory_bytes = 1024; // 1KB - very small
938
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
939
+
940
+ std::unique_ptr<BlockBasedTable> table;
941
+ std::vector<BlockHandle> block_handles;
942
+ Status s = CreateAndOpenSST(50, &table, &block_handles);
943
+ ASSERT_OK(s);
944
+ ASSERT_GT(block_handles.size(), 0);
945
+
946
+ // Submit first job - uses up all memory
947
+ auto job1 = std::make_shared<IOJob>();
948
+ job1->block_handles = block_handles;
949
+ job1->table = table.get();
950
+ job1->job_options.read_options.async_io = false;
951
+
952
+ std::shared_ptr<ReadSet> read_set1;
953
+ s = dispatcher->SubmitJob(job1, &read_set1);
954
+ ASSERT_OK(s); // Should succeed immediately
955
+
956
+ // Submit second job - should also succeed immediately (not block)
957
+ std::unique_ptr<BlockBasedTable> table2;
958
+ std::vector<BlockHandle> block_handles2;
959
+ s = CreateAndOpenSST(30, &table2, &block_handles2);
960
+ ASSERT_OK(s);
961
+
962
+ auto job2 = std::make_shared<IOJob>();
963
+ job2->block_handles = block_handles2;
964
+ job2->table = table2.get();
965
+ job2->job_options.read_options.async_io = false;
966
+
967
+ std::shared_ptr<ReadSet> read_set2;
968
+ s = dispatcher->SubmitJob(job2, &read_set2);
969
+ ASSERT_OK(s); // Should succeed immediately - prefetch is just deferred
970
+
971
+ // Reads work - blocks are fetched synchronously on demand
972
+ for (size_t i = 0; i < block_handles2.size(); ++i) {
973
+ CachableEntry<Block> block;
974
+ Status read_status = read_set2->ReadIndex(i, &block);
975
+ ASSERT_OK(read_status);
976
+ ASSERT_NE(block.GetValue(), nullptr);
977
+ }
978
+ }
979
+
980
+ // Test that releasing blocks triggers pending prefetches
981
+ TEST_F(IODispatcherTest, BlockReleaseTriggersWaitingJob) {
982
+ // Create dispatcher with a small memory limit
983
+ IODispatcherOptions opts;
984
+ opts.max_prefetch_memory_bytes = 100 * 1024; // 100KB
985
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
986
+
987
+ std::unique_ptr<BlockBasedTable> table;
988
+ std::vector<BlockHandle> block_handles;
989
+ Status s = CreateAndOpenSST(30, &table, &block_handles);
990
+ ASSERT_OK(s);
991
+ ASSERT_GT(block_handles.size(), 0);
992
+
993
+ // Submit first job
994
+ auto job1 = std::make_shared<IOJob>();
995
+ job1->block_handles = block_handles;
996
+ job1->table = table.get();
997
+ job1->job_options.read_options.async_io = false;
998
+
999
+ std::shared_ptr<ReadSet> read_set1;
1000
+ s = dispatcher->SubmitJob(job1, &read_set1);
1001
+ ASSERT_OK(s);
1002
+ ASSERT_NE(read_set1, nullptr);
1003
+
1004
+ // Read all blocks from first job
1005
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1006
+ CachableEntry<Block> block;
1007
+ Status read_status = read_set1->ReadIndex(i, &block);
1008
+ ASSERT_OK(read_status);
1009
+ }
1010
+
1011
+ // Submit second job - prefetch will be deferred due to memory limit
1012
+ std::unique_ptr<BlockBasedTable> table2;
1013
+ std::vector<BlockHandle> block_handles2;
1014
+ s = CreateAndOpenSST(20, &table2, &block_handles2);
1015
+ ASSERT_OK(s);
1016
+
1017
+ auto job2 = std::make_shared<IOJob>();
1018
+ job2->block_handles = block_handles2;
1019
+ job2->table = table2.get();
1020
+ job2->job_options.read_options.async_io = false;
1021
+
1022
+ std::shared_ptr<ReadSet> read_set2;
1023
+ s = dispatcher->SubmitJob(job2, &read_set2);
1024
+ ASSERT_OK(s); // Should succeed immediately
1025
+ ASSERT_NE(read_set2, nullptr);
1026
+
1027
+ // Release blocks from first job - this should trigger pending prefetches
1028
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1029
+ read_set1->ReleaseBlock(i);
1030
+ }
1031
+
1032
+ // Read all blocks from second job - should work
1033
+ for (size_t i = 0; i < block_handles2.size(); ++i) {
1034
+ CachableEntry<Block> block;
1035
+ Status read_status = read_set2->ReadIndex(i, &block);
1036
+ ASSERT_OK(read_status);
1037
+ ASSERT_NE(block.GetValue(), nullptr);
1038
+ }
1039
+ }
1040
+
1041
+ // Test that multiple ReadSets share the memory budget
1042
+ TEST_F(IODispatcherTest, MultipleReadSetsShareMemoryBudget) {
1043
+ IODispatcherOptions opts;
1044
+ opts.max_prefetch_memory_bytes = 10 * 1024 * 1024; // 10MB
1045
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1046
+
1047
+ std::vector<std::shared_ptr<ReadSet>> read_sets;
1048
+ std::vector<std::vector<BlockHandle>> all_block_handles;
1049
+
1050
+ // Create and submit multiple jobs
1051
+ for (int i = 0; i < 3; i++) {
1052
+ std::unique_ptr<BlockBasedTable> table;
1053
+ std::vector<BlockHandle> block_handles;
1054
+
1055
+ Status s = CreateAndOpenSST(20 + i * 5, &table, &block_handles);
1056
+ ASSERT_OK(s);
1057
+
1058
+ auto job = std::make_shared<IOJob>();
1059
+ job->block_handles = block_handles;
1060
+ job->table = table.get();
1061
+ job->job_options.read_options.async_io = false;
1062
+ tables_.push_back(std::move(table));
1063
+
1064
+ all_block_handles.push_back(block_handles);
1065
+ std::shared_ptr<ReadSet> read_set;
1066
+ s = dispatcher->SubmitJob(job, &read_set);
1067
+ ASSERT_OK(s);
1068
+ read_sets.push_back(read_set);
1069
+ }
1070
+
1071
+ // Verify all ReadSets can read their blocks
1072
+ for (size_t i = 0; i < read_sets.size(); ++i) {
1073
+ for (size_t j = 0; j < all_block_handles[i].size(); ++j) {
1074
+ CachableEntry<Block> block;
1075
+ Status read_status = read_sets[i]->ReadIndex(j, &block);
1076
+ ASSERT_OK(read_status);
1077
+ ASSERT_NE(block.GetValue(), nullptr);
1078
+ }
1079
+ }
1080
+
1081
+ // Release all blocks from first ReadSet
1082
+ for (size_t i = 0; i < all_block_handles[0].size(); ++i) {
1083
+ read_sets[0]->ReleaseBlock(i);
1084
+ }
1085
+
1086
+ // Create another job - should work because first ReadSet released memory
1087
+ std::unique_ptr<BlockBasedTable> table_new;
1088
+ std::vector<BlockHandle> block_handles_new;
1089
+ Status s = CreateAndOpenSST(25, &table_new, &block_handles_new);
1090
+ ASSERT_OK(s);
1091
+
1092
+ auto job_new = std::make_shared<IOJob>();
1093
+ job_new->block_handles = block_handles_new;
1094
+ job_new->table = table_new.get();
1095
+ job_new->job_options.read_options.async_io = false;
1096
+
1097
+ std::shared_ptr<ReadSet> read_set_new;
1098
+ s = dispatcher->SubmitJob(job_new, &read_set_new);
1099
+ ASSERT_OK(s);
1100
+ ASSERT_NE(read_set_new, nullptr);
1101
+
1102
+ for (size_t i = 0; i < block_handles_new.size(); ++i) {
1103
+ CachableEntry<Block> block;
1104
+ Status read_status = read_set_new->ReadIndex(i, &block);
1105
+ ASSERT_OK(read_status);
1106
+ ASSERT_NE(block.GetValue(), nullptr);
1107
+ }
1108
+ }
1109
+
1110
+ // Test that no memory limiting is applied when max_prefetch_memory_bytes is 0
1111
+ TEST_F(IODispatcherTest, NoMemoryLimitWhenZero) {
1112
+ IODispatcherOptions opts;
1113
+ opts.max_prefetch_memory_bytes = 0; // No limit
1114
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1115
+
1116
+ std::unique_ptr<BlockBasedTable> table;
1117
+ std::vector<BlockHandle> block_handles;
1118
+ Status s = CreateAndOpenSST(50, &table, &block_handles);
1119
+ ASSERT_OK(s);
1120
+
1121
+ auto job = std::make_shared<IOJob>();
1122
+ job->block_handles = block_handles;
1123
+ job->table = table.get();
1124
+ job->job_options.read_options.async_io = false;
1125
+
1126
+ std::shared_ptr<ReadSet> read_set;
1127
+ s = dispatcher->SubmitJob(job, &read_set);
1128
+ ASSERT_OK(s);
1129
+ ASSERT_NE(read_set, nullptr);
1130
+
1131
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1132
+ CachableEntry<Block> block;
1133
+ Status read_status = read_set->ReadIndex(i, &block);
1134
+ ASSERT_OK(read_status);
1135
+ ASSERT_NE(block.GetValue(), nullptr);
1136
+ }
1137
+ }
1138
+
1139
+ // Test memory release on ReadSet destruction triggers pending prefetches
1140
+ TEST_F(IODispatcherTest, MemoryReleasedOnReadSetDestruction) {
1141
+ IODispatcherOptions opts;
1142
+ opts.max_prefetch_memory_bytes = 100 * 1024; // 100KB
1143
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1144
+
1145
+ // Create table outside the scope so it outlives the ReadSet
1146
+ std::unique_ptr<BlockBasedTable> table;
1147
+ std::vector<BlockHandle> block_handles;
1148
+ Status s = CreateAndOpenSST(30, &table, &block_handles);
1149
+ ASSERT_OK(s);
1150
+
1151
+ // Second table - created now so it's available after first ReadSet is
1152
+ // destroyed
1153
+ std::unique_ptr<BlockBasedTable> table2;
1154
+ std::vector<BlockHandle> block_handles2;
1155
+ s = CreateAndOpenSST(30, &table2, &block_handles2);
1156
+ ASSERT_OK(s);
1157
+
1158
+ std::shared_ptr<ReadSet> read_set2;
1159
+
1160
+ {
1161
+ auto job = std::make_shared<IOJob>();
1162
+ job->block_handles = block_handles;
1163
+ job->table = table.get();
1164
+ job->job_options.read_options.async_io = false;
1165
+
1166
+ std::shared_ptr<ReadSet> read_set;
1167
+ s = dispatcher->SubmitJob(job, &read_set);
1168
+ ASSERT_OK(s);
1169
+ ASSERT_NE(read_set, nullptr);
1170
+
1171
+ // Submit second job while first is still alive - prefetch will be deferred
1172
+ auto job2 = std::make_shared<IOJob>();
1173
+ job2->block_handles = block_handles2;
1174
+ job2->table = table2.get();
1175
+ job2->job_options.read_options.async_io = false;
1176
+
1177
+ s = dispatcher->SubmitJob(job2, &read_set2);
1178
+ ASSERT_OK(s); // Should succeed immediately
1179
+ ASSERT_NE(read_set2, nullptr);
1180
+
1181
+ // First ReadSet goes out of scope here and should release all memory,
1182
+ // which triggers pending prefetches for second ReadSet
1183
+ }
1184
+
1185
+ // Read all blocks from second job - should work because first ReadSet
1186
+ // released its memory on destruction
1187
+ for (size_t i = 0; i < block_handles2.size(); ++i) {
1188
+ CachableEntry<Block> block;
1189
+ Status read_status = read_set2->ReadIndex(i, &block);
1190
+ ASSERT_OK(read_status);
1191
+ ASSERT_NE(block.GetValue(), nullptr);
1192
+ }
1193
+ }
1194
+
1195
+ // Test that partial prefetch dispatches as many blocks as memory allows
1196
+ // and queues the rest for later dispatch
1197
+ TEST_F(IODispatcherTest, PartialPrefetchDispatchesWhatFits) {
1198
+ // Skip this test if io_uring is not available since partial prefetch
1199
+ // only applies to async IO
1200
+ if (!kIOUringPresent) {
1201
+ return; // io_uring not available, skip async IO test
1202
+ }
1203
+
1204
+ // Create dispatcher with memory limit that allows only some blocks
1205
+ // Each block is ~16KB, so 50KB allows roughly 3 blocks
1206
+ IODispatcherOptions opts;
1207
+ opts.max_prefetch_memory_bytes = 50 * 1024; // 50KB
1208
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1209
+
1210
+ std::unique_ptr<BlockBasedTable> table;
1211
+ std::vector<BlockHandle> block_handles;
1212
+ // Create 10 blocks - only ~3 should fit in memory
1213
+ Status s = CreateAndOpenSST(10, &table, &block_handles);
1214
+ ASSERT_OK(s);
1215
+ ASSERT_GE(block_handles.size(), 5);
1216
+
1217
+ // Use sync point to count blocks dispatched during SubmitJob
1218
+ size_t blocks_dispatched_on_submit = 0;
1219
+ SyncPoint::GetInstance()->SetCallBack(
1220
+ "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
1221
+ auto* indices = static_cast<std::vector<size_t>*>(arg);
1222
+ blocks_dispatched_on_submit += indices->size();
1223
+ });
1224
+ SyncPoint::GetInstance()->EnableProcessing();
1225
+
1226
+ auto job = std::make_shared<IOJob>();
1227
+ job->block_handles = block_handles;
1228
+ job->table = table.get();
1229
+ job->job_options.read_options.async_io = true; // Use async IO
1230
+
1231
+ std::shared_ptr<ReadSet> read_set;
1232
+ s = dispatcher->SubmitJob(job, &read_set);
1233
+ ASSERT_OK(s);
1234
+ ASSERT_NE(read_set, nullptr);
1235
+
1236
+ // With partial prefetch, we expect SOME blocks to have been dispatched
1237
+ // (the ones that fit in memory), but not ALL blocks
1238
+ // This is the key assertion: partial prefetch means > 0 blocks dispatched
1239
+ // even though total memory needed exceeds the limit
1240
+ EXPECT_GT(blocks_dispatched_on_submit, 0)
1241
+ << "Expected some blocks to be dispatched with partial prefetch";
1242
+ EXPECT_LT(blocks_dispatched_on_submit, block_handles.size())
1243
+ << "Expected not all blocks to be dispatched (memory limit should apply)";
1244
+
1245
+ SyncPoint::GetInstance()->DisableProcessing();
1246
+ SyncPoint::GetInstance()->ClearAllCallBacks();
1247
+
1248
+ // Now read all blocks - remaining blocks will be fetched on demand
1249
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1250
+ CachableEntry<Block> block;
1251
+ Status read_status = read_set->ReadIndex(i, &block);
1252
+ ASSERT_OK(read_status);
1253
+ ASSERT_NE(block.GetValue(), nullptr);
1254
+ }
1255
+
1256
+ // Verify all blocks were ultimately read
1257
+ uint64_t total_reads = read_set->GetNumSyncReads() +
1258
+ read_set->GetNumAsyncReads() +
1259
+ read_set->GetNumCacheHits();
1260
+ EXPECT_EQ(total_reads, block_handles.size());
1261
+ }
1262
+
1263
+ // Test that earlier block indices are prioritized in partial prefetch
1264
+ TEST_F(IODispatcherTest, PartialPrefetchPrioritizesEarlierIndices) {
1265
+ // Skip this test if io_uring is not available
1266
+ if (!kIOUringPresent) {
1267
+ return; // io_uring not available, skip async IO test
1268
+ }
1269
+
1270
+ // Create dispatcher with memory limit that allows only 1-2 blocks
1271
+ IODispatcherOptions opts;
1272
+ opts.max_prefetch_memory_bytes = 20 * 1024; // 20KB - room for ~1 block
1273
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1274
+
1275
+ std::unique_ptr<BlockBasedTable> table;
1276
+ std::vector<BlockHandle> block_handles;
1277
+ Status s = CreateAndOpenSST(10, &table, &block_handles);
1278
+ ASSERT_OK(s);
1279
+ ASSERT_GE(block_handles.size(), 5);
1280
+
1281
+ tracking_fs_->ClearReadOps();
1282
+
1283
+ auto job = std::make_shared<IOJob>();
1284
+ job->block_handles = block_handles;
1285
+ job->table = table.get();
1286
+ job->job_options.read_options.async_io = true;
1287
+
1288
+ std::shared_ptr<ReadSet> read_set;
1289
+ s = dispatcher->SubmitJob(job, &read_set);
1290
+ ASSERT_OK(s);
1291
+
1292
+ // Get the async reads that were dispatched
1293
+ auto read_ops = tracking_fs_->GetReadOps();
1294
+
1295
+ // Find the offset of the first async read
1296
+ uint64_t first_async_offset = UINT64_MAX;
1297
+ for (const auto& op : read_ops) {
1298
+ if (op.type == ReadOp::kReadAsync && !op.requests.empty()) {
1299
+ first_async_offset = std::min(first_async_offset, op.requests[0].first);
1300
+ }
1301
+ }
1302
+
1303
+ // The first async read should be for the first block (lowest offset)
1304
+ // This verifies that earlier indices are prioritized
1305
+ if (first_async_offset != UINT64_MAX) {
1306
+ EXPECT_EQ(first_async_offset, block_handles[0].offset())
1307
+ << "Expected first async read to be for the first block (earliest "
1308
+ "index)";
1309
+ }
1310
+
1311
+ // Read all blocks to complete the test
1312
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1313
+ CachableEntry<Block> block;
1314
+ Status read_status = read_set->ReadIndex(i, &block);
1315
+ ASSERT_OK(read_status);
1316
+ ASSERT_NE(block.GetValue(), nullptr);
1317
+ }
1318
+ }
1319
+
1320
+ // Test that blocks larger than the memory budget are excluded from prefetch
1321
+ // and fall back to synchronous read
1322
+ TEST_F(IODispatcherTest, OversizedBlocksFallbackToSyncRead) {
1323
+ // Skip this test if io_uring is not available since we need async IO
1324
+ if (!kIOUringPresent) {
1325
+ return;
1326
+ }
1327
+
1328
+ std::unique_ptr<BlockBasedTable> table;
1329
+ std::vector<BlockHandle> block_handles;
1330
+ Status s = CreateAndOpenSST(10, &table, &block_handles);
1331
+ ASSERT_OK(s);
1332
+ ASSERT_GE(block_handles.size(), 3);
1333
+
1334
+ // Calculate the size of a single block
1335
+ size_t single_block_size =
1336
+ BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
1337
+
1338
+ // Create dispatcher with memory limit smaller than a single block
1339
+ // This means ALL blocks are "oversized" and should fall back to sync read
1340
+ IODispatcherOptions opts;
1341
+ opts.max_prefetch_memory_bytes = single_block_size / 2; // Half a block
1342
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1343
+
1344
+ // Track dispatches - with oversized blocks, nothing should be dispatched
1345
+ size_t blocks_dispatched = 0;
1346
+ SyncPoint::GetInstance()->SetCallBack(
1347
+ "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
1348
+ auto* indices = static_cast<std::vector<size_t>*>(arg);
1349
+ blocks_dispatched += indices->size();
1350
+ });
1351
+ SyncPoint::GetInstance()->EnableProcessing();
1352
+
1353
+ auto job = std::make_shared<IOJob>();
1354
+ job->block_handles = block_handles;
1355
+ job->table = table.get();
1356
+ job->job_options.read_options.async_io = true;
1357
+
1358
+ std::shared_ptr<ReadSet> read_set;
1359
+ s = dispatcher->SubmitJob(job, &read_set);
1360
+ ASSERT_OK(s);
1361
+ ASSERT_NE(read_set, nullptr);
1362
+
1363
+ // No blocks should have been dispatched since they're all oversized
1364
+ EXPECT_EQ(blocks_dispatched, 0)
1365
+ << "Expected no blocks to be dispatched when all blocks are oversized";
1366
+
1367
+ SyncPoint::GetInstance()->DisableProcessing();
1368
+ SyncPoint::GetInstance()->ClearAllCallBacks();
1369
+
1370
+ // All blocks should still be readable via sync fallback
1371
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1372
+ CachableEntry<Block> block;
1373
+ Status read_status = read_set->ReadIndex(i, &block);
1374
+ ASSERT_OK(read_status);
1375
+ ASSERT_NE(block.GetValue(), nullptr);
1376
+ }
1377
+
1378
+ // All reads should be sync since blocks couldn't be prefetched
1379
+ EXPECT_GT(read_set->GetNumSyncReads(), 0)
1380
+ << "Expected sync reads for oversized blocks";
1381
+ }
1382
+
1383
+ // Test that reading blocks before prefetch dispatch correctly updates
1384
+ // memory accounting for coalesced groups
1385
+ TEST_F(IODispatcherTest, PartialReadsUpdateCoalescedGroups) {
1386
+ // Skip this test if io_uring is not available
1387
+ if (!kIOUringPresent) {
1388
+ return;
1389
+ }
1390
+
1391
+ // Create dispatcher with memory limit that allows only some blocks
1392
+ IODispatcherOptions opts;
1393
+ opts.max_prefetch_memory_bytes = 50 * 1024; // 50KB
1394
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1395
+
1396
+ std::unique_ptr<BlockBasedTable> table;
1397
+ std::vector<BlockHandle> block_handles;
1398
+ Status s = CreateAndOpenSST(20, &table, &block_handles);
1399
+ ASSERT_OK(s);
1400
+ ASSERT_GE(block_handles.size(), 10);
1401
+
1402
+ auto job = std::make_shared<IOJob>();
1403
+ job->block_handles = block_handles;
1404
+ job->table = table.get();
1405
+ job->job_options.read_options.async_io = true;
1406
+
1407
+ std::shared_ptr<ReadSet> read_set;
1408
+ s = dispatcher->SubmitJob(job, &read_set);
1409
+ ASSERT_OK(s);
1410
+ ASSERT_NE(read_set, nullptr);
1411
+
1412
+ // Read some blocks directly (simulating on-demand access before prefetch)
1413
+ // This removes them from pending and should update coalesced group accounting
1414
+ for (size_t i = 0; i < 5 && i < block_handles.size(); ++i) {
1415
+ CachableEntry<Block> block;
1416
+ Status read_status = read_set->ReadIndex(i, &block);
1417
+ ASSERT_OK(read_status);
1418
+ ASSERT_NE(block.GetValue(), nullptr);
1419
+ }
1420
+
1421
+ // Release the blocks we read - this frees memory
1422
+ for (size_t i = 0; i < 5 && i < block_handles.size(); ++i) {
1423
+ read_set->ReleaseBlock(i);
1424
+ }
1425
+
1426
+ // Now read the remaining blocks - these should work correctly
1427
+ // The key test: memory accounting should be correct even though some blocks
1428
+ // were removed from pending groups before dispatch
1429
+ for (size_t i = 5; i < block_handles.size(); ++i) {
1430
+ CachableEntry<Block> block;
1431
+ Status read_status = read_set->ReadIndex(i, &block);
1432
+ ASSERT_OK(read_status) << "Failed to read block " << i;
1433
+ ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
1434
+ }
1435
+
1436
+ // Verify all remaining blocks were read successfully
1437
+ uint64_t total_reads = read_set->GetNumSyncReads() +
1438
+ read_set->GetNumAsyncReads() +
1439
+ read_set->GetNumCacheHits();
1440
+ // We read 5 blocks initially, then the remaining blocks
1441
+ EXPECT_GE(total_reads, block_handles.size() - 5)
1442
+ << "Expected at least the remaining blocks to be counted";
1443
+ }
1444
+
1445
+ // Test that a mix of oversized and normal blocks works correctly
1446
+ TEST_F(IODispatcherTest, MixedOversizedAndNormalBlocks) {
1447
+ // Skip this test if io_uring is not available
1448
+ if (!kIOUringPresent) {
1449
+ return;
1450
+ }
1451
+
1452
+ std::unique_ptr<BlockBasedTable> table;
1453
+ std::vector<BlockHandle> block_handles;
1454
+ Status s = CreateAndOpenSST(10, &table, &block_handles);
1455
+ ASSERT_OK(s);
1456
+ ASSERT_GE(block_handles.size(), 5);
1457
+
1458
+ // Calculate the size of a typical block
1459
+ size_t typical_block_size =
1460
+ BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
1461
+
1462
+ // Create dispatcher with memory limit that allows exactly 2 typical blocks
1463
+ // This means groups of 3+ blocks become "oversized" as a group
1464
+ IODispatcherOptions opts;
1465
+ opts.max_prefetch_memory_bytes = typical_block_size * 2;
1466
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1467
+
1468
+ auto job = std::make_shared<IOJob>();
1469
+ job->block_handles = block_handles;
1470
+ job->table = table.get();
1471
+ job->job_options.read_options.async_io = true;
1472
+
1473
+ std::shared_ptr<ReadSet> read_set;
1474
+ s = dispatcher->SubmitJob(job, &read_set);
1475
+ ASSERT_OK(s);
1476
+ ASSERT_NE(read_set, nullptr);
1477
+
1478
+ // All blocks should be readable regardless of prefetch status
1479
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1480
+ CachableEntry<Block> block;
1481
+ Status read_status = read_set->ReadIndex(i, &block);
1482
+ ASSERT_OK(read_status) << "Failed to read block " << i;
1483
+ ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
1484
+ }
1485
+
1486
+ // Verify total reads match
1487
+ uint64_t total_reads = read_set->GetNumSyncReads() +
1488
+ read_set->GetNumAsyncReads() +
1489
+ read_set->GetNumCacheHits();
1490
+ EXPECT_EQ(total_reads, block_handles.size());
1491
+ }
1492
+
1493
+ // Test that memory is properly accounted when groups are partially consumed
1494
+ TEST_F(IODispatcherTest, MemoryAccountingWithPartialGroupConsumption) {
1495
+ // Skip this test if io_uring is not available
1496
+ if (!kIOUringPresent) {
1497
+ return;
1498
+ }
1499
+
1500
+ // Create dispatcher with a specific memory limit
1501
+ IODispatcherOptions opts;
1502
+ opts.max_prefetch_memory_bytes = 100 * 1024; // 100KB
1503
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1504
+
1505
+ std::unique_ptr<BlockBasedTable> table;
1506
+ std::vector<BlockHandle> block_handles;
1507
+ Status s = CreateAndOpenSST(30, &table, &block_handles);
1508
+ ASSERT_OK(s);
1509
+ ASSERT_GE(block_handles.size(), 10);
1510
+
1511
+ auto job = std::make_shared<IOJob>();
1512
+ job->block_handles = block_handles;
1513
+ job->table = table.get();
1514
+ job->job_options.read_options.async_io = true;
1515
+
1516
+ std::shared_ptr<ReadSet> read_set;
1517
+ s = dispatcher->SubmitJob(job, &read_set);
1518
+ ASSERT_OK(s);
1519
+ ASSERT_NE(read_set, nullptr);
1520
+
1521
+ // Read blocks one at a time and release them
1522
+ // This tests that RemoveFromPending correctly updates pending state
1523
+ // and that TryDispatchPendingPrefetches filters correctly
1524
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1525
+ CachableEntry<Block> block;
1526
+ Status read_status = read_set->ReadIndex(i, &block);
1527
+ ASSERT_OK(read_status) << "Failed to read block " << i;
1528
+ ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
1529
+
1530
+ // Release the block immediately after reading
1531
+ read_set->ReleaseBlock(i);
1532
+ }
1533
+
1534
+ // Verify total reads match
1535
+ uint64_t total_reads = read_set->GetNumSyncReads() +
1536
+ read_set->GetNumAsyncReads() +
1537
+ read_set->GetNumCacheHits();
1538
+ EXPECT_EQ(total_reads, block_handles.size());
1539
+ }
1540
+
1541
+ // Test that sync prefetching respects memory limits
1542
+ TEST_F(IODispatcherTest, SyncPrefetchWithMemoryLimit) {
1543
+ // Create dispatcher with a small memory limit
1544
+ IODispatcherOptions opts;
1545
+ opts.max_prefetch_memory_bytes = 50 * 1024; // 50KB
1546
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1547
+
1548
+ std::unique_ptr<BlockBasedTable> table;
1549
+ std::vector<BlockHandle> block_handles;
1550
+ Status s = CreateAndOpenSST(20, &table, &block_handles);
1551
+ ASSERT_OK(s);
1552
+ ASSERT_GE(block_handles.size(), 10);
1553
+
1554
+ auto job = std::make_shared<IOJob>();
1555
+ job->block_handles = block_handles;
1556
+ job->table = table.get();
1557
+ job->job_options.read_options.async_io = false; // Sync IO
1558
+
1559
+ std::shared_ptr<ReadSet> read_set;
1560
+ s = dispatcher->SubmitJob(job, &read_set);
1561
+ ASSERT_OK(s);
1562
+ ASSERT_NE(read_set, nullptr);
1563
+
1564
+ // All blocks should be readable even with memory limits
1565
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1566
+ CachableEntry<Block> block;
1567
+ Status read_status = read_set->ReadIndex(i, &block);
1568
+ ASSERT_OK(read_status) << "Failed to read block " << i;
1569
+ ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
1570
+ }
1571
+
1572
+ // Verify all were sync reads
1573
+ EXPECT_GT(read_set->GetNumSyncReads(), 0)
1574
+ << "Expected sync reads with async_io=false";
1575
+ EXPECT_EQ(read_set->GetNumAsyncReads(), 0)
1576
+ << "Expected no async reads with async_io=false";
1577
+ }
1578
+
1579
+ // Test that oversized blocks work correctly with sync IO
1580
+ TEST_F(IODispatcherTest, OversizedBlocksWithSyncIO) {
1581
+ std::unique_ptr<BlockBasedTable> table;
1582
+ std::vector<BlockHandle> block_handles;
1583
+ Status s = CreateAndOpenSST(10, &table, &block_handles);
1584
+ ASSERT_OK(s);
1585
+ ASSERT_GE(block_handles.size(), 3);
1586
+
1587
+ // Calculate the size of a single block
1588
+ size_t single_block_size =
1589
+ BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
1590
+
1591
+ // Create dispatcher with memory limit smaller than a single block
1592
+ // This means ALL blocks are "oversized"
1593
+ IODispatcherOptions opts;
1594
+ opts.max_prefetch_memory_bytes = single_block_size / 2; // Half a block
1595
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1596
+
1597
+ auto job = std::make_shared<IOJob>();
1598
+ job->block_handles = block_handles;
1599
+ job->table = table.get();
1600
+ job->job_options.read_options.async_io = false; // Sync IO
1601
+
1602
+ std::shared_ptr<ReadSet> read_set;
1603
+ s = dispatcher->SubmitJob(job, &read_set);
1604
+ ASSERT_OK(s);
1605
+ ASSERT_NE(read_set, nullptr);
1606
+
1607
+ // All blocks should still be readable via sync fallback
1608
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1609
+ CachableEntry<Block> block;
1610
+ Status read_status = read_set->ReadIndex(i, &block);
1611
+ ASSERT_OK(read_status) << "Failed to read block " << i;
1612
+ ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
1613
+ }
1614
+
1615
+ // All reads should be sync
1616
+ EXPECT_GT(read_set->GetNumSyncReads(), 0)
1617
+ << "Expected sync reads for oversized blocks";
1618
+ }
1619
+
1620
+ // Test that a single block larger than total memory budget still works
1621
+ TEST_F(IODispatcherTest, SingleBlockLargerThanTotalMemory) {
1622
+ std::unique_ptr<BlockBasedTable> table;
1623
+ std::vector<BlockHandle> block_handles;
1624
+ Status s = CreateAndOpenSST(5, &table, &block_handles);
1625
+ ASSERT_OK(s);
1626
+ ASSERT_GE(block_handles.size(), 1);
1627
+
1628
+ // Set memory limit to 1 byte - smaller than any block
1629
+ IODispatcherOptions opts;
1630
+ opts.max_prefetch_memory_bytes = 1;
1631
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1632
+
1633
+ // Test with both sync and async modes
1634
+ for (bool async : {false, true}) {
1635
+ // Skip async if io_uring not available
1636
+ if (async && !kIOUringPresent) {
1637
+ continue;
1638
+ }
1639
+
1640
+ auto job = std::make_shared<IOJob>();
1641
+ job->block_handles = block_handles;
1642
+ job->table = table.get();
1643
+ job->job_options.read_options.async_io = async;
1644
+
1645
+ std::shared_ptr<ReadSet> read_set;
1646
+ s = dispatcher->SubmitJob(job, &read_set);
1647
+ ASSERT_OK(s) << "SubmitJob failed with async=" << async;
1648
+ ASSERT_NE(read_set, nullptr);
1649
+
1650
+ // All blocks should be readable
1651
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1652
+ CachableEntry<Block> block;
1653
+ Status read_status = read_set->ReadIndex(i, &block);
1654
+ ASSERT_OK(read_status)
1655
+ << "Failed to read block " << i << " with async=" << async;
1656
+ ASSERT_NE(block.GetValue(), nullptr)
1657
+ << "Block " << i << " is null with async=" << async;
1658
+ }
1659
+ }
1660
+ }
1661
+
1662
+ // Test that sync prefetching defers later groups and dispatches them
1663
+ // when memory is released
1664
+ TEST_F(IODispatcherTest, SyncPrefetchDefersAndDispatchesLaterGroups) {
1665
+ std::unique_ptr<BlockBasedTable> table;
1666
+ std::vector<BlockHandle> block_handles;
1667
+ // Create 10+ blocks so we have enough to test deferred dispatch
1668
+ Status s = CreateAndOpenSST(20, &table, &block_handles);
1669
+ ASSERT_OK(s);
1670
+ ASSERT_GE(block_handles.size(), 10);
1671
+
1672
+ // Calculate typical block size
1673
+ size_t typical_block_size =
1674
+ BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
1675
+
1676
+ // Set memory limit to fit approximately 3 blocks
1677
+ // This should cause groups to be split and some deferred
1678
+ IODispatcherOptions opts;
1679
+ opts.max_prefetch_memory_bytes = typical_block_size * 3;
1680
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1681
+
1682
+ // Track dispatch calls
1683
+ std::vector<size_t> dispatch_counts;
1684
+ SyncPoint::GetInstance()->SetCallBack(
1685
+ "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
1686
+ auto* indices = static_cast<std::vector<size_t>*>(arg);
1687
+ dispatch_counts.push_back(indices->size());
1688
+ });
1689
+ SyncPoint::GetInstance()->EnableProcessing();
1690
+
1691
+ auto job = std::make_shared<IOJob>();
1692
+ job->block_handles = block_handles;
1693
+ job->table = table.get();
1694
+ job->job_options.read_options.async_io = false; // Sync IO
1695
+
1696
+ std::shared_ptr<ReadSet> read_set;
1697
+ s = dispatcher->SubmitJob(job, &read_set);
1698
+ ASSERT_OK(s);
1699
+ ASSERT_NE(read_set, nullptr);
1700
+
1701
+ // After SubmitJob, some blocks should have been dispatched (first group)
1702
+ // and remaining groups should be queued
1703
+ size_t initial_dispatch_count = dispatch_counts.size();
1704
+ EXPECT_GT(initial_dispatch_count, 0)
1705
+ << "Expected at least one dispatch during SubmitJob";
1706
+
1707
+ // Read and release first few blocks - this should trigger deferred dispatch
1708
+ for (size_t i = 0; i < 3 && i < block_handles.size(); ++i) {
1709
+ CachableEntry<Block> block;
1710
+ Status read_status = read_set->ReadIndex(i, &block);
1711
+ ASSERT_OK(read_status);
1712
+ ASSERT_NE(block.GetValue(), nullptr);
1713
+ // Release to free memory
1714
+ read_set->ReleaseBlock(i);
1715
+ }
1716
+
1717
+ // After releasing blocks, more dispatches should have occurred
1718
+ // as the pending queue gets processed
1719
+ size_t dispatch_count_after_release = dispatch_counts.size();
1720
+ EXPECT_GE(dispatch_count_after_release, initial_dispatch_count)
1721
+ << "Expected more dispatches after releasing blocks";
1722
+
1723
+ SyncPoint::GetInstance()->DisableProcessing();
1724
+ SyncPoint::GetInstance()->ClearAllCallBacks();
1725
+
1726
+ // All remaining blocks should still be readable
1727
+ for (size_t i = 3; i < block_handles.size(); ++i) {
1728
+ CachableEntry<Block> block;
1729
+ Status read_status = read_set->ReadIndex(i, &block);
1730
+ ASSERT_OK(read_status) << "Failed to read block " << i;
1731
+ ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
1732
+ }
1733
+ }
1734
+
1735
+ // Test that coalesced groups are properly split based on memory budget
1736
+ TEST_F(IODispatcherTest, CoalescedGroupsSplitByMemoryBudget) {
1737
+ std::unique_ptr<BlockBasedTable> table;
1738
+ std::vector<BlockHandle> block_handles;
1739
+ Status s = CreateAndOpenSST(15, &table, &block_handles);
1740
+ ASSERT_OK(s);
1741
+ ASSERT_GE(block_handles.size(), 10);
1742
+
1743
+ // Calculate typical block size
1744
+ size_t typical_block_size =
1745
+ BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
1746
+
1747
+ // Set memory limit to fit exactly 5 blocks
1748
+ // With 10+ blocks, we should get at least 2 groups
1749
+ IODispatcherOptions opts;
1750
+ opts.max_prefetch_memory_bytes = typical_block_size * 5;
1751
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1752
+
1753
+ // Track how many blocks are in each dispatch call
1754
+ std::vector<size_t> blocks_per_dispatch;
1755
+ SyncPoint::GetInstance()->SetCallBack(
1756
+ "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
1757
+ auto* indices = static_cast<std::vector<size_t>*>(arg);
1758
+ blocks_per_dispatch.push_back(indices->size());
1759
+ });
1760
+ SyncPoint::GetInstance()->EnableProcessing();
1761
+
1762
+ auto job = std::make_shared<IOJob>();
1763
+ job->block_handles = block_handles;
1764
+ job->table = table.get();
1765
+ job->job_options.read_options.async_io = false;
1766
+
1767
+ std::shared_ptr<ReadSet> read_set;
1768
+ s = dispatcher->SubmitJob(job, &read_set);
1769
+ ASSERT_OK(s);
1770
+
1771
+ // First dispatch should have at most 5 blocks (memory limit)
1772
+ ASSERT_GT(blocks_per_dispatch.size(), 0);
1773
+ EXPECT_LE(blocks_per_dispatch[0], 5)
1774
+ << "First dispatch should be limited by memory budget";
1775
+
1776
+ // Read and release all blocks to trigger remaining dispatches
1777
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1778
+ CachableEntry<Block> block;
1779
+ Status read_status = read_set->ReadIndex(i, &block);
1780
+ ASSERT_OK(read_status);
1781
+ read_set->ReleaseBlock(i);
1782
+ }
1783
+
1784
+ SyncPoint::GetInstance()->DisableProcessing();
1785
+ SyncPoint::GetInstance()->ClearAllCallBacks();
1786
+
1787
+ // Verify each dispatch was limited by memory budget
1788
+ for (size_t i = 0; i < blocks_per_dispatch.size(); ++i) {
1789
+ EXPECT_LE(blocks_per_dispatch[i], 5)
1790
+ << "Dispatch " << i << " exceeded memory budget";
1791
+ }
1792
+ }
1793
+
1794
+ // Regression tests for a bug where ReadIndex moved values out of
1795
+ // pinned_blocks_ via std::move, but neither ReleaseBlock() nor the destructor
1796
+ // released memory accounting because they checked pinned_blocks_.GetValue()
1797
+ // which was null after the move.
1798
+ // Tests run with both sync and async IO modes to cover Case 1 and Case 2
1799
+ // in ReadIndex().
1800
+ TEST_F(IODispatcherTest, MemoryReleasedAfterReadIndexThenReleaseBlock) {
1801
+ for (bool async : {false, true}) {
1802
+ // Skip async if io_uring not available
1803
+ if (async && !kIOUringPresent) {
1804
+ continue;
1805
+ }
1806
+ SCOPED_TRACE("async_io=" + std::to_string(async));
1807
+
1808
+ auto stats = CreateDBStatistics();
1809
+ IODispatcherOptions opts;
1810
+ opts.max_prefetch_memory_bytes = 100 * 1024; // 100KB
1811
+ opts.statistics = stats.get();
1812
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1813
+
1814
+ std::unique_ptr<BlockBasedTable> table;
1815
+ std::vector<BlockHandle> block_handles;
1816
+ Status s = CreateAndOpenSST(20, &table, &block_handles);
1817
+ ASSERT_OK(s);
1818
+ ASSERT_GT(block_handles.size(), 0);
1819
+
1820
+ auto job = std::make_shared<IOJob>();
1821
+ job->block_handles = block_handles;
1822
+ job->table = table.get();
1823
+ job->job_options.read_options.async_io = async;
1824
+
1825
+ std::shared_ptr<ReadSet> read_set;
1826
+ s = dispatcher->SubmitJob(job, &read_set);
1827
+ ASSERT_OK(s);
1828
+ ASSERT_NE(read_set, nullptr);
1829
+
1830
+ // Some memory should have been granted for prefetch
1831
+ ASSERT_GT(stats->getTickerCount(PREFETCH_MEMORY_BYTES_GRANTED), 0);
1832
+
1833
+ // Read all blocks — ReadIndex moves values out of pinned_blocks_.
1834
+ // This also triggers TryDispatchPendingPrefetches as memory is released,
1835
+ // which acquires more memory for pending groups. So granted grows during
1836
+ // this loop.
1837
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1838
+ CachableEntry<Block> block;
1839
+ ASSERT_OK(read_set->ReadIndex(i, &block));
1840
+ ASSERT_NE(block.GetValue(), nullptr);
1841
+ }
1842
+
1843
+ // Release all blocks — should be a no-op for memory accounting since
1844
+ // ReadIndex already released memory when moving values out
1845
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1846
+ read_set->ReleaseBlock(i);
1847
+ }
1848
+
1849
+ // Read both counters after all operations complete, since
1850
+ // TryDispatchPendingPrefetches during ReadIndex may have granted additional
1851
+ // memory for pending groups
1852
+ uint64_t granted = stats->getTickerCount(PREFETCH_MEMORY_BYTES_GRANTED);
1853
+ uint64_t released = stats->getTickerCount(PREFETCH_MEMORY_BYTES_RELEASED);
1854
+ // With the bug, released < granted because ReleaseBlock skips
1855
+ // ReleaseMemory when pinned_blocks_ value was already moved out
1856
+ EXPECT_EQ(released, granted);
1857
+ }
1858
+ }
1859
+
1860
+ // Test that ReadSet destructor releases memory for blocks that were read
1861
+ // via ReadIndex but never explicitly released via ReleaseBlock.
1862
+ TEST_F(IODispatcherTest, DestructorReleasesMemoryAfterReadIndex) {
1863
+ for (bool async : {false, true}) {
1864
+ // Skip async if io_uring not available
1865
+ if (async && !kIOUringPresent) {
1866
+ continue;
1867
+ }
1868
+ SCOPED_TRACE("async_io=" + std::to_string(async));
1869
+
1870
+ auto stats = CreateDBStatistics();
1871
+ IODispatcherOptions opts;
1872
+ opts.max_prefetch_memory_bytes = 100 * 1024; // 100KB
1873
+ opts.statistics = stats.get();
1874
+ std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
1875
+
1876
+ std::unique_ptr<BlockBasedTable> table;
1877
+ std::vector<BlockHandle> block_handles;
1878
+ Status s = CreateAndOpenSST(20, &table, &block_handles);
1879
+ ASSERT_OK(s);
1880
+ ASSERT_GT(block_handles.size(), 0);
1881
+
1882
+ {
1883
+ auto job = std::make_shared<IOJob>();
1884
+ job->block_handles = block_handles;
1885
+ job->table = table.get();
1886
+ job->job_options.read_options.async_io = async;
1887
+
1888
+ std::shared_ptr<ReadSet> read_set;
1889
+ s = dispatcher->SubmitJob(job, &read_set);
1890
+ ASSERT_OK(s);
1891
+ ASSERT_NE(read_set, nullptr);
1892
+
1893
+ uint64_t granted = stats->getTickerCount(PREFETCH_MEMORY_BYTES_GRANTED);
1894
+ ASSERT_GT(granted, 0);
1895
+
1896
+ // Read all blocks via ReadIndex (moves values out of pinned_blocks_)
1897
+ // but do NOT call ReleaseBlock — let the destructor handle cleanup
1898
+ for (size_t i = 0; i < block_handles.size(); ++i) {
1899
+ CachableEntry<Block> block;
1900
+ ASSERT_OK(read_set->ReadIndex(i, &block));
1901
+ }
1902
+ // read_set goes out of scope — destructor should release all memory
1903
+ }
1904
+
1905
+ uint64_t granted = stats->getTickerCount(PREFETCH_MEMORY_BYTES_GRANTED);
1906
+ uint64_t released = stats->getTickerCount(PREFETCH_MEMORY_BYTES_RELEASED);
1907
+ // Destructor should release memory for all prefetched blocks,
1908
+ // even those whose values were moved out by ReadIndex
1909
+ EXPECT_EQ(released, granted);
1910
+ }
1911
+ }
1912
+
1913
+ } // namespace ROCKSDB_NAMESPACE
1914
+
1915
+ int main(int argc, char** argv) {
1916
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
1917
+ ::testing::InitGoogleTest(&argc, argv);
1918
+ return RUN_ALL_TESTS();
1919
+ }