@nxtedition/rocksdb 15.4.1 → 15.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (399)
  1. package/binding.cc +24 -15
  2. package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
  3. package/deps/rocksdb/rocksdb/BUCK +42 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
  5. package/deps/rocksdb/rocksdb/Makefile +59 -32
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
  8. package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
  9. package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
  10. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
  11. package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
  12. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
  14. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
  17. package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
  24. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
  25. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
  26. package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
  27. package/deps/rocksdb/rocksdb/db/builder.h +7 -0
  28. package/deps/rocksdb/rocksdb/db/c.cc +373 -57
  29. package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
  30. package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
  31. package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
  32. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
  51. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  52. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
  53. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
  54. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
  55. package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
  56. package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
  57. package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
  58. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
  59. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
  60. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
  61. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
  62. package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
  63. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
  64. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
  65. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
  66. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
  79. package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
  80. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
  81. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
  82. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
  83. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
  84. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
  85. package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
  86. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
  87. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
  88. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
  89. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
  90. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
  91. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
  92. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
  93. package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
  94. package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
  95. package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
  96. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
  97. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
  98. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
  99. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
  100. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
  101. package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
  102. package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
  103. package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
  104. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
  105. package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
  106. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
  107. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
  108. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
  109. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
  110. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
  111. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
  112. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
  113. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
  114. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
  115. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  116. package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
  117. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
  118. package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
  119. package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
  120. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
  121. package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
  122. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
  123. package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
  124. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
  125. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
  126. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
  127. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
  128. package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
  129. package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
  130. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
  131. package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
  132. package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
  133. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
  134. package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
  135. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  136. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
  137. package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
  138. package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
  139. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
  140. package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
  141. package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
  142. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
  143. package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
  144. package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
  145. package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
  146. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
  147. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
  148. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
  149. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
  150. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
  151. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
  152. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
  153. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
  159. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
  160. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
  161. package/deps/rocksdb/rocksdb/env/env.cc +1 -0
  162. package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
  163. package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
  164. package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
  165. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  166. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  167. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
  168. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
  169. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
  170. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
  171. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  172. package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
  173. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
  174. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
  175. package/deps/rocksdb/rocksdb/folly.mk +22 -5
  176. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
  177. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
  178. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
  179. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
  180. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
  181. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
  182. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
  183. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
  184. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
  185. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
  186. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
  187. package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
  188. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
  189. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
  190. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
  191. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
  192. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
  193. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
  194. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
  195. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
  196. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
  197. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
  198. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
  199. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
  200. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
  202. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
  203. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
  204. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
  205. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
  206. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
  207. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
  208. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
  209. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
  210. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  211. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
  212. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
  213. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
  214. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
  215. package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
  216. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
  217. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
  218. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
  219. package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
  220. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
  221. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
  222. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
  223. package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
  224. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  225. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
  226. package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
  227. package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
  228. package/deps/rocksdb/rocksdb/options/options.cc +5 -1
  229. package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
  230. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
  231. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
  232. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
  233. package/deps/rocksdb/rocksdb/port/lang.h +4 -0
  234. package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
  235. package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
  236. package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
  237. package/deps/rocksdb/rocksdb/src.mk +12 -0
  238. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
  239. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  240. package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
  241. package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
  242. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
  243. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
  244. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
  245. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
  246. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
  247. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
  248. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
  249. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
  250. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
  251. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
  252. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
  253. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
  254. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
  255. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
  256. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
  257. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
  258. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
  259. package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
  260. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
  261. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
  262. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
  263. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
  264. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
  265. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
  266. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
  267. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
  268. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
  269. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  270. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
  271. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
  272. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
  273. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
  274. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
  275. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
  276. package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
  277. package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
  278. package/deps/rocksdb/rocksdb/table/format.cc +27 -15
  279. package/deps/rocksdb/rocksdb/table/format.h +41 -15
  280. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
  281. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
  282. package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
  283. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
  284. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  285. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
  286. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
  287. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
  288. package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
  289. package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
  290. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
  291. package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
  292. package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
  293. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
  294. package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
  295. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
  296. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
  297. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
  298. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
  299. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
  300. package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
  301. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
  302. package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
  303. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
  304. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
  305. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
  306. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
  307. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
  308. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
  309. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
  310. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
  311. package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
  312. package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
  313. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
  314. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
  315. package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
  316. package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
  317. package/deps/rocksdb/rocksdb/util/coding.h +14 -27
  318. package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
  319. package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
  320. package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
  321. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
  322. package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
  323. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
  324. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
  325. package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
  326. package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
  327. package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
  328. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
  329. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
  330. package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
  331. package/deps/rocksdb/rocksdb/util/math.h +3 -1
  332. package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
  333. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
  334. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
  335. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
  336. package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
  337. package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
  338. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
  339. package/deps/rocksdb/rocksdb/util/status.cc +3 -1
  340. package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
  341. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
  342. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
  343. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
  344. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
  345. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
  346. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
  347. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
  348. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
  349. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
  350. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
  351. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
  352. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
  353. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
  354. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
  355. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
  356. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
  357. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
  358. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  359. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
  360. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
  361. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
  362. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
  363. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
  364. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
  365. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
  366. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
  367. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
  368. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
  369. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
  370. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
  371. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
  372. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
  373. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
  374. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
  375. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
  376. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
  377. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
  378. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
  379. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
  380. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
  381. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
  382. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
  383. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
  384. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
  385. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
  386. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
  387. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
  388. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
  389. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  390. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
  391. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
  392. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
  393. package/deps/rocksdb/rocksdb.gyp +7 -0
  394. package/iterator.js +2 -2
  395. package/package.json +1 -1
  396. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  397. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  398. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
  399. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
@@ -40,8 +40,11 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
40
40
  if (!multi_scan_status_.ok()) {
41
41
  return;
42
42
  }
43
- if (multi_scan_) {
44
- SeekMultiScan(target);
43
+
44
+ // MultiScan requires an explicit seek key — SeekToFirst() is not supported
45
+ if (multi_scan_read_set_ && !target) {
46
+ multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan");
47
+ RecordTick(table_->GetStatistics(), MULTISCAN_SEEK_ERRORS);
45
48
  return;
46
49
  }
47
50
 
@@ -67,7 +70,7 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
67
70
  read_options_.auto_readahead_size &&
68
71
  (read_options_.iterate_upper_bound || read_options_.prefix_same_as_start);
69
72
 
70
- if (autotune_readaheadsize &&
73
+ if (autotune_readaheadsize && !multi_scan_read_set_ &&
71
74
  table_->get_rep()->table_options.block_cache.get() &&
72
75
  direction_ == IterDirection::kForward) {
73
76
  readahead_cache_lookup_ = true;
@@ -97,8 +100,10 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
97
100
  // In case of readahead_cache_lookup_, index_iter_ could change to find the
98
101
  // readahead size in BlockCacheLookupForReadAheadSize so it needs to
99
102
  // reseek.
100
- if (IsIndexAtCurr() && block_iter_points_to_real_block_ &&
101
- block_iter_.Valid()) {
103
+ // MultiScan must always go through index_iter_->Seek() so that
104
+ // MultiScanIndexIterator can update its scan range tracking state.
105
+ if (!multi_scan_read_set_ && IsIndexAtCurr() &&
106
+ block_iter_points_to_real_block_ && block_iter_.Valid()) {
102
107
  // Reseek.
103
108
  prev_block_offset_ = index_iter_->value().handle.offset();
104
109
 
@@ -152,7 +157,7 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
152
157
  } else {
153
158
  // Need to use the data block.
154
159
  if (!same_block) {
155
- if (read_options_.async_io && async_prefetch) {
160
+ if (read_options_.async_io && async_prefetch && !multi_scan_read_set_) {
156
161
  AsyncInitDataBlock(/*is_first_pass=*/true);
157
162
  if (async_read_in_progress_) {
158
163
  // Status::TryAgain indicates asynchronous request for retrieval of
@@ -163,6 +168,10 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
163
168
  }
164
169
  } else {
165
170
  InitDataBlock();
171
+ if (multi_scan_read_set_ && !block_iter_points_to_real_block_) {
172
+ // MultiScan InitDataBlock failed (e.g., prefetch limit or IO error)
173
+ return;
174
+ }
166
175
  }
167
176
  } else {
168
177
  // When the user does a reseek, the iterate_upper_bound might have
@@ -184,12 +193,19 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
184
193
  CheckOutOfBound();
185
194
 
186
195
  if (target) {
187
- assert(!Valid() || icomp_.Compare(*target, key()) <= 0);
196
+ // MultiScan uses user-key separators in its index, so after a reseek
197
+ // with the same user key but a different sequence number (e.g., from
198
+ // max_sequential_skip_in_iterations), the data block entry may appear
199
+ // "before" the target in internal key order. The user-key invariant
200
+ // still holds and the iteration is correct because DBIter will skip
201
+ // remaining same-user-key entries.
202
+ assert(multi_scan_read_set_ || !Valid() ||
203
+ icomp_.Compare(*target, key()) <= 0);
188
204
  }
189
205
  }
190
206
 
191
207
  void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
192
- multi_scan_.reset();
208
+ ResetMultiScan();
193
209
  direction_ = IterDirection::kBackward;
194
210
  ResetBlockCacheLookupVar();
195
211
  is_out_of_bound_ = false;
@@ -264,7 +280,7 @@ void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
264
280
  }
265
281
 
266
282
  void BlockBasedTableIterator::SeekToLast() {
267
- multi_scan_.reset();
283
+ ResetMultiScan();
268
284
  direction_ = IterDirection::kBackward;
269
285
  ResetBlockCacheLookupVar();
270
286
  is_out_of_bound_ = false;
@@ -290,7 +306,7 @@ void BlockBasedTableIterator::SeekToLast() {
290
306
  void BlockBasedTableIterator::Next() {
291
307
  assert(Valid());
292
308
  if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
293
- assert(!multi_scan_);
309
+ assert(!multi_scan_read_set_);
294
310
  return;
295
311
  }
296
312
  assert(block_iter_points_to_real_block_);
@@ -311,9 +327,8 @@ bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) {
311
327
  }
312
328
 
313
329
  void BlockBasedTableIterator::Prev() {
314
- assert(!multi_scan_);
315
- if ((readahead_cache_lookup_ && !IsIndexAtCurr()) || multi_scan_) {
316
- multi_scan_.reset();
330
+ if ((readahead_cache_lookup_ && !IsIndexAtCurr()) || multi_scan_read_set_) {
331
+ ResetMultiScan();
317
332
  // In case of readahead_cache_lookup_, index_iter_ has moved forward. So we
318
333
  // need to reseek the index_iter_ to point to current block by using
319
334
  // block_iter_'s key.
@@ -358,6 +373,41 @@ void BlockBasedTableIterator::Prev() {
358
373
  }
359
374
 
360
375
  void BlockBasedTableIterator::InitDataBlock() {
376
+ // MultiScan path: load block from ReadSet
377
+ if (multi_scan_read_set_) {
378
+ BlockHandle data_block_handle = index_iter_->value().handle;
379
+ if (!block_iter_points_to_real_block_ ||
380
+ data_block_handle.offset() != prev_block_offset_) {
381
+ if (block_iter_points_to_real_block_) {
382
+ ResetDataIter();
383
+ }
384
+ size_t rs_idx = multi_scan_index_iter_->current_read_set_index();
385
+ if (rs_idx >= prefetch_max_idx_) {
386
+ if (multi_scan_index_iter_->GetMaxPrefetchSize() == 0) {
387
+ // max_prefetch_size is not set, treat as end of file
388
+ return;
389
+ } else {
390
+ // max_prefetch_size is set, treat as error
391
+ multi_scan_status_ = Status::PrefetchLimitReached();
392
+ return;
393
+ }
394
+ }
395
+ CachableEntry<Block> block_entry;
396
+ multi_scan_status_ =
397
+ multi_scan_read_set_->ReadIndex(rs_idx, &block_entry);
398
+ if (!multi_scan_status_.ok()) {
399
+ return;
400
+ }
401
+ table_->NewDataBlockIterator<DataBlockIter>(read_options_, block_entry,
402
+ &block_iter_, Status::OK());
403
+ block_iter_points_to_real_block_ = true;
404
+ prev_block_offset_ = data_block_handle.offset();
405
+ CheckDataBlockWithinUpperBound();
406
+ }
407
+ return;
408
+ }
409
+
410
+ // Regular path
361
411
  BlockHandle data_block_handle;
362
412
  bool is_in_cache = false;
363
413
  bool use_block_cache_for_lookup = true;
@@ -580,10 +630,6 @@ void BlockBasedTableIterator::FindKeyForward() {
580
630
  }
581
631
 
582
632
  void BlockBasedTableIterator::FindBlockForward() {
583
- if (multi_scan_) {
584
- FindBlockForwardInMultiScan();
585
- return;
586
- }
587
633
  // TODO the while loop inherits from two-level-iterator. We don't know
588
634
  // whether a block can be empty so it can be replaced by an "if".
589
635
  do {
@@ -594,8 +640,14 @@ void BlockBasedTableIterator::FindBlockForward() {
594
640
  // index_iter_ can point to different block in case of
595
641
  // readahead_cache_lookup_. readahead_cache_lookup_ will handle the
596
642
  // upper_bound check.
643
+ // MultiScan handles scan range boundaries via IsScanRangeExhausted()
644
+ // after index_iter_->Next(), so we must not use the
645
+ // next_block_is_out_of_bound mechanism which can prematurely terminate
646
+ // a scan range when the block separator >= iterate_upper_bound but
647
+ // valid keys still remain in the current range's blocks.
597
648
  bool next_block_is_out_of_bound =
598
- IsIndexAtCurr() && read_options_.iterate_upper_bound != nullptr &&
649
+ !multi_scan_read_set_ && IsIndexAtCurr() &&
650
+ read_options_.iterate_upper_bound != nullptr &&
599
651
  block_iter_points_to_real_block_ &&
600
652
  block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock;
601
653
 
@@ -627,6 +679,18 @@ void BlockBasedTableIterator::FindBlockForward() {
627
679
  next_block_is_out_of_bound = is_index_out_of_bound_;
628
680
  is_index_out_of_bound_ = false;
629
681
  }
682
+ // MultiScan: detect scan range boundary after Next()
683
+ if (multi_scan_index_iter_ &&
684
+ multi_scan_index_iter_->IsScanRangeExhausted()) {
685
+ if (multi_scan_index_iter_->HasMoreScanRanges()) {
686
+ // More ranges remain — signal out-of-bound so DBIter/LevelIter
687
+ // will trigger the next Seek for the next scan range.
688
+ is_out_of_bound_ = true;
689
+ }
690
+ // For last range: index_iter_->Valid() is false, so we fall
691
+ // through to the !Valid() return below. LevelIterator advances.
692
+ return;
693
+ }
630
694
  } else {
631
695
  // Skip Next as index_iter_ already points to correct index when it
632
696
  // iterates in BlockCacheLookupForReadAheadSize.
@@ -658,6 +722,10 @@ void BlockBasedTableIterator::FindBlockForward() {
658
722
  }
659
723
  }
660
724
  InitDataBlock();
725
+ if (multi_scan_read_set_ && !block_iter_points_to_real_block_) {
726
+ // MultiScan InitDataBlock failed (prefetch limit or IO error)
727
+ return;
728
+ }
661
729
  block_iter_.SeekToFirst();
662
730
  } while (!block_iter_.Valid());
663
731
  }
@@ -767,7 +835,7 @@ void BlockBasedTableIterator::InitializeStartAndEndOffsets(
767
835
  // It can be when Reseek is from block cache (which doesn't clear the
768
836
  // buffers in FilePrefetchBuffer but clears block handles from queue) and
769
837
  // reseek also lies within the buffer. So Next will get data from
770
- // exisiting buffers untill this callback is made to prefetch additional
838
+ // existing buffers until this callback is made to prefetch additional
771
839
  // data. All handles need to be added to the queue starting from
772
840
  // index_iter_.
773
841
  assert(index_iter_->Valid());
@@ -919,42 +987,6 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
919
987
  ResetPreviousBlockOffset();
920
988
  }
921
989
 
922
- BlockBasedTableIterator::MultiScanState::~MultiScanState() {
923
- // Abort any pending async IO operations to prevent callback being called
924
- // after async read states are destructed.
925
- if (!async_states.empty()) {
926
- std::vector<void*> io_handles_to_abort;
927
- std::vector<AsyncReadState*> states_to_cleanup;
928
-
929
- // Collect all pending IO handles
930
- for (size_t i = 0; i < async_states.size(); ++i) {
931
- auto& async_read = async_states[i];
932
-
933
- if (async_read.io_handle != nullptr) {
934
- assert(!async_read.finished);
935
- io_handles_to_abort.push_back(async_read.io_handle);
936
- states_to_cleanup.push_back(&async_read);
937
- }
938
- }
939
-
940
- if (!io_handles_to_abort.empty()) {
941
- IOStatus abort_status = fs->AbortIO(io_handles_to_abort);
942
- if (!abort_status.ok()) {
943
- #ifndef NDEBUG
944
- fprintf(stderr, "Error aborting async IO operations: %s\n",
945
- abort_status.ToString().c_str());
946
- #endif
947
- assert(false);
948
- }
949
- (void)abort_status; // Suppress unused variable warning
950
- }
951
-
952
- for (auto async_read : states_to_cleanup) {
953
- async_read->CleanUpIOHandle();
954
- }
955
- }
956
- }
957
-
958
990
  // Note:
959
991
  // - Iterator should not be reused for multiple multiscans or mixing
960
992
  // multiscan with regular iterator usage.
@@ -977,14 +1009,20 @@ BlockBasedTableIterator::MultiScanState::~MultiScanState() {
977
1009
  // end key. These Seeks will be handled properly, as long as the target is
978
1010
  // moving forward.
979
1011
  void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
980
- assert(!multi_scan_);
1012
+ assert(!multi_scan_read_set_);
1013
+ RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_CALLS);
1014
+ StopWatch sw(table_->get_rep()->ioptions.clock, table_->GetStatistics(),
1015
+ MULTISCAN_PREPARE_MICROS);
1016
+
981
1017
  if (!index_iter_->status().ok()) {
982
1018
  multi_scan_status_ = index_iter_->status();
1019
+ RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
983
1020
  return;
984
1021
  }
985
- if (multi_scan_) {
986
- multi_scan_.reset();
1022
+ if (multi_scan_read_set_) {
1023
+ multi_scan_read_set_.reset();
987
1024
  multi_scan_status_ = Status::InvalidArgument("Prepare already called");
1025
+ RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
988
1026
  return;
989
1027
  }
990
1028
 
@@ -998,457 +1036,73 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
998
1036
  CollectBlockHandles(scan_opts, &scan_block_handles,
999
1037
  &block_index_ranges_per_scan, &data_block_separators);
1000
1038
  if (!multi_scan_status_.ok()) {
1039
+ RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
1001
1040
  return;
1002
1041
  }
1003
1042
 
1004
- // Pin already cached blocks, collect remaining blocks to read
1005
- std::vector<size_t> block_indices_to_read;
1006
- std::vector<CachableEntry<Block>> pinned_data_blocks_guard(
1007
- scan_block_handles.size());
1008
- size_t prefetched_max_idx;
1009
- multi_scan_status_ = FilterAndPinCachedBlocks(
1010
- scan_block_handles, multiscan_opts, &block_indices_to_read,
1011
- &pinned_data_blocks_guard, &prefetched_max_idx);
1012
- if (!multi_scan_status_.ok()) {
1013
- return;
1014
- }
1015
-
1016
- std::vector<AsyncReadState> async_states;
1017
- // Maps from block index into async read request (index into async_states[])
1018
- UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
1019
- if (!block_indices_to_read.empty()) {
1020
- std::vector<FSReadRequest> read_reqs;
1021
- std::vector<std::vector<size_t>> coalesced_block_indices;
1022
- PrepareIORequests(block_indices_to_read, scan_block_handles, multiscan_opts,
1023
- &read_reqs, &block_idx_to_readreq_idx,
1024
- &coalesced_block_indices);
1025
-
1026
- multi_scan_status_ =
1027
- ExecuteIO(scan_block_handles, multiscan_opts, coalesced_block_indices,
1028
- &read_reqs, &async_states, &pinned_data_blocks_guard);
1029
- if (!multi_scan_status_.ok()) {
1030
- return;
1031
- }
1032
- }
1033
-
1034
- // Successful Prepare, init related states so the iterator reads from prepared
1035
- // blocks.
1036
- multi_scan_ = std::make_unique<MultiScanState>(
1037
- table_->get_rep()->ioptions.env->GetFileSystem(), multiscan_opts,
1038
- std::move(pinned_data_blocks_guard), std::move(data_block_separators),
1039
- std::move(block_index_ranges_per_scan),
1040
- std::move(block_idx_to_readreq_idx), std::move(async_states),
1041
- prefetched_max_idx);
1042
-
1043
- is_index_at_curr_block_ = false;
1044
- block_iter_points_to_real_block_ = false;
1045
- }
1046
-
1047
- void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
1048
- assert(multi_scan_ && multi_scan_status_.ok());
1049
- // This is a MultiScan and Prepare() has been called.
1050
-
1051
- // Reset out of bound on seek, if it is out of bound again, it will be set
1052
- // properly later in the code path
1053
- is_out_of_bound_ = false;
1054
-
1055
- // Validate seek key with scan options
1056
- if (!seek_target) {
1057
- // start key must be set for multi-scan
1058
- multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan");
1059
- return;
1060
- }
1061
-
1062
- // Check the case where there is no range prepared on this table
1063
- if (multi_scan_->scan_opts->size() == 0) {
1064
- // out of bound
1065
- MarkPreparedRangeExhausted();
1066
- return;
1067
- }
1068
-
1069
- // Check whether seek key is moving forward.
1070
- if (multi_scan_->prev_seek_key_.empty() ||
1071
- icomp_.Compare(*seek_target, multi_scan_->prev_seek_key_) > 0) {
1072
- // If seek key is empty or is larger than previous seek key, update the
1073
- // previous seek key. Otherwise use the previous seek key as the adjusted
1074
- // seek target moving forward. This prevents seek target going backward,
1075
- // which would visit pages that have been unpinned.
1076
- // This issue is caused by sub-optimal range delete handling inside merge
1077
- // iterator.
1078
- // TODO xingbo issues:14068 : Optimize the handling of range delete iterator
1079
- // inside merge iterator, so that it doesn't move seek key backward. After
1080
- // that we could return error if the key moves backward here.
1081
- multi_scan_->prev_seek_key_ = seek_target->ToString();
1082
- } else {
1083
- // Seek key is adjusted to previous one, we can return here directly.
1084
- return;
1085
- }
1086
-
1087
- // There are 3 different Cases we need to handle:
1088
- // The following diagram explain different seek targets seeking at various
1089
- // position on the table, while the next_scan_idx points to the PreparedRange
1090
- // 2.
1091
- //
1092
- // next_scan_idx: -------------------┐
1093
- // ▼
1094
- // table: : __[PreparedRange 1]__[PreparedRange 2]__[PreparedRange 3]__
1095
- // Seek target: <----- Case 1 ------>▲<------------- Case 2 -------------->
1096
- // │
1097
- // Case 3
1098
- //
1099
- // Case 1: seek before the start of next prepared ranges. This could happen
1100
- // due to too many delete tombstone triggered reseek or delete range.
1101
- // Case 2: seek after the start of next prepared range.
1102
- // This could happen due to seek key adjustment from delete range file.
1103
- // E.g. LSM has 3 levels, each level has only 1 file:
1104
- // L1 : key : 0---10
1105
- // L2 : Delete range key : 0-5
1106
- // L3 : key : 0---10
1107
- // When a range 2-8 was prepared, the prepared key would be 2 on L3 file,
1108
- // but the seek key would be 5, as the seek key was updated by the largest
1109
- // key of delete range. This causes all of the cases above to be possible,
1110
- // when the ranges are adjusted in the above examples.
1111
- // Case 3: seek at the beginning of a prepared range (expected case)
1112
-
1113
- // Allow reseek on the start of the last prepared range due to too many
1114
- // tombstone
1115
- multi_scan_->next_scan_idx =
1116
- std::min(multi_scan_->next_scan_idx,
1117
- multi_scan_->block_index_ranges_per_scan.size() - 1);
1118
-
1119
- auto user_seek_target = ExtractUserKey(*seek_target);
1120
-
1121
- auto compare_next_scan_start_result =
1122
- user_comparator_.CompareWithoutTimestamp(
1123
- user_seek_target, /*a_has_ts=*/true,
1124
- multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
1125
- .range.start.value(),
1126
- /*b_has_ts=*/false);
1127
-
1128
- if (compare_next_scan_start_result != 0) {
1129
- // The seek target is not exactly same as what was prepared.
1130
- if (compare_next_scan_start_result < 0) {
1131
- // Case 1:
1132
- if (multi_scan_->next_scan_idx == 0) {
1133
- // This should not happen, even when seek target is adjusted by delete
1134
- // range. The reason is that if the seek target is before the start key
1135
- // of the first prepared range, its end key needs to be >= the smallest
1136
- // key of this file, otherwise it is skipped in level iterator. If its
1137
- // end key is >= the smallest key of this file, then this range will be
1138
- // prepared for this file. As delete range could only adjust seek
1139
- // target forward, so it would never be before the start key of the
1140
- // first prepared range.
1141
- assert(false && "Seek target before the first prepared range");
1142
- MarkPreparedRangeExhausted();
1143
- return;
1043
+ // Calculate prefetch_max_idx (enforces max_prefetch_size)
1044
+ size_t prefetch_max_idx = scan_block_handles.size();
1045
+ if (multiscan_opts->max_prefetch_size > 0) {
1046
+ uint64_t total_size = 0;
1047
+ for (size_t i = 0; i < scan_block_handles.size(); ++i) {
1048
+ total_size +=
1049
+ BlockBasedTable::BlockSizeWithTrailer(scan_block_handles[i]);
1050
+ if (total_size > multiscan_opts->max_prefetch_size) {
1051
+ prefetch_max_idx = i;
1052
+ break;
1144
1053
  }
1145
- auto seek_target_before_previous_prepared_range =
1146
- user_comparator_.CompareWithoutTimestamp(
1147
- user_seek_target, /*a_has_ts=*/true,
1148
- multi_scan_->scan_opts
1149
- ->GetScanRanges()[multi_scan_->next_scan_idx - 1]
1150
- .range.start.value(),
1151
- /*b_has_ts=*/false) < 0;
1152
- // Not expected to happen
1153
- // This should never happen, the reason is that the
1154
- // multi_scan_->next_scan_idx is set to a non zero value is due to a seek
1155
- // target larger or equal to the start key of multi_scan_->next_scan_idx-1
1156
- // happened earlier. If a seek happens before the start key of
1157
- // multi_scan_->next_scan_idx-1, it would seek a key that is less than
1158
- // what was seeked before.
1159
- assert(!seek_target_before_previous_prepared_range);
1160
- if (seek_target_before_previous_prepared_range) {
1161
- multi_scan_status_ = Status::InvalidArgument(
1162
- "Seek target is before the previous prepared range at index " +
1163
- std::to_string(multi_scan_->next_scan_idx));
1164
- return;
1165
- }
1166
- // It should only be possible to seek a key between the start of current
1167
- // prepared scan and start of next prepared range.
1168
- MultiScanUnexpectedSeekTarget(seek_target, &user_seek_target);
1169
- } else {
1170
- // Case 2:
1171
- MultiScanUnexpectedSeekTarget(seek_target, &user_seek_target);
1172
1054
  }
1173
- } else {
1174
- // Case 2:
1175
- assert(multi_scan_->next_scan_idx <
1176
- multi_scan_->block_index_ranges_per_scan.size());
1177
-
1178
- auto [cur_scan_start_idx, cur_scan_end_idx] =
1179
- multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx];
1180
- // We should have the data block already loaded
1181
- ++multi_scan_->next_scan_idx;
1182
- if (cur_scan_start_idx >= cur_scan_end_idx) {
1183
- // No blocks are prepared for this range at current file.
1184
- MarkPreparedRangeExhausted();
1185
- return;
1186
- }
1187
-
1188
- // max_sequential_skip_in_iterations can trigger a reseek on the start
1189
- // key of a scan range, even though the multiscan is already past
1190
- // `cur_scan_start_idx` (e.g., a user key spans multiple data blocks).
1191
- size_t block_idx =
1192
- std::max(cur_scan_start_idx, multi_scan_->cur_data_block_idx);
1193
- MultiScanSeekTargetFromBlock(seek_target, block_idx);
1194
1055
  }
1195
- }
1196
1056
 
1197
- void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
1198
- const Slice* seek_target, const Slice* user_seek_target) {
1199
- // linear search the block that contains the seek target, and unpin blocks
1200
- // that are before it.
1201
-
1202
- // The logic here could be confusing when there is a delete range involved.
1203
- // E.g. we have an LSM with 3 levels, each level has only 1 file:
1204
- // L1: data file : 0---10
1205
- // L2: Delete range : 0-5
1206
- // L3: data file : 0---10
1207
- //
1208
- // MultiScan on ranges 1-2, 3-4, and 5-6.
1209
- // When user first do Seek(1), on level 2, due to delete range 0-5, the seek
1210
- // key is adjusted to 5 at level 3. Therefore, we will internally do Seek(5)
1211
- // and unpins all blocks until 5 at level 3. Then the next scan's blocks from
1212
- // 3-4 are unpinned at level 3. It is confusing that maybe block 3-4 should
1213
- // not be unpinned, as next scan would need it. But Seek(5) implies that these
1214
- // keys are all covered by some range deletion, so the next Seek(3) will also
1215
- // do Seek(5) internally, so the blocks from 3-4 could be safely unpinned.
1216
-
1217
- // advance to the right prepared range
1218
- while (
1219
- multi_scan_->next_scan_idx <
1220
- multi_scan_->block_index_ranges_per_scan.size() &&
1221
- (user_comparator_.CompareWithoutTimestamp(
1222
- *user_seek_target, /*a_has_ts=*/true,
1223
- multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
1224
- .range.start.value(),
1225
- /*b_has_ts=*/false) >= 0)) {
1226
- multi_scan_->next_scan_idx++;
1057
+ // Create block handles vector for IODispatcher (limited to prefetch_max_idx)
1058
+ std::vector<BlockHandle> blocks_to_prefetch;
1059
+ if (prefetch_max_idx > 0) {
1060
+ blocks_to_prefetch.assign(scan_block_handles.begin(),
1061
+ scan_block_handles.begin() + prefetch_max_idx);
1227
1062
  }
1228
1063
 
1229
- // next_scan_idx is guaranteed to be higher than 0. If the seek key is before
1230
- // the start key of first prepared range, it is already handled by caller
1231
- // SeekMultiScan. It is equal, it would not call this funciton. If it is
1232
- // after, next_scan_idx would be advanced by the loop above.
1233
- assert(multi_scan_->next_scan_idx > 0);
1234
- // Get the current range
1235
- auto cur_scan_idx = multi_scan_->next_scan_idx - 1;
1236
- auto [cur_scan_start_idx, cur_scan_end_idx] =
1237
- multi_scan_->block_index_ranges_per_scan[cur_scan_idx];
1238
-
1239
- if (cur_scan_start_idx >= cur_scan_end_idx) {
1240
- // No blocks are prepared for this range at current file.
1241
- MarkPreparedRangeExhausted();
1242
- return;
1243
- }
1064
+ // Submit to IODispatcher
1065
+ auto job = std::make_shared<IOJob>();
1066
+ job->table = const_cast<BlockBasedTable*>(table_);
1067
+ job->block_handles = std::move(blocks_to_prefetch);
1068
+ job->job_options.io_coalesce_threshold =
1069
+ multiscan_opts->io_coalesce_threshold;
1070
+ job->job_options.read_options = read_options_;
1071
+ job->job_options.read_options.async_io = multiscan_opts->use_async_io;
1244
1072
 
1245
- // Unpin all the blocks from multi_scan_->cur_data_block_idx to
1246
- // cur_scan_start_idx
1247
- for (auto unpin_block_idx = multi_scan_->cur_data_block_idx;
1248
- unpin_block_idx < cur_scan_start_idx; unpin_block_idx++) {
1249
- if (!multi_scan_->pinned_data_blocks[unpin_block_idx].IsEmpty()) {
1250
- multi_scan_->pinned_data_blocks[unpin_block_idx].Reset();
1251
- }
1073
+ std::shared_ptr<ReadSet> read_set;
1074
+ // IODispatcher should be provided by DBIter::Prepare() to enable sharing
1075
+ // across all BlockBasedTableIterators in the scan. Create one if not
1076
+ // provided (for direct calls to Prepare, e.g., in unit tests).
1077
+ std::shared_ptr<IODispatcher> dispatcher = multiscan_opts->io_dispatcher;
1078
+ if (!dispatcher) {
1079
+ dispatcher.reset(NewIODispatcher());
1252
1080
  }
1253
-
1254
- // Take the max here to ensure we don't move backwards.
1255
- size_t block_idx =
1256
- std::max(cur_scan_start_idx, multi_scan_->cur_data_block_idx);
1257
- auto const& data_block_separators = multi_scan_->data_block_separators;
1258
- while (block_idx < data_block_separators.size() &&
1259
- (user_comparator_.CompareWithoutTimestamp(
1260
- *user_seek_target, /*a_has_ts=*/true,
1261
- data_block_separators[block_idx],
1262
- /*b_has_ts=*/false) > 0)) {
1263
- // Unpin the blocks that are passed
1264
- if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
1265
- multi_scan_->pinned_data_blocks[block_idx].Reset();
1266
- }
1267
- block_idx++;
1268
- }
1269
-
1270
- if (block_idx >= data_block_separators.size()) {
1271
- // All of the prepared blocks for this file are exhausted.
1272
- MarkPreparedRangeExhausted();
1081
+ multi_scan_status_ = dispatcher->SubmitJob(job, &read_set);
1082
+ if (!multi_scan_status_.ok()) {
1083
+ RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
1273
1084
  return;
1274
1085
  }
1275
1086
 
1276
- // The current block may contain the data for the target key
1277
- MultiScanSeekTargetFromBlock(seek_target, block_idx);
1278
- }
1279
-
1280
- void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
1281
- const Slice* seek_target, size_t block_idx) {
1282
- assert(multi_scan_->cur_data_block_idx <= block_idx);
1283
-
1284
- if (!block_iter_points_to_real_block_ ||
1285
- multi_scan_->cur_data_block_idx != block_idx) {
1286
- if (block_iter_points_to_real_block_) {
1287
- // Should be scan in increasing key range.
1288
- // All blocks before cur_data_block_idx_ are not pinned anymore.
1289
- assert(multi_scan_->cur_data_block_idx < block_idx);
1290
- }
1291
-
1292
- ResetDataIter();
1293
-
1294
- if (MultiScanLoadDataBlock(block_idx)) {
1295
- return;
1296
- }
1297
- }
1298
-
1299
- // Move current data block index forward until block_idx, meantime, unpin all
1300
- // the blocks in between
1301
- while (multi_scan_->cur_data_block_idx < block_idx) {
1302
- // unpin block
1303
- if (!multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx]
1304
- .IsEmpty()) {
1305
- multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx].Reset();
1306
- }
1307
- multi_scan_->cur_data_block_idx++;
1308
- }
1309
- block_iter_points_to_real_block_ = true;
1310
- block_iter_.Seek(*seek_target);
1311
- FindKeyForward();
1312
- CheckOutOfBound();
1313
- }
1314
-
1315
- void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
1316
- assert(multi_scan_);
1317
- assert(multi_scan_->next_scan_idx >= 1);
1318
- const auto cur_scan_end_idx = std::get<1>(
1319
- multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx - 1]);
1320
- do {
1321
- if (!block_iter_.status().ok()) {
1322
- return;
1323
- }
1324
-
1325
- // If is_out_of_bound_ is true, upper layer (LevelIterator) considers this
1326
- // level has reached iterate_upper_bound_ and will not continue to iterate
1327
- // into the next file. When we are doing the last scan within a MultiScan
1328
- // for this file, it may need to continue to scan into the next file, so
1329
- // we do not set is_out_of_bound_ in this case.
1330
- if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
1331
- MarkPreparedRangeExhausted();
1332
- return;
1333
- }
1334
- // Move to the next pinned data block
1335
- ResetDataIter();
1336
- // Unpin previous block if it is not reset by data iterator
1337
- if (!multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx]
1338
- .IsEmpty()) {
1339
- multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx].Reset();
1340
- }
1341
- ++multi_scan_->cur_data_block_idx;
1342
-
1343
- if (MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx)) {
1344
- return;
1345
- }
1346
-
1347
- block_iter_points_to_real_block_ = true;
1348
- block_iter_.SeekToFirst();
1349
- } while (!block_iter_.Valid());
1350
- }
1351
-
1352
- Status BlockBasedTableIterator::PollForBlock(size_t idx) {
1353
- assert(multi_scan_);
1354
- const auto async_idx = multi_scan_->block_idx_to_readreq_idx.find(idx);
1355
- if (async_idx == multi_scan_->block_idx_to_readreq_idx.end()) {
1356
- // Did not require async read, should already be pinned.
1357
- assert(multi_scan_->pinned_data_blocks[idx].GetValue());
1358
- return Status::OK();
1359
- }
1360
-
1361
- AsyncReadState& async_read = multi_scan_->async_states[async_idx->second];
1362
- if (async_read.finished) {
1363
- assert(async_read.io_handle == nullptr);
1364
- assert(async_read.status.ok());
1365
- return async_read.status;
1366
- }
1087
+ // Successful Prepare. Create MultiScanIndexIterator and swap it in as
1088
+ // the index iterator. The original index_iter_ is saved for restoration
1089
+ // on backward operations.
1090
+ // Note: data_block_separators keeps full size for seek logic, even though
1091
+ // only blocks up to prefetch_max_idx are actually prefetched.
1092
+ auto multi_scan_idx_iter = std::make_unique<MultiScanIndexIterator>(
1093
+ std::move(scan_block_handles), std::move(data_block_separators),
1094
+ std::move(block_index_ranges_per_scan), multiscan_opts, read_set,
1095
+ prefetch_max_idx, icomp_, table_->GetStatistics());
1096
+ assert(multi_scan_idx_iter->status().ok());
1367
1097
 
1368
- {
1369
- std::vector<void*> handles = {async_read.io_handle};
1370
- Status poll_s =
1371
- table_->get_rep()->ioptions.env->GetFileSystem()->Poll(handles, 1);
1372
- if (!poll_s.ok()) {
1373
- return poll_s;
1374
- }
1375
- }
1376
- assert(async_read.status.ok());
1377
- if (!async_read.status.ok()) {
1378
- return async_read.status;
1379
- }
1380
- async_read.CleanUpIOHandle();
1098
+ multi_scan_read_set_ = std::move(read_set);
1099
+ multi_scan_index_iter_ = multi_scan_idx_iter.get();
1100
+ prefetch_max_idx_ = prefetch_max_idx;
1101
+ original_index_iter_ = std::move(index_iter_);
1102
+ index_iter_ = std::move(multi_scan_idx_iter);
1381
1103
 
1382
- // Initialize and pin blocks from async read result.
1383
- for (size_t i = 0; i < async_read.blocks.size(); ++i) {
1384
- const auto& block = async_read.blocks[i];
1385
-
1386
- Status s = CreateAndPinBlockFromBuffer(
1387
- block, async_read.offset, async_read.result,
1388
- multi_scan_->pinned_data_blocks[async_read.block_indices[i]]);
1389
-
1390
- if (!s.ok()) {
1391
- return s;
1392
- }
1393
- assert(multi_scan_->pinned_data_blocks[async_read.block_indices[i]]
1394
- .GetValue());
1395
- }
1396
- assert(multi_scan_->pinned_data_blocks[idx].GetValue());
1397
- return Status::OK();
1398
- }
1399
-
1400
- Status BlockBasedTableIterator::CreateAndPinBlockFromBuffer(
1401
- const BlockHandle& block, uint64_t buffer_start_offset,
1402
- const Slice& buffer_data, CachableEntry<Block>& pinned_block_entry) {
1403
- // Get decompressor and handle dictionary loading
1404
- UnownedPtr<Decompressor> decompressor = table_->get_rep()->decompressor.get();
1405
- CachableEntry<DecompressorDict> cached_dict;
1406
-
1407
- if (table_->get_rep()->uncompression_dict_reader) {
1408
- {
1409
- Status s =
1410
- table_->get_rep()
1411
- ->uncompression_dict_reader->GetOrReadUncompressionDictionary(
1412
- /* prefetch_buffer= */ nullptr, read_options_,
1413
- /* get_context= */ nullptr, /* lookup_context= */ nullptr,
1414
- &cached_dict);
1415
- if (!s.ok()) {
1416
- #ifndef NDEBUG
1417
- fprintf(stdout, "Prepare dictionary loading failed with %s\n",
1418
- s.ToString().c_str());
1419
- #endif
1420
- return s;
1421
- }
1422
- }
1423
- if (!cached_dict.GetValue()) {
1424
- #ifndef NDEBUG
1425
- fprintf(stdout, "Success but no dictionary read\n");
1426
- #endif
1427
- return Status::InvalidArgument("No dictionary found");
1428
- }
1429
- decompressor = cached_dict.GetValue()->decompressor_.get();
1430
- }
1431
-
1432
- // Create block from buffer data
1433
- const auto block_size_with_trailer =
1434
- BlockBasedTable::BlockSizeWithTrailer(block);
1435
- const auto block_offset_in_buffer = block.offset() - buffer_start_offset;
1436
-
1437
- CacheAllocationPtr data =
1438
- AllocateBlock(block_size_with_trailer,
1439
- GetMemoryAllocator(table_->get_rep()->table_options));
1440
- memcpy(data.get(), buffer_data.data() + block_offset_in_buffer,
1441
- block_size_with_trailer);
1442
- BlockContents tmp_contents(std::move(data), block.size());
1443
-
1444
- #ifndef NDEBUG
1445
- tmp_contents.has_trailer =
1446
- table_->get_rep()->footer.GetBlockTrailerSize() > 0;
1447
- #endif
1448
-
1449
- return table_->CreateAndPinBlockInCache<Block_kData>(
1450
- read_options_, block, decompressor, &tmp_contents,
1451
- &pinned_block_entry.As<Block_kData>());
1104
+ is_index_at_curr_block_ = false;
1105
+ block_iter_points_to_real_block_ = false;
1452
1106
  }
1453
1107
 
1454
1108
  constexpr auto kVerbose = false;
@@ -1536,245 +1190,4 @@ Status BlockBasedTableIterator::CollectBlockHandles(
1536
1190
  return Status::OK();
1537
1191
  }
1538
1192
 
1539
- Status BlockBasedTableIterator::FilterAndPinCachedBlocks(
1540
- const std::vector<BlockHandle>& scan_block_handles,
1541
- const MultiScanArgs* multiscan_opts,
1542
- std::vector<size_t>* block_indices_to_read,
1543
- std::vector<CachableEntry<Block>>* pinned_data_blocks_guard,
1544
- size_t* prefetched_max_idx) {
1545
- uint64_t total_prefetch_size = 0;
1546
- *prefetched_max_idx = scan_block_handles.size();
1547
-
1548
- for (size_t i = 0; i < scan_block_handles.size(); ++i) {
1549
- const auto& data_block_handle = scan_block_handles[i];
1550
-
1551
- total_prefetch_size +=
1552
- BlockBasedTable::BlockSizeWithTrailer(data_block_handle);
1553
- if (multiscan_opts->max_prefetch_size > 0 &&
1554
- total_prefetch_size > multiscan_opts->max_prefetch_size) {
1555
- for (size_t j = i; j < scan_block_handles.size(); ++j) {
1556
- assert((*pinned_data_blocks_guard)[j].IsEmpty());
1557
- }
1558
- *prefetched_max_idx = i;
1559
- break;
1560
- }
1561
-
1562
- Status s = table_->LookupAndPinBlocksInCache<Block_kData>(
1563
- read_options_, data_block_handle,
1564
- &(*pinned_data_blocks_guard)[i].As<Block_kData>());
1565
-
1566
- if (!s.ok()) {
1567
- // Abort: block cache look up failed.
1568
- return s;
1569
- }
1570
- if (!(*pinned_data_blocks_guard)[i].GetValue()) {
1571
- // Block not in cache
1572
- block_indices_to_read->emplace_back(i);
1573
- }
1574
- }
1575
- return Status::OK();
1576
- }
1577
-
1578
- void BlockBasedTableIterator::PrepareIORequests(
1579
- const std::vector<size_t>& block_indices_to_read,
1580
- const std::vector<BlockHandle>& scan_block_handles,
1581
- const MultiScanArgs* multiscan_opts, std::vector<FSReadRequest>* read_reqs,
1582
- UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
1583
- std::vector<std::vector<size_t>>* coalesced_block_indices) {
1584
- assert(coalesced_block_indices->empty());
1585
- coalesced_block_indices->resize(1);
1586
-
1587
- for (const auto& block_idx : block_indices_to_read) {
1588
- if (!coalesced_block_indices->back().empty()) {
1589
- // Check if we can coalesce.
1590
- const auto& last_block_handle =
1591
- scan_block_handles[coalesced_block_indices->back().back()];
1592
- uint64_t last_block_end =
1593
- last_block_handle.offset() +
1594
- BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
1595
- uint64_t current_start = scan_block_handles[block_idx].offset();
1596
-
1597
- if (current_start >
1598
- last_block_end + multiscan_opts->io_coalesce_threshold) {
1599
- // new IO
1600
- coalesced_block_indices->emplace_back();
1601
- }
1602
- }
1603
- coalesced_block_indices->back().emplace_back(block_idx);
1604
- }
1605
-
1606
- assert(read_reqs->empty());
1607
- read_reqs->reserve(coalesced_block_indices->size());
1608
- for (const auto& block_indices : *coalesced_block_indices) {
1609
- assert(block_indices.size());
1610
- const auto& first_block_handle = scan_block_handles[block_indices[0]];
1611
- const auto& last_block_handle = scan_block_handles[block_indices.back()];
1612
-
1613
- const auto start_offset = first_block_handle.offset();
1614
- const auto end_offset =
1615
- last_block_handle.offset() +
1616
- BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
1617
- #ifndef NDEBUG
1618
- // Debug print for failing the assertion below.
1619
- if (start_offset >= end_offset) {
1620
- fprintf(stderr, "scan_block_handles: ");
1621
- for (const auto& block : scan_block_handles) {
1622
- fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
1623
- block.offset(), block.size());
1624
- }
1625
- fprintf(stderr,
1626
- "\nfirst block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
1627
- first_block_handle.offset(), first_block_handle.size());
1628
- fprintf(stderr, "last block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
1629
- last_block_handle.offset(), last_block_handle.size());
1630
-
1631
- fprintf(stderr, "coalesced_block_indices: ");
1632
- for (const auto& b : *coalesced_block_indices) {
1633
- fprintf(stderr, "[");
1634
- for (const auto& block_idx : b) {
1635
- fprintf(stderr, "%zu ", block_idx);
1636
- }
1637
- fprintf(stderr, "] ");
1638
- }
1639
- fprintf(stderr, "\ncurrent blocks: ");
1640
- for (const auto& block_idx : block_indices) {
1641
- fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
1642
- scan_block_handles[block_idx].offset(),
1643
- scan_block_handles[block_idx].size());
1644
- }
1645
- fprintf(stderr, "\n");
1646
- }
1647
- #endif // NDEBUG
1648
- assert(end_offset > start_offset);
1649
-
1650
- read_reqs->emplace_back();
1651
- read_reqs->back().offset = start_offset;
1652
- read_reqs->back().len = end_offset - start_offset;
1653
-
1654
- if (multiscan_opts->use_async_io) {
1655
- for (const auto& block_idx : block_indices) {
1656
- (*block_idx_to_readreq_idx)[block_idx] = read_reqs->size() - 1;
1657
- }
1658
- }
1659
- }
1660
- }
1661
-
1662
- Status BlockBasedTableIterator::ExecuteIO(
1663
- const std::vector<BlockHandle>& scan_block_handles,
1664
- const MultiScanArgs* multiscan_opts,
1665
- const std::vector<std::vector<size_t>>& coalesced_block_indices,
1666
- std::vector<FSReadRequest>* read_reqs,
1667
- std::vector<AsyncReadState>* async_states,
1668
- std::vector<CachableEntry<Block>>* pinned_data_blocks_guard) {
1669
- IOOptions io_opts;
1670
- Status s;
1671
- s = table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
1672
- if (!s.ok()) {
1673
- // Abort: PrepareIOOptions failed
1674
- return s;
1675
- }
1676
- const bool direct_io = table_->get_rep()->file->use_direct_io();
1677
-
1678
- if (multiscan_opts->use_async_io) {
1679
- async_states->resize(read_reqs->size());
1680
- for (size_t i = 0; i < read_reqs->size(); ++i) {
1681
- auto& read_req = (*read_reqs)[i];
1682
- auto& async_read = (*async_states)[i];
1683
-
1684
- async_read.finished = false;
1685
- async_read.offset = read_req.offset;
1686
- async_read.block_indices = coalesced_block_indices[i];
1687
- for (const auto idx : coalesced_block_indices[i]) {
1688
- async_read.blocks.emplace_back(scan_block_handles[idx]);
1689
- }
1690
-
1691
- if (direct_io) {
1692
- read_req.scratch = nullptr;
1693
- } else {
1694
- async_read.buf.reset(new char[read_req.len]);
1695
- read_req.scratch = async_read.buf.get();
1696
- }
1697
-
1698
- auto cb = std::bind(&BlockBasedTableIterator::PrepareReadAsyncCallBack,
1699
- this, std::placeholders::_1, std::placeholders::_2);
1700
- // TODO: for mmap, io_handle will not be set but callback will already
1701
- // be called.
1702
- s = table_->get_rep()->file.get()->ReadAsync(
1703
- read_req, io_opts, cb, &async_read, &async_read.io_handle,
1704
- &async_read.del_fn, direct_io ? &async_read.aligned_buf : nullptr);
1705
- if (!s.ok()) {
1706
- #ifndef NDEBUG
1707
- fprintf(stderr, "ReadAsync failed with %s\n", s.ToString().c_str());
1708
- #endif
1709
- assert(false);
1710
- return s;
1711
- }
1712
- for (auto& req : *read_reqs) {
1713
- if (!req.status.ok()) {
1714
- assert(false);
1715
- // Silence compiler warning about NRVO
1716
- s = req.status;
1717
- return s;
1718
- }
1719
- }
1720
- }
1721
- } else {
1722
- // Synchronous IO using MultiRead
1723
- std::unique_ptr<char[]> buf;
1724
-
1725
- if (direct_io) {
1726
- for (auto& read_req : *read_reqs) {
1727
- read_req.scratch = nullptr;
1728
- }
1729
- } else {
1730
- // TODO: optimize if FSSupportedOps::kFSBuffer is supported.
1731
- size_t total_len = 0;
1732
- for (const auto& req : *read_reqs) {
1733
- total_len += req.len;
1734
- }
1735
- buf.reset(new char[total_len]);
1736
- size_t offset = 0;
1737
- for (auto& read_req : *read_reqs) {
1738
- read_req.scratch = buf.get() + offset;
1739
- offset += read_req.len;
1740
- }
1741
- }
1742
-
1743
- AlignedBuf aligned_buf;
1744
- s = table_->get_rep()->file->MultiRead(io_opts, read_reqs->data(),
1745
- read_reqs->size(),
1746
- direct_io ? &aligned_buf : nullptr);
1747
- if (!s.ok()) {
1748
- return s;
1749
- }
1750
- for (auto& req : *read_reqs) {
1751
- if (!req.status.ok()) {
1752
- // Silence compiler warning about NRVO
1753
- s = req.status;
1754
- return s;
1755
- }
1756
- }
1757
-
1758
- // Init blocks and pin them in block cache.
1759
- assert(read_reqs->size() == coalesced_block_indices.size());
1760
- for (size_t i = 0; i < coalesced_block_indices.size(); i++) {
1761
- const auto& read_req = (*read_reqs)[i];
1762
- for (const auto& block_idx : coalesced_block_indices[i]) {
1763
- const auto& block = scan_block_handles[block_idx];
1764
-
1765
- assert((*pinned_data_blocks_guard)[block_idx].IsEmpty());
1766
- s = CreateAndPinBlockFromBuffer(block, read_req.offset, read_req.result,
1767
- (*pinned_data_blocks_guard)[block_idx]);
1768
- if (!s.ok()) {
1769
- assert(false);
1770
- // Abort: failed to create and pin block in cache
1771
- return s;
1772
- }
1773
- assert((*pinned_data_blocks_guard)[block_idx].GetValue());
1774
- }
1775
- }
1776
- }
1777
- return s;
1778
- }
1779
-
1780
1193
  } // namespace ROCKSDB_NAMESPACE