@nxtedition/rocksdb 13.5.13 → 14.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232) hide show
  1. package/binding.cc +33 -2
  2. package/binding.gyp +2 -2
  3. package/chained-batch.js +9 -16
  4. package/deps/rocksdb/rocksdb/BUCK +18 -1
  5. package/deps/rocksdb/rocksdb/CMakeLists.txt +10 -3
  6. package/deps/rocksdb/rocksdb/Makefile +20 -9
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +90 -13
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +88 -75
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.h +44 -36
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +184 -148
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +5 -11
  12. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +116 -47
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +3 -6
  15. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +1 -1
  16. package/deps/rocksdb/rocksdb/db/builder.cc +4 -2
  17. package/deps/rocksdb/rocksdb/db/c.cc +207 -0
  18. package/deps/rocksdb/rocksdb/db/c_test.c +72 -0
  19. package/deps/rocksdb/rocksdb/db/column_family.cc +3 -2
  20. package/deps/rocksdb/rocksdb/db/column_family.h +5 -0
  21. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +2 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +51 -38
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +29 -12
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +5 -10
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +566 -366
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +131 -4
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +1 -0
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +7 -0
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +4 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +13 -14
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +12 -7
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -10
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +97 -76
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +11 -14
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +1 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +16 -3
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +1 -0
  40. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +448 -1
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +22 -20
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +4 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +5 -5
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +7 -3
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -1
  46. package/deps/rocksdb/rocksdb/db/db_iter.cc +104 -0
  47. package/deps/rocksdb/rocksdb/db/db_iter.h +4 -11
  48. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +331 -58
  49. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +129 -0
  50. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +64 -0
  51. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +40 -0
  52. package/deps/rocksdb/rocksdb/db/db_test2.cc +25 -15
  53. package/deps/rocksdb/rocksdb/db/db_test_util.cc +42 -24
  54. package/deps/rocksdb/rocksdb/db/db_test_util.h +29 -14
  55. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +69 -36
  56. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +0 -1
  57. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  58. package/deps/rocksdb/rocksdb/db/experimental.cc +5 -4
  59. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +8 -1
  60. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +275 -79
  61. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +23 -5
  62. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +591 -175
  63. package/deps/rocksdb/rocksdb/db/flush_job.cc +3 -4
  64. package/deps/rocksdb/rocksdb/db/log_reader.cc +5 -2
  65. package/deps/rocksdb/rocksdb/db/memtable.cc +84 -35
  66. package/deps/rocksdb/rocksdb/db/memtable.h +39 -34
  67. package/deps/rocksdb/rocksdb/db/merge_helper.cc +1 -0
  68. package/deps/rocksdb/rocksdb/db/merge_operator.cc +1 -1
  69. package/deps/rocksdb/rocksdb/db/multi_scan.cc +11 -5
  70. package/deps/rocksdb/rocksdb/db/version_edit.cc +1 -1
  71. package/deps/rocksdb/rocksdb/db/version_edit.h +1 -1
  72. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +34 -14
  73. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +28 -5
  74. package/deps/rocksdb/rocksdb/db/version_set.cc +159 -14
  75. package/deps/rocksdb/rocksdb/db/version_set.h +2 -0
  76. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -1
  77. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +60 -0
  78. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +16 -1
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +75 -10
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.cc +28 -0
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +2 -0
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +31 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +50 -2
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +57 -0
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +0 -4
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +266 -35
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +5 -0
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +0 -6
  89. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +18 -2
  90. package/deps/rocksdb/rocksdb/env/env.cc +12 -0
  91. package/deps/rocksdb/rocksdb/env/env_test.cc +18 -0
  92. package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +2 -0
  93. package/deps/rocksdb/rocksdb/env/fs_posix.cc +9 -5
  94. package/deps/rocksdb/rocksdb/env/io_posix.cc +4 -2
  95. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +19 -0
  96. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +33 -31
  97. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +42 -9
  98. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +93 -0
  99. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +43 -49
  100. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +4 -3
  101. package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +8 -6
  102. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +487 -0
  103. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +11 -12
  104. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +135 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -0
  106. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +12 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -1
  108. package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +8 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +12 -8
  110. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +3 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +19 -9
  112. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +219 -24
  113. package/deps/rocksdb/rocksdb/include/rocksdb/point_lock_bench_tool.h +14 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +2 -2
  115. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +1 -1
  116. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +7 -0
  117. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +16 -0
  118. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +16 -4
  119. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +13 -0
  120. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +4 -0
  121. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +0 -2
  122. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +45 -0
  123. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +1 -1
  124. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +1 -1
  125. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +6 -1
  126. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +21 -0
  127. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  128. package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +3 -3
  129. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +77 -51
  130. package/deps/rocksdb/rocksdb/memtable/skiplist.h +10 -13
  131. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +16 -7
  132. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +9 -4
  133. package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +2 -0
  134. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +6 -0
  135. package/deps/rocksdb/rocksdb/options/cf_options.cc +13 -1
  136. package/deps/rocksdb/rocksdb/options/cf_options.h +6 -2
  137. package/deps/rocksdb/rocksdb/options/options.cc +2 -0
  138. package/deps/rocksdb/rocksdb/options/options_helper.cc +9 -8
  139. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +9 -5
  140. package/deps/rocksdb/rocksdb/port/mmap.cc +1 -1
  141. package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +51 -0
  142. package/deps/rocksdb/rocksdb/port/win/xpress_win.h +4 -0
  143. package/deps/rocksdb/rocksdb/src.mk +8 -2
  144. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1125 -765
  145. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +35 -24
  146. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +29 -4
  147. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +732 -256
  148. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +225 -16
  149. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +102 -26
  150. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -1
  151. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +2 -75
  152. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +433 -141
  153. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +2 -0
  154. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +17 -10
  155. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy_impl.h +20 -0
  156. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +112 -85
  157. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +191 -36
  158. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +2 -2
  159. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  160. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +108 -31
  161. package/deps/rocksdb/rocksdb/table/external_table.cc +7 -3
  162. package/deps/rocksdb/rocksdb/table/format.cc +6 -12
  163. package/deps/rocksdb/rocksdb/table/format.h +10 -0
  164. package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
  165. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +1 -1
  166. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -1
  167. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +5 -0
  168. package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -1
  169. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +118 -46
  170. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +9 -8
  171. package/deps/rocksdb/rocksdb/table/table_builder.h +5 -0
  172. package/deps/rocksdb/rocksdb/table/table_properties.cc +16 -0
  173. package/deps/rocksdb/rocksdb/table/table_test.cc +1540 -155
  174. package/deps/rocksdb/rocksdb/test_util/testutil.h +21 -5
  175. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +26 -5
  176. package/deps/rocksdb/rocksdb/tools/ldb.cc +1 -2
  177. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +2 -0
  178. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -3
  179. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +133 -165
  180. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +173 -64
  181. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +69 -0
  182. package/deps/rocksdb/rocksdb/util/atomic.h +6 -0
  183. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +29 -20
  184. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +10 -6
  185. package/deps/rocksdb/rocksdb/util/bit_fields.h +338 -0
  186. package/deps/rocksdb/rocksdb/util/coding.h +3 -3
  187. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -2
  188. package/deps/rocksdb/rocksdb/util/compression.cc +777 -82
  189. package/deps/rocksdb/rocksdb/util/compression.h +5 -0
  190. package/deps/rocksdb/rocksdb/util/compression_test.cc +5 -3
  191. package/deps/rocksdb/rocksdb/util/dynamic_bloom.cc +2 -2
  192. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +15 -14
  193. package/deps/rocksdb/rocksdb/util/interval_test.cc +102 -0
  194. package/deps/rocksdb/rocksdb/util/semaphore.h +164 -0
  195. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +10 -6
  196. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -2
  197. package/deps/rocksdb/rocksdb/util/slice_test.cc +136 -0
  198. package/deps/rocksdb/rocksdb/util/status.cc +1 -0
  199. package/deps/rocksdb/rocksdb/util/string_util.cc +2 -16
  200. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +1 -1
  201. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -1
  202. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +7 -4
  203. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +35 -14
  204. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_test.cc +2 -0
  205. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +5 -2
  206. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/any_lock_manager_test.h +244 -0
  207. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench.cc +18 -0
  208. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench_tool.cc +159 -0
  209. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +1244 -161
  210. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +66 -12
  211. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_stress_test.cc +103 -0
  212. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +1275 -8
  213. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +40 -262
  214. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test_common.h +78 -0
  215. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_validation_test_runner.h +469 -0
  216. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +2 -6
  217. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +4 -0
  218. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +9 -1
  219. package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +18 -9
  220. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +2 -0
  221. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +2 -1
  222. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +72 -44
  223. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +92 -15
  224. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +6 -20
  225. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +143 -112
  226. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +23 -16
  227. package/index.js +3 -3
  228. package/package.json +1 -1
  229. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  230. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  231. package/util.h +38 -12
  232. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc +0 -17
@@ -37,14 +37,14 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
37
37
  bool async_prefetch) {
38
38
  // TODO(hx235): set `seek_key_prefix_for_readahead_trimming_`
39
39
  // even when `target == nullptr` that is when `SeekToFirst()` is called
40
+ if (!multi_scan_status_.ok()) {
41
+ return;
42
+ }
40
43
  if (multi_scan_) {
41
- if (SeekMultiScan(target)) {
42
- return;
43
- }
44
+ SeekMultiScan(target);
45
+ return;
44
46
  }
45
47
 
46
- assert(!multi_scan_);
47
-
48
48
  if (target != nullptr && prefix_extractor_ &&
49
49
  read_options_.prefix_same_as_start) {
50
50
  const Slice& seek_user_key = ExtractUserKey(*target);
@@ -919,351 +919,827 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
919
919
  ResetPreviousBlockOffset();
920
920
  }
921
921
 
922
+ BlockBasedTableIterator::MultiScanState::~MultiScanState() {
923
+ // Abort any pending async IO operations to prevent callback being called
924
+ // after async read states are destructed.
925
+ if (!async_states.empty()) {
926
+ std::vector<void*> io_handles_to_abort;
927
+ std::vector<AsyncReadState*> states_to_cleanup;
928
+
929
+ // Collect all pending IO handles
930
+ for (size_t i = 0; i < async_states.size(); ++i) {
931
+ auto& async_read = async_states[i];
932
+
933
+ if (async_read.io_handle != nullptr) {
934
+ assert(!async_read.finished);
935
+ io_handles_to_abort.push_back(async_read.io_handle);
936
+ states_to_cleanup.push_back(&async_read);
937
+ }
938
+ }
939
+
940
+ if (!io_handles_to_abort.empty()) {
941
+ IOStatus abort_status = fs->AbortIO(io_handles_to_abort);
942
+ if (!abort_status.ok()) {
943
+ #ifndef NDEBUG
944
+ fprintf(stderr, "Error aborting async IO operations: %s\n",
945
+ abort_status.ToString().c_str());
946
+ #endif
947
+ assert(false);
948
+ }
949
+ (void)abort_status; // Suppress unused variable warning
950
+ }
951
+
952
+ for (auto async_read : states_to_cleanup) {
953
+ async_read->CleanUpIOHandle();
954
+ }
955
+ }
956
+ }
957
+
922
958
  // Note:
923
959
  // - Iterator should not be reused for multiple multiscans or mixing
924
960
  // multiscan with regular iterator usage.
925
961
  // - scan ranges should be non-overlapping, and have increasing start keys.
926
962
  // If a scan range's limit is not set, then there should only be one scan range.
927
963
  // - After Prepare(), the iterator expects Seek to be called on the start key
928
- // of each ScanOption in order. If any other seek is done, the optimization here
929
- // is aborted and fall back to vanilla iterator.
964
+ // of each ScanOption in order. If any other Seek is done, an error status is
965
+ // returned
966
+ // - Whenever all blocks of a scan opt are exhausted, the iterator will become
967
+ // invalid and UpperBoundCheckResult() will return kOutOfBound. So that the
968
+ // upper layer (LevelIterator) will stop scanning instead of thinking EOF is
969
+ // reached and continue into the next file. The only exception is for the last
970
+ // scan opt. If we reach the end of the last scan opt, UpperBoundCheckResult()
971
+ // will return kUnknown instead of kOutOfBound. This mechanism requires that
972
+ // scan opts are properly pruned such that there is no scan opt that is after
973
+ // this file's key range.
930
974
  // FIXME: DBIter and MergingIterator may
931
975
  // internally do Seek() on child iterators, e.g. due to
932
976
  // ReadOptions::max_skippable_internal_keys or reseeking into range deletion
933
- // end key. So these Seeks can cause iterator to fall back to normal
934
- // (non-prepared) iterator and ignore the optimizations done in Prepare().
935
- void BlockBasedTableIterator::Prepare(
936
- const std::vector<ScanOptions>* scan_opts) {
937
- index_iter_->Prepare(scan_opts);
938
-
977
+ // end key. These Seeks will be handled properly, as long as the target is
978
+ // moving forward.
979
+ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
939
980
  assert(!multi_scan_);
981
+ if (!index_iter_->status().ok()) {
982
+ multi_scan_status_ = index_iter_->status();
983
+ return;
984
+ }
940
985
  if (multi_scan_) {
941
986
  multi_scan_.reset();
987
+ multi_scan_status_ = Status::InvalidArgument("Prepare already called");
942
988
  return;
943
989
  }
944
- if (scan_opts == nullptr || scan_opts->empty()) {
990
+
991
+ index_iter_->Prepare(multiscan_opts);
992
+
993
+ std::vector<BlockHandle> scan_block_handles;
994
+ std::vector<std::string> data_block_separators;
995
+ std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
996
+ const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
997
+ multi_scan_status_ =
998
+ CollectBlockHandles(scan_opts, &scan_block_handles,
999
+ &block_index_ranges_per_scan, &data_block_separators);
1000
+ if (!multi_scan_status_.ok()) {
945
1001
  return;
946
1002
  }
947
- const bool has_limit = scan_opts->front().range.limit.has_value();
948
- if (!has_limit && scan_opts->size() > 1) {
949
- // Abort: overlapping ranges
1003
+
1004
+ // Pin already cached blocks, collect remaining blocks to read
1005
+ std::vector<size_t> block_indices_to_read;
1006
+ std::vector<CachableEntry<Block>> pinned_data_blocks_guard(
1007
+ scan_block_handles.size());
1008
+ size_t prefetched_max_idx;
1009
+ multi_scan_status_ = FilterAndPinCachedBlocks(
1010
+ scan_block_handles, multiscan_opts, &block_indices_to_read,
1011
+ &pinned_data_blocks_guard, &prefetched_max_idx);
1012
+ if (!multi_scan_status_.ok()) {
950
1013
  return;
951
1014
  }
952
1015
 
953
- // Validate scan ranges to be increasing and with limit.
954
- for (size_t i = 0; i < scan_opts->size(); ++i) {
955
- const auto& scan_range = (*scan_opts)[i].range;
956
- if (!scan_range.start.has_value()) {
957
- // Abort: no start key
1016
+ std::vector<AsyncReadState> async_states;
1017
+ // Maps from block index into async read request (index into async_states[])
1018
+ UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
1019
+ if (!block_indices_to_read.empty()) {
1020
+ std::vector<FSReadRequest> read_reqs;
1021
+ std::vector<std::vector<size_t>> coalesced_block_indices;
1022
+ PrepareIORequests(block_indices_to_read, scan_block_handles, multiscan_opts,
1023
+ &read_reqs, &block_idx_to_readreq_idx,
1024
+ &coalesced_block_indices);
1025
+
1026
+ multi_scan_status_ =
1027
+ ExecuteIO(scan_block_handles, multiscan_opts, coalesced_block_indices,
1028
+ &read_reqs, &async_states, &pinned_data_blocks_guard);
1029
+ if (!multi_scan_status_.ok()) {
958
1030
  return;
959
1031
  }
1032
+ }
1033
+
1034
+ // Successful Prepare, init related states so the iterator reads from prepared
1035
+ // blocks.
1036
+ multi_scan_ = std::make_unique<MultiScanState>(
1037
+ table_->get_rep()->ioptions.env->GetFileSystem(), multiscan_opts,
1038
+ std::move(pinned_data_blocks_guard), std::move(data_block_separators),
1039
+ std::move(block_index_ranges_per_scan),
1040
+ std::move(block_idx_to_readreq_idx), std::move(async_states),
1041
+ prefetched_max_idx);
960
1042
 
961
- // Assume for each scan range start <= limit.
962
- if (scan_range.limit.has_value()) {
963
- assert(user_comparator_.Compare(scan_range.start.value(),
964
- scan_range.limit.value()) <= 0);
1043
+ is_index_at_curr_block_ = false;
1044
+ block_iter_points_to_real_block_ = false;
1045
+ }
1046
+
1047
+ void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
1048
+ if (SeekMultiScanImpl(seek_target)) {
1049
+ is_out_of_bound_ = true;
1050
+ assert(!Valid());
1051
+ }
1052
+ }
1053
+
1054
+ bool BlockBasedTableIterator::SeekMultiScanImpl(const Slice* seek_target) {
1055
+ assert(multi_scan_ && multi_scan_status_.ok());
1056
+ // This is a MultiScan and Prepare() has been called.
1057
+
1058
+ // Reset out of bound on seek, if it is out of bound again, it will be set
1059
+ // properly later in the code path
1060
+ is_out_of_bound_ = false;
1061
+
1062
+ // Validate seek key with scan options
1063
+ if (!seek_target) {
1064
+ // start key must be set for multi-scan
1065
+ multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan");
1066
+ return false;
1067
+ }
1068
+
1069
+ constexpr auto out_of_bound = true;
1070
+
1071
+ // Check the case where there is no range prepared on this table
1072
+ if (multi_scan_->scan_opts->size() == 0) {
1073
+ // out of bound
1074
+ return out_of_bound;
1075
+ }
1076
+
1077
+ // Check whether seek key is moving forward.
1078
+ if (!multi_scan_->prev_seek_key_.empty()) {
1079
+ if (user_comparator_.CompareWithoutTimestamp(ExtractUserKey(*seek_target),
1080
+ /*a_has_ts=*/true,
1081
+ multi_scan_->prev_seek_key_,
1082
+ /*b_has_ts=*/false) < 0) {
1083
+ // The seek target moved backward
1084
+ multi_scan_status_ =
1085
+ Status::InvalidArgument("Unexpected seek key moving backward");
1086
+ return false;
965
1087
  }
1088
+ }
1089
+ multi_scan_->prev_seek_key_ = ExtractUserKey(*seek_target).ToString();
1090
+
1091
+ // There are still a few cases we need to handle
1092
+ // table: _____[prepared range 1]_____[prepared range 2]_____
1093
+ // seek : 1 2 3 4 5
1094
+ // Case 1: seek before the first prepared ranges, return out of bound
1095
+ // Case 2: seek at the beginning of a prepared range (expected case)
1096
+ // Case 3: seek within a prepared range (unexpected, but supported)
1097
+ // Case 4: seek between 2 of the prepared ranges, return out of bound
1098
+ // Case 5: seek after all of the prepared ranges, should move on to next file
1099
+ // The reason this could happen is due to seek key adjustment due to delete
1100
+ // range file.
1101
+ // E.g. LSM has 3 levels, each level has only 1 file:
1102
+ // L1 : key : 0---10
1103
+ // L2 : Delete range key : 0-5
1104
+ // L3 : key : 0---10
1105
+ // When a range 2-8 was prepared, the prepared key would be 2 on L3 file, but
1106
+ // the seek key would be 5, as the seek key was updated by the largest key of
1107
+ // delete range. This causes all of the cases above to be possible, when the
1108
+ // ranges are adjusted in the above examples.
1109
+
1110
+ // Allow reseek on the start of the last prepared range due to too many
1111
+ // tombstone
1112
+ multi_scan_->next_scan_idx =
1113
+ std::min(multi_scan_->next_scan_idx,
1114
+ multi_scan_->block_index_ranges_per_scan.size() - 1);
1115
+
1116
+ auto compare_next_scan_start_result =
1117
+ user_comparator_.CompareWithoutTimestamp(
1118
+ ExtractUserKey(*seek_target), /*a_has_ts=*/true,
1119
+ multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
1120
+ .range.start.value(),
1121
+ /*b_has_ts=*/false);
1122
+
1123
+ if (compare_next_scan_start_result != 0) {
1124
+ // The seek key is not exactly same as what was prepared.
1125
+ if (compare_next_scan_start_result < 0) {
1126
+ // Needs to handle Cases: 1, 3, 4
1127
+ //
1128
+ // next_scan_idx : |
1129
+ // V
1130
+ // table: _____[prepared range 1]_____[prepared range 2]_____
1131
+ // seek : 1 3 4
1132
+
1133
+ // Case 1: Seek key is before the start key of the first range
1134
+ if (multi_scan_->next_scan_idx == 0) {
1135
+ return out_of_bound;
1136
+ }
1137
+ // Case: 3, 4
1138
+ MultiScanUnexpectedSeekTarget(
1139
+ seek_target, std::get<0>(multi_scan_->block_index_ranges_per_scan
1140
+ [multi_scan_->next_scan_idx - 1]));
966
1141
 
967
- if (i > 0) {
968
- if (!scan_range.limit.has_value()) {
969
- // multiple no limit scan ranges
970
- return;
1142
+ } else {
1143
+ // Needs to handle Cases: 3, 4, 5
1144
+ // next_scan_idx :|
1145
+ // V
1146
+ // table: ____[prepared range 1]_____[prepared range 2]_____
1147
+ // seek : 3 4 5
1148
+ MultiScanUnexpectedSeekTarget(
1149
+ seek_target,
1150
+ std::get<0>(
1151
+ multi_scan_
1152
+ ->block_index_ranges_per_scan[multi_scan_->next_scan_idx]));
1153
+ }
1154
+ } else {
1155
+ if (multi_scan_->next_scan_idx >=
1156
+ multi_scan_->block_index_ranges_per_scan.size()) {
1157
+ // Seeking a range that is outside of prepared ranges.
1158
+ return out_of_bound;
1159
+ }
1160
+ // unpin block, then do a seek.
1161
+ if (multi_scan_->next_scan_idx > 0) {
1162
+ UnpinPreviousScanBlocks(multi_scan_->next_scan_idx);
1163
+ }
1164
+
1165
+ auto [cur_scan_start_idx, cur_scan_end_idx] =
1166
+ multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx];
1167
+ // We should have the data block already loaded
1168
+ ++multi_scan_->next_scan_idx;
1169
+ if (cur_scan_start_idx >= cur_scan_end_idx) {
1170
+ if (multi_scan_->next_scan_idx <
1171
+ multi_scan_->block_index_ranges_per_scan.size()) {
1172
+ return out_of_bound;
1173
+ } else {
1174
+ ResetDataIter();
1175
+ return false;
971
1176
  }
1177
+ } else {
1178
+ is_out_of_bound_ = false;
1179
+ }
1180
+
1181
+ MultiScanSeekTargetFromBlock(seek_target, cur_scan_start_idx);
1182
+ }
1183
+
1184
+ return false;
1185
+ }
1186
+
1187
+ void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
1188
+ const Slice* seek_target, size_t block_idx) {
1189
+ // linear search the block that contains the seek target, and unpin blocks
1190
+ // that are before it.
1191
+ auto const& data_block_separators = multi_scan_->data_block_separators;
1192
+ while (block_idx < data_block_separators.size() &&
1193
+ (user_comparator_.CompareWithoutTimestamp(
1194
+ ExtractUserKey(*seek_target), /*a_has_ts=*/true,
1195
+ data_block_separators[block_idx],
1196
+ /*b_has_ts=*/false) > 0)) {
1197
+ if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
1198
+ multi_scan_->pinned_data_blocks[block_idx].Reset();
1199
+ }
1200
+ block_idx++;
1201
+ }
1202
+
1203
+ if (block_idx >= data_block_separators.size()) {
1204
+ // Handle case 5, when seek key is larger than the last block in the last
1205
+ // prepared range.
1206
+ ResetDataIter();
1207
+ assert(!Valid());
1208
+ return;
1209
+ }
1210
+
1211
+ // // The iterator from previous seek may have moved forward a few blocks,
1212
+ // // In that case, have block_idx catch up the cur_data_block_idx
1213
+ // // Note no need to handle block unpin, as it has been handled during
1214
+ // iterating block_idx = std::max(block_idx, multi_scan_->cur_data_block_idx);
1215
+
1216
+ // advance to the right prepared range
1217
+ while (
1218
+ multi_scan_->next_scan_idx <
1219
+ multi_scan_->block_index_ranges_per_scan.size() &&
1220
+ (user_comparator_.CompareWithoutTimestamp(
1221
+ ExtractUserKey(*seek_target), /*a_has_ts=*/true,
1222
+ multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
1223
+ .range.start.value(),
1224
+ /*b_has_ts=*/false) >= 0)) {
1225
+ multi_scan_->next_scan_idx++;
1226
+ }
1227
+
1228
+ // The current block may contain the data for the target key
1229
+ MultiScanSeekTargetFromBlock(seek_target, block_idx);
1230
+ }
1231
+
1232
+ void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
1233
+ const Slice* seek_target, size_t block_idx) {
1234
+ if (!block_iter_points_to_real_block_ ||
1235
+ multi_scan_->cur_data_block_idx != block_idx) {
1236
+ if (block_iter_points_to_real_block_) {
1237
+ // Should be scan in increasing key range.
1238
+ // All blocks before cur_data_block_idx_ are not pinned anymore.
1239
+ assert(multi_scan_->cur_data_block_idx < block_idx);
1240
+ }
1241
+
1242
+ ResetDataIter();
1243
+
1244
+ if (MultiScanLoadDataBlock(block_idx)) {
1245
+ return;
1246
+ }
1247
+ }
1248
+ multi_scan_->cur_data_block_idx = block_idx;
1249
+ block_iter_points_to_real_block_ = true;
1250
+ block_iter_.Seek(*seek_target);
1251
+ FindKeyForward();
1252
+ }
1253
+
1254
+ void BlockBasedTableIterator::UnpinPreviousScanBlocks(size_t current_scan_idx) {
1255
+ // TODO: support aborting and clearn up async IO requests, currently
1256
+ // only unpins already initialized blocks
1257
+ assert(multi_scan_);
1258
+ assert(current_scan_idx < multi_scan_->block_index_ranges_per_scan.size());
1259
+ if (current_scan_idx == 0) return;
1260
+
1261
+ auto prev_start_block_idx = std::get<0>(
1262
+ multi_scan_->block_index_ranges_per_scan[current_scan_idx - 1]);
1263
+ // Since a block can be shared between consecutive scans, we need
1264
+ // curr_start_block_idx here instead of just release blocks
1265
+ // up to the end of previous range block index.
1266
+ auto curr_start_block_idx =
1267
+ std::get<0>(multi_scan_->block_index_ranges_per_scan[current_scan_idx]);
1268
+ for (size_t block_idx = prev_start_block_idx;
1269
+ block_idx < curr_start_block_idx; ++block_idx) {
1270
+ if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
1271
+ multi_scan_->pinned_data_blocks[block_idx].Reset();
1272
+ }
1273
+ }
1274
+ }
972
1275
 
973
- const auto& last_end_key = (*scan_opts)[i - 1].range.limit.value();
974
- if (user_comparator_.Compare(scan_range.start.value(), last_end_key) <
975
- 0) {
976
- // Abort: overlapping ranges
1276
+ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
1277
+ assert(multi_scan_);
1278
+ assert(multi_scan_->next_scan_idx >= 1);
1279
+ const auto cur_scan_end_idx = std::get<1>(
1280
+ multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx - 1]);
1281
+ do {
1282
+ if (!block_iter_.status().ok()) {
1283
+ return;
1284
+ }
1285
+
1286
+ // If is_out_of_bound_ is true, upper layer (LevelIterator) considers this
1287
+ // level has reached iterate_upper_bound_ and will not continue to iterate
1288
+ // into the next file. When we are doing the last scan within a MultiScan
1289
+ // for this file, it may need to continue to scan into the next file, so
1290
+ // we do not set is_out_of_bound_ in this case.
1291
+ if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
1292
+ if (multi_scan_->next_scan_idx >=
1293
+ multi_scan_->block_index_ranges_per_scan.size()) {
1294
+ // We are done with this file, should let LevelIter advance to the
1295
+ // next file instead of ending the scan
1296
+ ResetDataIter();
1297
+ assert(!is_out_of_bound_);
1298
+ assert(!Valid());
977
1299
  return;
978
1300
  }
1301
+ // We don't ResetDataIter() here since next scan might be reading from
1302
+ // the same block. ResetDataIter() will free the underlying block cache
1303
+ // handle and we don't want the block to be unpinned.
1304
+ is_out_of_bound_ = true;
1305
+ assert(!Valid());
1306
+ return;
1307
+ }
1308
+ // Move to the next pinned data block
1309
+ ResetDataIter();
1310
+ ++multi_scan_->cur_data_block_idx;
1311
+
1312
+ if (MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx)) {
1313
+ return;
979
1314
  }
1315
+
1316
+ block_iter_points_to_real_block_ = true;
1317
+ block_iter_.SeekToFirst();
1318
+ } while (!block_iter_.Valid());
1319
+ }
1320
+
1321
+ Status BlockBasedTableIterator::PollForBlock(size_t idx) {
1322
+ assert(multi_scan_);
1323
+ const auto async_idx = multi_scan_->block_idx_to_readreq_idx.find(idx);
1324
+ if (async_idx == multi_scan_->block_idx_to_readreq_idx.end()) {
1325
+ // Did not require async read, should already be pinned.
1326
+ assert(multi_scan_->pinned_data_blocks[idx].GetValue());
1327
+ return Status::OK();
980
1328
  }
981
1329
 
982
- // Gather all relevant data block handles
983
- std::vector<BlockHandle> blocks_to_prepare;
984
- Status s;
985
- std::vector<std::tuple<size_t, size_t>> block_ranges_per_scan;
986
- for (const auto& scan_opt : *scan_opts) {
987
- size_t num_blocks = 0;
988
- // Current scan overlap the last block of the previous scan.
989
- bool check_overlap = !blocks_to_prepare.empty();
1330
+ AsyncReadState& async_read = multi_scan_->async_states[async_idx->second];
1331
+ if (async_read.finished) {
1332
+ assert(async_read.io_handle == nullptr);
1333
+ assert(async_read.status.ok());
1334
+ return async_read.status;
1335
+ }
1336
+
1337
+ {
1338
+ std::vector<void*> handles = {async_read.io_handle};
1339
+ Status poll_s =
1340
+ table_->get_rep()->ioptions.env->GetFileSystem()->Poll(handles, 1);
1341
+ if (!poll_s.ok()) {
1342
+ return poll_s;
1343
+ }
1344
+ }
1345
+ assert(async_read.status.ok());
1346
+ if (!async_read.status.ok()) {
1347
+ return async_read.status;
1348
+ }
1349
+ async_read.CleanUpIOHandle();
1350
+
1351
+ // Initialize and pin blocks from async read result.
1352
+ for (size_t i = 0; i < async_read.blocks.size(); ++i) {
1353
+ const auto& block = async_read.blocks[i];
1354
+
1355
+ Status s = CreateAndPinBlockFromBuffer(
1356
+ block, async_read.offset, async_read.result,
1357
+ multi_scan_->pinned_data_blocks[async_read.block_indices[i]]);
1358
+
1359
+ if (!s.ok()) {
1360
+ return s;
1361
+ }
1362
+ assert(multi_scan_->pinned_data_blocks[async_read.block_indices[i]]
1363
+ .GetValue());
1364
+ }
1365
+ assert(multi_scan_->pinned_data_blocks[idx].GetValue());
1366
+ return Status::OK();
1367
+ }
1368
+
1369
+ Status BlockBasedTableIterator::CreateAndPinBlockFromBuffer(
1370
+ const BlockHandle& block, uint64_t buffer_start_offset,
1371
+ const Slice& buffer_data, CachableEntry<Block>& pinned_block_entry) {
1372
+ // Get decompressor and handle dictionary loading
1373
+ UnownedPtr<Decompressor> decompressor = table_->get_rep()->decompressor.get();
1374
+ CachableEntry<DecompressorDict> cached_dict;
1375
+
1376
+ if (table_->get_rep()->uncompression_dict_reader) {
1377
+ {
1378
+ Status s =
1379
+ table_->get_rep()
1380
+ ->uncompression_dict_reader->GetOrReadUncompressionDictionary(
1381
+ /* prefetch_buffer= */ nullptr, read_options_,
1382
+ /* get_context= */ nullptr, /* lookup_context= */ nullptr,
1383
+ &cached_dict);
1384
+ if (!s.ok()) {
1385
+ #ifndef NDEBUG
1386
+ fprintf(stdout, "Prepare dictionary loading failed with %s\n",
1387
+ s.ToString().c_str());
1388
+ #endif
1389
+ return s;
1390
+ }
1391
+ }
1392
+ if (!cached_dict.GetValue()) {
1393
+ #ifndef NDEBUG
1394
+ fprintf(stdout, "Success but no dictionary read\n");
1395
+ #endif
1396
+ return Status::InvalidArgument("No dictionary found");
1397
+ }
1398
+ decompressor = cached_dict.GetValue()->decompressor_.get();
1399
+ }
1400
+
1401
+ // Create block from buffer data
1402
+ const auto block_size_with_trailer =
1403
+ BlockBasedTable::BlockSizeWithTrailer(block);
1404
+ const auto block_offset_in_buffer = block.offset() - buffer_start_offset;
1405
+
1406
+ CacheAllocationPtr data =
1407
+ AllocateBlock(block_size_with_trailer,
1408
+ GetMemoryAllocator(table_->get_rep()->table_options));
1409
+ memcpy(data.get(), buffer_data.data() + block_offset_in_buffer,
1410
+ block_size_with_trailer);
1411
+ BlockContents tmp_contents(std::move(data), block.size());
1412
+
1413
+ #ifndef NDEBUG
1414
+ tmp_contents.has_trailer =
1415
+ table_->get_rep()->footer.GetBlockTrailerSize() > 0;
1416
+ #endif
990
1417
 
991
- // Scan range is specified in user key, here we seek to the minimum internal
992
- // key with this user key.
993
- InternalKey start_key(scan_opt.range.start.value(), kMaxSequenceNumber,
994
- kValueTypeForSeek);
1418
+ return table_->CreateAndPinBlockInCache<Block_kData>(
1419
+ read_options_, block, decompressor, &tmp_contents,
1420
+ &pinned_block_entry.As<Block_kData>());
1421
+ }
1422
+
1423
+ constexpr auto kVerbose = false;
1424
+
1425
+ Status BlockBasedTableIterator::CollectBlockHandles(
1426
+ const std::vector<ScanOptions>& scan_opts,
1427
+ std::vector<BlockHandle>* scan_block_handles,
1428
+ std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
1429
+ std::vector<std::string>* data_block_separators) {
1430
+ // print file name and level
1431
+ if (kVerbose) {
1432
+ auto file_name = table_->get_rep()->file->file_name();
1433
+ auto level = table_->get_rep()->level;
1434
+ printf("file name : %s, level %d\n", file_name.c_str(), level);
1435
+ }
1436
+ for (const auto& scan_opt : scan_opts) {
1437
+ size_t num_blocks = 0;
1438
+ bool check_overlap = !scan_block_handles->empty();
1439
+
1440
+ InternalKey start_key;
1441
+ const size_t timestamp_size =
1442
+ user_comparator_.user_comparator()->timestamp_size();
1443
+ if (timestamp_size == 0) {
1444
+ start_key = InternalKey(scan_opt.range.start.value(), kMaxSequenceNumber,
1445
+ kValueTypeForSeek);
1446
+ } else {
1447
+ std::string seek_key;
1448
+ AppendKeyWithMaxTimestamp(&seek_key, scan_opt.range.start.value(),
1449
+ timestamp_size);
1450
+ start_key = InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
1451
+ }
995
1452
  index_iter_->Seek(start_key.Encode());
996
- while (index_iter_->Valid() &&
1453
+ while (index_iter_->status().ok() && index_iter_->Valid() &&
997
1454
  (!scan_opt.range.limit.has_value() ||
998
- user_comparator_.CompareWithoutTimestamp(
999
- index_iter_->user_key(),
1000
- /*a_has_ts*/ true, *scan_opt.range.limit,
1001
- /*b_has_ts=*/false) <= 0)) {
1455
+ user_comparator_.CompareWithoutTimestamp(index_iter_->user_key(),
1456
+ /*a_has_ts*/ true,
1457
+ *scan_opt.range.limit,
1458
+ /*b_has_ts=*/false) < 0)) {
1459
+ // Only add the block if the index separator is smaller than limit. When
1460
+ // they are equal or larger, it will be handled later below.
1002
1461
  if (check_overlap &&
1003
- blocks_to_prepare.back() == index_iter_->value().handle) {
1462
+ scan_block_handles->back() == index_iter_->value().handle) {
1004
1463
  // Skip the current block since it's already in the list
1005
1464
  } else {
1006
- blocks_to_prepare.push_back(index_iter_->value().handle);
1465
+ scan_block_handles->push_back(index_iter_->value().handle);
1466
+ // clone the Slice to avoid the lifetime issue
1467
+ data_block_separators->push_back(index_iter_->user_key().ToString());
1007
1468
  }
1008
1469
  ++num_blocks;
1009
1470
  index_iter_->Next();
1010
1471
  check_overlap = false;
1011
1472
  }
1012
- // Stop until index->key > limit
1013
- // Include the current block since it can still contain keys <= limit
1473
+
1474
+ if (!index_iter_->status().ok()) {
1475
+ // Abort: index iterator error
1476
+ return index_iter_->status();
1477
+ }
1478
+
1014
1479
  if (index_iter_->Valid()) {
1480
+ // Handle the last block when its separator is equal or larger than limit
1015
1481
  if (check_overlap &&
1016
- blocks_to_prepare.back() == index_iter_->value().handle) {
1482
+ scan_block_handles->back() == index_iter_->value().handle) {
1017
1483
  // Skip adding the current block since it's already in the list
1018
1484
  } else {
1019
- blocks_to_prepare.push_back(index_iter_->value().handle);
1485
+ scan_block_handles->push_back(index_iter_->value().handle);
1486
+ data_block_separators->push_back(index_iter_->user_key().ToString());
1020
1487
  }
1021
1488
  ++num_blocks;
1022
1489
  }
1023
-
1024
- if (!index_iter_->status().ok()) {
1025
- // Abort: index iterator error
1026
- return;
1490
+ block_index_ranges_per_scan->emplace_back(
1491
+ scan_block_handles->size() - num_blocks, scan_block_handles->size());
1492
+ if (kVerbose) {
1493
+ printf("separators :");
1494
+ for (const auto& separator : *data_block_separators) {
1495
+ printf("%s, ", separator.c_str());
1496
+ }
1497
+ printf("\n");
1027
1498
  }
1028
-
1029
- block_ranges_per_scan.emplace_back(blocks_to_prepare.size() - num_blocks,
1030
- blocks_to_prepare.size());
1031
1499
  }
1500
+ return Status::OK();
1501
+ }
1032
1502
 
1033
- // blocks_to_prepare has all the blocks that need to be read.
1034
- // Look up entries in cache and pin if exist.
1035
- // Store indices of blocks to read.
1036
- std::vector<size_t> blocks_to_read;
1037
- std::vector<CachableEntry<Block>> pinned_data_blocks_guard;
1038
- pinned_data_blocks_guard.resize(blocks_to_prepare.size());
1039
- for (size_t i = 0; i < blocks_to_prepare.size(); ++i) {
1040
- const auto& data_block_handle = blocks_to_prepare[i];
1041
- s = table_->LookupAndPinBlocksInCache<Block_kData>(
1503
+ Status BlockBasedTableIterator::FilterAndPinCachedBlocks(
1504
+ const std::vector<BlockHandle>& scan_block_handles,
1505
+ const MultiScanArgs* multiscan_opts,
1506
+ std::vector<size_t>* block_indices_to_read,
1507
+ std::vector<CachableEntry<Block>>* pinned_data_blocks_guard,
1508
+ size_t* prefetched_max_idx) {
1509
+ uint64_t total_prefetch_size = 0;
1510
+ *prefetched_max_idx = scan_block_handles.size();
1511
+
1512
+ for (size_t i = 0; i < scan_block_handles.size(); ++i) {
1513
+ const auto& data_block_handle = scan_block_handles[i];
1514
+
1515
+ total_prefetch_size +=
1516
+ BlockBasedTable::BlockSizeWithTrailer(data_block_handle);
1517
+ if (multiscan_opts->max_prefetch_size > 0 &&
1518
+ total_prefetch_size > multiscan_opts->max_prefetch_size) {
1519
+ for (size_t j = i; j < scan_block_handles.size(); ++j) {
1520
+ assert((*pinned_data_blocks_guard)[j].IsEmpty());
1521
+ }
1522
+ *prefetched_max_idx = i;
1523
+ break;
1524
+ }
1525
+
1526
+ Status s = table_->LookupAndPinBlocksInCache<Block_kData>(
1042
1527
  read_options_, data_block_handle,
1043
- &pinned_data_blocks_guard[i].As<Block_kData>());
1528
+ &(*pinned_data_blocks_guard)[i].As<Block_kData>());
1044
1529
 
1045
1530
  if (!s.ok()) {
1046
1531
  // Abort: block cache look up failed.
1047
- return;
1532
+ return s;
1048
1533
  }
1049
- if (!pinned_data_blocks_guard[i].GetValue()) {
1050
- // Block not in cache, will read it below.
1051
- blocks_to_read.emplace_back(i);
1534
+ if (!(*pinned_data_blocks_guard)[i].GetValue()) {
1535
+ // Block not in cache
1536
+ block_indices_to_read->emplace_back(i);
1052
1537
  }
1053
1538
  }
1539
+ return Status::OK();
1540
+ }
1054
1541
 
1055
- // Coalesce IOs
1056
- // TODO: limit prefetching size to bound memory usage.
1057
- if (!blocks_to_read.empty()) {
1058
- // Each vector correspond to blocks to read in a single read request.
1059
- // Each member in the vector is an index into blocks_to_prepare.
1060
- std::vector<std::vector<size_t>> collapsed_blocks_to_read(1);
1061
-
1062
- // TODO: make this threshold configurable
1063
- constexpr size_t kCoalesceThreshold = 16 << 10; // 16KB
1064
-
1065
- for (const auto& block_idx : blocks_to_read) {
1066
- if (!collapsed_blocks_to_read.back().empty()) {
1067
- // Check if we can coalesce.
1068
- const auto& last_block =
1069
- blocks_to_prepare[collapsed_blocks_to_read.back().back()];
1070
- uint64_t last_block_end =
1071
- last_block.offset() +
1072
- BlockBasedTable::BlockSizeWithTrailer(last_block);
1073
- uint64_t current_start = blocks_to_prepare[block_idx].offset();
1542
+ void BlockBasedTableIterator::PrepareIORequests(
1543
+ const std::vector<size_t>& block_indices_to_read,
1544
+ const std::vector<BlockHandle>& scan_block_handles,
1545
+ const MultiScanArgs* multiscan_opts, std::vector<FSReadRequest>* read_reqs,
1546
+ UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
1547
+ std::vector<std::vector<size_t>>* coalesced_block_indices) {
1548
+ assert(coalesced_block_indices->empty());
1549
+ coalesced_block_indices->resize(1);
1550
+
1551
+ for (const auto& block_idx : block_indices_to_read) {
1552
+ if (!coalesced_block_indices->back().empty()) {
1553
+ // Check if we can coalesce.
1554
+ const auto& last_block_handle =
1555
+ scan_block_handles[coalesced_block_indices->back().back()];
1556
+ uint64_t last_block_end =
1557
+ last_block_handle.offset() +
1558
+ BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
1559
+ uint64_t current_start = scan_block_handles[block_idx].offset();
1560
+
1561
+ if (current_start >
1562
+ last_block_end + multiscan_opts->io_coalesce_threshold) {
1563
+ // new IO
1564
+ coalesced_block_indices->emplace_back();
1565
+ }
1566
+ }
1567
+ coalesced_block_indices->back().emplace_back(block_idx);
1568
+ }
1074
1569
 
1075
- if (current_start > last_block_end + kCoalesceThreshold) {
1076
- // new IO
1077
- collapsed_blocks_to_read.emplace_back();
1570
+ assert(read_reqs->empty());
1571
+ read_reqs->reserve(coalesced_block_indices->size());
1572
+ for (const auto& block_indices : *coalesced_block_indices) {
1573
+ assert(block_indices.size());
1574
+ const auto& first_block_handle = scan_block_handles[block_indices[0]];
1575
+ const auto& last_block_handle = scan_block_handles[block_indices.back()];
1576
+
1577
+ const auto start_offset = first_block_handle.offset();
1578
+ const auto end_offset =
1579
+ last_block_handle.offset() +
1580
+ BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
1581
+ #ifndef NDEBUG
1582
+ // Debug print for failing the assertion below.
1583
+ if (start_offset >= end_offset) {
1584
+ fprintf(stderr, "scan_block_handles: ");
1585
+ for (const auto& block : scan_block_handles) {
1586
+ fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
1587
+ block.offset(), block.size());
1588
+ }
1589
+ fprintf(stderr,
1590
+ "\nfirst block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
1591
+ first_block_handle.offset(), first_block_handle.size());
1592
+ fprintf(stderr, "last block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
1593
+ last_block_handle.offset(), last_block_handle.size());
1594
+
1595
+ fprintf(stderr, "coalesced_block_indices: ");
1596
+ for (const auto& b : *coalesced_block_indices) {
1597
+ fprintf(stderr, "[");
1598
+ for (const auto& block_idx : b) {
1599
+ fprintf(stderr, "%zu ", block_idx);
1078
1600
  }
1601
+ fprintf(stderr, "] ");
1079
1602
  }
1080
- collapsed_blocks_to_read.back().emplace_back(block_idx);
1603
+ fprintf(stderr, "\ncurrent blocks: ");
1604
+ for (const auto& block_idx : block_indices) {
1605
+ fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
1606
+ scan_block_handles[block_idx].offset(),
1607
+ scan_block_handles[block_idx].size());
1608
+ }
1609
+ fprintf(stderr, "\n");
1081
1610
  }
1611
+ #endif // NDEBUG
1612
+ assert(end_offset > start_offset);
1082
1613
 
1083
- // do IO
1084
- IOOptions io_opts;
1085
- s = table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
1086
- if (!s.ok()) {
1087
- // Abort: PrepareIOOptions failed
1088
- return;
1614
+ read_reqs->emplace_back();
1615
+ read_reqs->back().offset = start_offset;
1616
+ read_reqs->back().len = end_offset - start_offset;
1617
+
1618
+ if (multiscan_opts->use_async_io) {
1619
+ for (const auto& block_idx : block_indices) {
1620
+ (*block_idx_to_readreq_idx)[block_idx] = read_reqs->size() - 1;
1621
+ }
1089
1622
  }
1623
+ }
1624
+ }
1090
1625
 
1091
- // Init read requests for Multi-Read
1092
- std::vector<FSReadRequest> read_reqs;
1093
- read_reqs.reserve(collapsed_blocks_to_read.size());
1094
- size_t total_len = 0;
1095
- for (const auto& blocks : collapsed_blocks_to_read) {
1096
- assert(blocks.size());
1097
- const auto& first_block = blocks_to_prepare[blocks[0]];
1098
- const auto& last_block = blocks_to_prepare[blocks.back()];
1099
-
1100
- const auto start_offset = first_block.offset();
1101
- const auto end_offset = last_block.offset() +
1102
- BlockBasedTable::BlockSizeWithTrailer(last_block);
1103
- assert(end_offset > start_offset);
1104
- FSReadRequest read_req;
1105
- read_req.offset = start_offset;
1106
- read_req.len = end_offset - start_offset;
1107
- total_len += read_req.len;
1108
- read_reqs.emplace_back(std::move(read_req));
1109
- }
1110
-
1111
- // Init buffer for read
1626
+ Status BlockBasedTableIterator::ExecuteIO(
1627
+ const std::vector<BlockHandle>& scan_block_handles,
1628
+ const MultiScanArgs* multiscan_opts,
1629
+ const std::vector<std::vector<size_t>>& coalesced_block_indices,
1630
+ std::vector<FSReadRequest>* read_reqs,
1631
+ std::vector<AsyncReadState>* async_states,
1632
+ std::vector<CachableEntry<Block>>* pinned_data_blocks_guard) {
1633
+ IOOptions io_opts;
1634
+ Status s;
1635
+ s = table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
1636
+ if (!s.ok()) {
1637
+ // Abort: PrepareIOOptions failed
1638
+ return s;
1639
+ }
1640
+ const bool direct_io = table_->get_rep()->file->use_direct_io();
1641
+
1642
+ if (multiscan_opts->use_async_io) {
1643
+ async_states->resize(read_reqs->size());
1644
+ for (size_t i = 0; i < read_reqs->size(); ++i) {
1645
+ auto& read_req = (*read_reqs)[i];
1646
+ auto& async_read = (*async_states)[i];
1647
+
1648
+ async_read.finished = false;
1649
+ async_read.offset = read_req.offset;
1650
+ async_read.block_indices = coalesced_block_indices[i];
1651
+ for (const auto idx : coalesced_block_indices[i]) {
1652
+ async_read.blocks.emplace_back(scan_block_handles[idx]);
1653
+ }
1654
+
1655
+ if (direct_io) {
1656
+ read_req.scratch = nullptr;
1657
+ } else {
1658
+ async_read.buf.reset(new char[read_req.len]);
1659
+ read_req.scratch = async_read.buf.get();
1660
+ }
1661
+
1662
+ auto cb = std::bind(&BlockBasedTableIterator::PrepareReadAsyncCallBack,
1663
+ this, std::placeholders::_1, std::placeholders::_2);
1664
+ // TODO: for mmap, io_handle will not be set but callback will already
1665
+ // be called.
1666
+ s = table_->get_rep()->file.get()->ReadAsync(
1667
+ read_req, io_opts, cb, &async_read, &async_read.io_handle,
1668
+ &async_read.del_fn, direct_io ? &async_read.aligned_buf : nullptr);
1669
+ if (!s.ok()) {
1670
+ #ifndef NDEBUG
1671
+ fprintf(stderr, "ReadAsync failed with %s\n", s.ToString().c_str());
1672
+ #endif
1673
+ assert(false);
1674
+ return s;
1675
+ }
1676
+ assert(async_read.io_handle);
1677
+ for (auto& req : *read_reqs) {
1678
+ if (!req.status.ok()) {
1679
+ assert(false);
1680
+ // Silence compiler warning about NRVO
1681
+ s = req.status;
1682
+ return s;
1683
+ }
1684
+ }
1685
+ }
1686
+ } else {
1687
+ // Synchronous IO using MultiRead
1112
1688
  std::unique_ptr<char[]> buf;
1113
- const bool direct_io = table_->get_rep()->file->use_direct_io();
1689
+
1114
1690
  if (direct_io) {
1115
- for (auto& read_req : read_reqs) {
1691
+ for (auto& read_req : *read_reqs) {
1116
1692
  read_req.scratch = nullptr;
1117
1693
  }
1118
1694
  } else {
1119
1695
  // TODO: optimize if FSSupportedOps::kFSBuffer is supported.
1696
+ size_t total_len = 0;
1697
+ for (const auto& req : *read_reqs) {
1698
+ total_len += req.len;
1699
+ }
1120
1700
  buf.reset(new char[total_len]);
1121
1701
  size_t offset = 0;
1122
- for (auto& read_req : read_reqs) {
1702
+ for (auto& read_req : *read_reqs) {
1123
1703
  read_req.scratch = buf.get() + offset;
1124
1704
  offset += read_req.len;
1125
1705
  }
1126
1706
  }
1127
1707
 
1128
1708
  AlignedBuf aligned_buf;
1129
- s = table_->get_rep()->file.get()->MultiRead(
1130
- io_opts, read_reqs.data(), read_reqs.size(),
1131
- direct_io ? &aligned_buf : nullptr);
1709
+ s = table_->get_rep()->file->MultiRead(io_opts, read_reqs->data(),
1710
+ read_reqs->size(),
1711
+ direct_io ? &aligned_buf : nullptr);
1132
1712
  if (!s.ok()) {
1133
- return;
1713
+ return s;
1134
1714
  }
1135
- for (auto& req : read_reqs) {
1715
+ for (auto& req : *read_reqs) {
1136
1716
  if (!req.status.ok()) {
1137
- return;
1717
+ // Silence compiler warning about NRVO
1718
+ s = req.status;
1719
+ return s;
1138
1720
  }
1139
1721
  }
1140
1722
 
1141
1723
  // Init blocks and pin them in block cache.
1142
- MemoryAllocator* memory_allocator =
1143
- table_->get_rep()->table_options.block_cache->memory_allocator();
1144
- for (size_t i = 0; i < collapsed_blocks_to_read.size(); i++) {
1145
- const auto& blocks = collapsed_blocks_to_read[i];
1146
- const auto& read_req = read_reqs[i];
1147
- for (const auto& block_idx : blocks) {
1148
- const auto& block = blocks_to_prepare[block_idx];
1149
- const auto block_size_with_trailer =
1150
- BlockBasedTable::BlockSizeWithTrailer(block);
1151
- const auto block_offset_in_buffer = block.offset() - read_req.offset;
1152
-
1153
- CacheAllocationPtr data =
1154
- AllocateBlock(block_size_with_trailer, memory_allocator);
1155
- memcpy(data.get(), read_req.result.data() + block_offset_in_buffer,
1156
- block_size_with_trailer);
1157
- BlockContents tmp_contents(std::move(data), block.size());
1158
-
1159
- #ifndef NDEBUG
1160
- tmp_contents.has_trailer =
1161
- table_->get_rep()->footer.GetBlockTrailerSize() > 0;
1162
- #endif
1163
- assert(pinned_data_blocks_guard[block_idx].IsEmpty());
1164
- s = table_->CreateAndPinBlockInCache<Block_kData>(
1165
- read_options_, block, &tmp_contents,
1166
- &(pinned_data_blocks_guard[block_idx].As<Block_kData>()));
1724
+ assert(read_reqs->size() == coalesced_block_indices.size());
1725
+ for (size_t i = 0; i < coalesced_block_indices.size(); i++) {
1726
+ const auto& read_req = (*read_reqs)[i];
1727
+ for (const auto& block_idx : coalesced_block_indices[i]) {
1728
+ const auto& block = scan_block_handles[block_idx];
1729
+
1730
+ assert((*pinned_data_blocks_guard)[block_idx].IsEmpty());
1731
+ s = CreateAndPinBlockFromBuffer(block, read_req.offset, read_req.result,
1732
+ (*pinned_data_blocks_guard)[block_idx]);
1167
1733
  if (!s.ok()) {
1734
+ assert(false);
1168
1735
  // Abort: failed to create and pin block in cache
1169
- return;
1736
+ return s;
1170
1737
  }
1738
+ assert((*pinned_data_blocks_guard)[block_idx].GetValue());
1171
1739
  }
1172
1740
  }
1173
1741
  }
1174
-
1175
- // Successful Prepare, init related states so the iterator reads from prepared
1176
- // blocks
1177
- multi_scan_.reset(new MultiScanState(scan_opts,
1178
- std::move(pinned_data_blocks_guard),
1179
- std::move(block_ranges_per_scan)));
1180
- is_index_at_curr_block_ = false;
1181
- block_iter_points_to_real_block_ = false;
1742
+ return s;
1182
1743
  }
1183
1744
 
1184
- bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
1185
- assert(multi_scan_);
1186
- // This is a MultiScan and Preapre() has been called.
1187
- //
1188
- // Validate seek key with scan options
1189
- if (multi_scan_->next_scan_idx >= multi_scan_->scan_opts->size()) {
1190
- multi_scan_.reset();
1191
- } else if (!target) {
1192
- // start key must be set for multi-scan
1193
- multi_scan_.reset();
1194
- } else if (user_comparator_.CompareWithoutTimestamp(
1195
- ExtractUserKey(*target), /*a_has_ts=*/true,
1196
- (*multi_scan_->scan_opts)[multi_scan_->next_scan_idx]
1197
- .range.start.value(),
1198
- /*b_has_ts=*/false) != 0) {
1199
- // Unexpected seek key
1200
- multi_scan_.reset();
1201
- } else {
1202
- auto [cur_scan_start_idx, cur_scan_end_idx] =
1203
- multi_scan_->block_ranges_per_scan[multi_scan_->next_scan_idx];
1204
- // We should have the data block already loaded
1205
- ++multi_scan_->next_scan_idx;
1206
- if (cur_scan_start_idx >= cur_scan_end_idx) {
1207
- is_out_of_bound_ = true;
1208
- assert(!Valid());
1209
- return true;
1210
- } else {
1211
- is_out_of_bound_ = false;
1212
- }
1213
-
1214
- if (!block_iter_points_to_real_block_ ||
1215
- multi_scan_->cur_data_block_idx != cur_scan_start_idx) {
1216
- if (block_iter_points_to_real_block_) {
1217
- // Should be scan in increasing key range.
1218
- // All blocks before cur_data_block_idx_ are not pinned anymore.
1219
- assert(multi_scan_->cur_data_block_idx < cur_scan_start_idx);
1220
- }
1221
-
1222
- ResetDataIter();
1223
- // Note that the block_iter_ takes ownership of the pinned data block
1224
- // TODO: we can delegate the clean up like with pinned_iters_mgr_ if
1225
- // need to pin blocks longer.
1226
- table_->NewDataBlockIterator<DataBlockIter>(
1227
- read_options_, multi_scan_->pinned_data_blocks[cur_scan_start_idx],
1228
- &block_iter_, Status::OK());
1229
- }
1230
- multi_scan_->cur_data_block_idx = cur_scan_start_idx;
1231
- block_iter_points_to_real_block_ = true;
1232
- block_iter_.Seek(*target);
1233
- FindKeyForward();
1234
- return true;
1235
- }
1236
-
1237
- return false;
1238
- }
1239
-
1240
- void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
1241
- assert(multi_scan_);
1242
- assert(multi_scan_->next_scan_idx >= 1);
1243
- const auto cur_scan_end_idx = std::get<1>(
1244
- multi_scan_->block_ranges_per_scan[multi_scan_->next_scan_idx - 1]);
1245
- do {
1246
- if (!block_iter_.status().ok()) {
1247
- return;
1248
- }
1249
-
1250
- if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
1251
- // We don't ResetDataIter() here since next scan might be reading from
1252
- // the same block. ResetDataIter() will free the underlying block cache
1253
- // handle and we don't want the block to be unpinned.
1254
- is_out_of_bound_ = true;
1255
- assert(!Valid());
1256
- return;
1257
- }
1258
- // Move to the next pinned data block
1259
- ResetDataIter();
1260
- ++multi_scan_->cur_data_block_idx;
1261
- table_->NewDataBlockIterator<DataBlockIter>(
1262
- read_options_,
1263
- multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx],
1264
- &block_iter_, Status::OK());
1265
- block_iter_points_to_real_block_ = true;
1266
- block_iter_.SeekToFirst();
1267
- } while (!block_iter_.Valid());
1268
- }
1269
1745
  } // namespace ROCKSDB_NAMESPACE