@nxtedition/rocksdb 13.5.12 → 14.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232)
  1. package/binding.cc +33 -2
  2. package/binding.gyp +2 -2
  3. package/chained-batch.js +9 -16
  4. package/deps/rocksdb/rocksdb/BUCK +18 -1
  5. package/deps/rocksdb/rocksdb/CMakeLists.txt +10 -3
  6. package/deps/rocksdb/rocksdb/Makefile +20 -9
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +90 -13
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +88 -75
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.h +44 -36
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +184 -148
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +5 -11
  12. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +116 -47
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +3 -6
  15. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +1 -1
  16. package/deps/rocksdb/rocksdb/db/builder.cc +4 -2
  17. package/deps/rocksdb/rocksdb/db/c.cc +207 -0
  18. package/deps/rocksdb/rocksdb/db/c_test.c +72 -0
  19. package/deps/rocksdb/rocksdb/db/column_family.cc +3 -2
  20. package/deps/rocksdb/rocksdb/db/column_family.h +5 -0
  21. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +2 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +51 -38
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +29 -12
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +5 -10
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +566 -366
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +131 -4
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +1 -0
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +7 -0
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +4 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +13 -14
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +12 -7
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -10
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +97 -76
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +11 -14
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +1 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +16 -3
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +1 -0
  40. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +448 -1
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +22 -20
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +4 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +5 -5
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +7 -3
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -1
  46. package/deps/rocksdb/rocksdb/db/db_iter.cc +104 -0
  47. package/deps/rocksdb/rocksdb/db/db_iter.h +4 -11
  48. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +331 -58
  49. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +129 -0
  50. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +64 -0
  51. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +40 -0
  52. package/deps/rocksdb/rocksdb/db/db_test2.cc +25 -15
  53. package/deps/rocksdb/rocksdb/db/db_test_util.cc +42 -24
  54. package/deps/rocksdb/rocksdb/db/db_test_util.h +29 -14
  55. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +69 -36
  56. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +0 -1
  57. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  58. package/deps/rocksdb/rocksdb/db/experimental.cc +5 -4
  59. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +8 -1
  60. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +275 -79
  61. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +23 -5
  62. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +591 -175
  63. package/deps/rocksdb/rocksdb/db/flush_job.cc +3 -4
  64. package/deps/rocksdb/rocksdb/db/log_reader.cc +5 -2
  65. package/deps/rocksdb/rocksdb/db/memtable.cc +84 -35
  66. package/deps/rocksdb/rocksdb/db/memtable.h +39 -34
  67. package/deps/rocksdb/rocksdb/db/merge_helper.cc +1 -0
  68. package/deps/rocksdb/rocksdb/db/merge_operator.cc +1 -1
  69. package/deps/rocksdb/rocksdb/db/multi_scan.cc +11 -5
  70. package/deps/rocksdb/rocksdb/db/version_edit.cc +1 -1
  71. package/deps/rocksdb/rocksdb/db/version_edit.h +1 -1
  72. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +34 -14
  73. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +28 -5
  74. package/deps/rocksdb/rocksdb/db/version_set.cc +159 -14
  75. package/deps/rocksdb/rocksdb/db/version_set.h +2 -0
  76. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -1
  77. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +60 -0
  78. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +16 -1
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +75 -10
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.cc +28 -0
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +2 -0
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +31 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +50 -2
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +57 -0
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +0 -4
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +266 -35
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +5 -0
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +0 -6
  89. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +18 -2
  90. package/deps/rocksdb/rocksdb/env/env.cc +12 -0
  91. package/deps/rocksdb/rocksdb/env/env_test.cc +18 -0
  92. package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +2 -0
  93. package/deps/rocksdb/rocksdb/env/fs_posix.cc +9 -5
  94. package/deps/rocksdb/rocksdb/env/io_posix.cc +4 -2
  95. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +19 -0
  96. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +33 -31
  97. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +42 -9
  98. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +93 -0
  99. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +43 -49
  100. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +4 -3
  101. package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +8 -6
  102. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +487 -0
  103. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +11 -12
  104. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +135 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -0
  106. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +12 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -1
  108. package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +8 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +12 -8
  110. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +3 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +19 -9
  112. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +219 -24
  113. package/deps/rocksdb/rocksdb/include/rocksdb/point_lock_bench_tool.h +14 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +2 -2
  115. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +1 -1
  116. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +7 -0
  117. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +16 -0
  118. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +16 -4
  119. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +13 -0
  120. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +4 -0
  121. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +0 -2
  122. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +45 -0
  123. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +1 -1
  124. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +1 -1
  125. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +6 -1
  126. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +21 -0
  127. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  128. package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +3 -3
  129. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +77 -51
  130. package/deps/rocksdb/rocksdb/memtable/skiplist.h +10 -13
  131. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +16 -7
  132. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +9 -4
  133. package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +2 -0
  134. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +6 -0
  135. package/deps/rocksdb/rocksdb/options/cf_options.cc +13 -1
  136. package/deps/rocksdb/rocksdb/options/cf_options.h +6 -2
  137. package/deps/rocksdb/rocksdb/options/options.cc +2 -0
  138. package/deps/rocksdb/rocksdb/options/options_helper.cc +9 -8
  139. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +9 -5
  140. package/deps/rocksdb/rocksdb/port/mmap.cc +1 -1
  141. package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +51 -0
  142. package/deps/rocksdb/rocksdb/port/win/xpress_win.h +4 -0
  143. package/deps/rocksdb/rocksdb/src.mk +8 -2
  144. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1125 -765
  145. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +35 -24
  146. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +29 -4
  147. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +732 -256
  148. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +225 -16
  149. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +102 -26
  150. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -1
  151. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +2 -75
  152. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +433 -141
  153. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +2 -0
  154. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +17 -10
  155. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy_impl.h +20 -0
  156. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +112 -85
  157. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +191 -36
  158. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +2 -2
  159. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  160. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +108 -31
  161. package/deps/rocksdb/rocksdb/table/external_table.cc +7 -3
  162. package/deps/rocksdb/rocksdb/table/format.cc +6 -12
  163. package/deps/rocksdb/rocksdb/table/format.h +10 -0
  164. package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
  165. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +1 -1
  166. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -1
  167. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +5 -0
  168. package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -1
  169. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +118 -46
  170. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +9 -8
  171. package/deps/rocksdb/rocksdb/table/table_builder.h +5 -0
  172. package/deps/rocksdb/rocksdb/table/table_properties.cc +16 -0
  173. package/deps/rocksdb/rocksdb/table/table_test.cc +1540 -155
  174. package/deps/rocksdb/rocksdb/test_util/testutil.h +21 -5
  175. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +26 -5
  176. package/deps/rocksdb/rocksdb/tools/ldb.cc +1 -2
  177. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +2 -0
  178. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -3
  179. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +133 -165
  180. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +173 -64
  181. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +69 -0
  182. package/deps/rocksdb/rocksdb/util/atomic.h +6 -0
  183. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +29 -20
  184. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +10 -6
  185. package/deps/rocksdb/rocksdb/util/bit_fields.h +338 -0
  186. package/deps/rocksdb/rocksdb/util/coding.h +3 -3
  187. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -2
  188. package/deps/rocksdb/rocksdb/util/compression.cc +777 -82
  189. package/deps/rocksdb/rocksdb/util/compression.h +5 -0
  190. package/deps/rocksdb/rocksdb/util/compression_test.cc +5 -3
  191. package/deps/rocksdb/rocksdb/util/dynamic_bloom.cc +2 -2
  192. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +15 -14
  193. package/deps/rocksdb/rocksdb/util/interval_test.cc +102 -0
  194. package/deps/rocksdb/rocksdb/util/semaphore.h +164 -0
  195. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +10 -6
  196. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -2
  197. package/deps/rocksdb/rocksdb/util/slice_test.cc +136 -0
  198. package/deps/rocksdb/rocksdb/util/status.cc +1 -0
  199. package/deps/rocksdb/rocksdb/util/string_util.cc +2 -16
  200. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +1 -1
  201. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -1
  202. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +7 -4
  203. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +35 -14
  204. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_test.cc +2 -0
  205. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +5 -2
  206. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/any_lock_manager_test.h +244 -0
  207. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench.cc +18 -0
  208. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench_tool.cc +159 -0
  209. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +1244 -161
  210. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +66 -12
  211. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_stress_test.cc +103 -0
  212. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +1275 -8
  213. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +40 -262
  214. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test_common.h +78 -0
  215. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_validation_test_runner.h +469 -0
  216. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +2 -6
  217. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +4 -0
  218. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +9 -1
  219. package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +18 -9
  220. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +2 -0
  221. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +2 -1
  222. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +72 -44
  223. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +92 -15
  224. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +6 -20
  225. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +143 -112
  226. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +23 -16
  227. package/index.js +3 -3
  228. package/package.json +1 -1
  229. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  230. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  231. package/util.h +38 -12
  232. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc +0 -17
@@ -22,12 +22,16 @@
22
22
  #include "rocksdb/options.h"
23
23
  #include "table/block_based/block_based_table_builder.h"
24
24
  #include "table/block_based/block_based_table_factory.h"
25
+ #include "table/block_based/block_based_table_iterator.h"
25
26
  #include "table/block_based/partitioned_index_iterator.h"
26
27
  #include "table/format.h"
27
28
  #include "test_util/testharness.h"
28
29
  #include "test_util/testutil.h"
29
30
  #include "util/random.h"
30
31
 
32
+ // Enable io_uring support for this test
33
+ extern "C" bool RocksDbIOUringEnable() { return true; }
34
+
31
35
  namespace ROCKSDB_NAMESPACE {
32
36
 
33
37
  class BlockBasedTableReaderBaseTest : public testing::Test {
@@ -49,7 +53,8 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
49
53
  // user defined timestamps and different sequence number to differentiate them
50
54
  static std::vector<std::pair<std::string, std::string>> GenerateKVMap(
51
55
  int num_block = 2, bool mixed_with_human_readable_string_value = false,
52
- size_t ts_sz = 0, bool same_key_diff_ts = false) {
56
+ size_t ts_sz = 0, bool same_key_diff_ts = false,
57
+ const Comparator* comparator = BytewiseComparator()) {
53
58
  std::vector<std::pair<std::string, std::string>> kv;
54
59
 
55
60
  SequenceNumber seq_no = 0;
@@ -97,6 +102,10 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
97
102
  }
98
103
  }
99
104
  }
105
+ auto comparator_name = std::string(comparator->Name());
106
+ if (comparator_name.find("Reverse") != std::string::npos) {
107
+ std::reverse(kv.begin(), kv.end());
108
+ }
100
109
  return kv;
101
110
  }
102
111
 
@@ -125,6 +134,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
125
134
 
126
135
  InternalKeyComparator comparator(ioptions.user_comparator);
127
136
  ColumnFamilyOptions cf_options;
137
+ cf_options.comparator = ioptions.user_comparator;
128
138
  cf_options.prefix_extractor = options_.prefix_extractor;
129
139
  MutableCFOptions moptions(cf_options);
130
140
  CompressionOptions compression_opts;
@@ -169,8 +179,9 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
169
179
  false /* _force_direct_prefetch */, -1 /* _level */,
170
180
  nullptr /* _block_cache_tracer */,
171
181
  0 /* _max_file_size_for_l0_meta_pin */, "" /* _cur_db_session_id */,
172
- 0 /* _cur_file_num */, {} /* _unique_id */, 0 /* _largest_seqno */,
173
- 0 /* _tail_size */, user_defined_timestamps_persisted);
182
+ table_num_++ /* _cur_file_num */, {} /* _unique_id */,
183
+ 0 /* _largest_seqno */, 0 /* _tail_size */,
184
+ user_defined_timestamps_persisted);
174
185
 
175
186
  std::unique_ptr<RandomAccessFileReader> file;
176
187
  NewFileReader(table_name, foptions, &file, ioptions.statistics.get());
@@ -202,6 +213,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
202
213
  Env* env_;
203
214
  std::shared_ptr<FileSystem> fs_;
204
215
  Options options_;
216
+ uint64_t table_num_{0};
205
217
 
206
218
  private:
207
219
  void WriteToFile(const std::string& content, const std::string& filename) {
@@ -250,11 +262,13 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
250
262
  // generate keys with different user provided key, same user-defined
251
263
  // timestamps (if udt enabled), same sequence number. This test mode is
252
264
  // used for testing `Get`, `MultiGet`, and `NewIterator`.
265
+ // Param 9: test both the default comparator and a reverse comparator.
253
266
  class BlockBasedTableReaderTest
254
267
  : public BlockBasedTableReaderBaseTest,
255
- public testing::WithParamInterface<std::tuple<
256
- CompressionType, bool, BlockBasedTableOptions::IndexType, bool,
257
- test::UserDefinedTimestampTestMode, uint32_t, uint32_t, bool>> {
268
+ public testing::WithParamInterface<
269
+ std::tuple<CompressionType, bool, BlockBasedTableOptions::IndexType,
270
+ bool, test::UserDefinedTimestampTestMode, uint32_t,
271
+ uint32_t, bool, const Comparator*>> {
258
272
  protected:
259
273
  void SetUp() override {
260
274
  compression_type_ = std::get<0>(GetParam());
@@ -265,6 +279,7 @@ class BlockBasedTableReaderTest
265
279
  compression_parallel_threads_ = std::get<5>(GetParam());
266
280
  compression_dict_bytes_ = std::get<6>(GetParam());
267
281
  same_key_diff_ts_ = std::get<7>(GetParam());
282
+ comparator_ = std::get<8>(GetParam());
268
283
  BlockBasedTableReaderBaseTest::SetUp();
269
284
  }
270
285
 
@@ -290,6 +305,7 @@ class BlockBasedTableReaderTest
290
305
  uint32_t compression_parallel_threads_;
291
306
  uint32_t compression_dict_bytes_;
292
307
  bool same_key_diff_ts_;
308
+ const Comparator* comparator_{};
293
309
  };
294
310
 
295
311
  class BlockBasedTableReaderGetTest : public BlockBasedTableReaderTest {};
@@ -993,17 +1009,192 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) {
993
1009
  ASSERT_EQ(s.code(), Status::kCorruption);
994
1010
  }
995
1011
 
1012
+ // TODO: test no block cache case
996
1013
  TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
1014
+ std::ostringstream param_trace;
1015
+ param_trace << "[MultiScanPrepare] Test params: " << "CompressionType="
1016
+ << CompressionTypeToString(compression_type_)
1017
+ << ", UseDirectReads=" << (use_direct_reads_ ? "true" : "false")
1018
+ << ", UDTEnabled=" << (udt_enabled_ ? "true" : "false")
1019
+ << ", PersistUDT=" << (persist_udt_ ? "true" : "false")
1020
+ << ", CompressionParallelThreads="
1021
+ << compression_parallel_threads_
1022
+ << ", CompressionDictBytes=" << compression_dict_bytes_
1023
+ << ", SameKeyDiffTs=" << (same_key_diff_ts_ ? "true" : "false");
1024
+ SCOPED_TRACE(param_trace.str());
1025
+
1026
+ for (bool fill_cache : {false, true}) {
1027
+ SCOPED_TRACE(std::string("fill_cache=") + std::to_string(fill_cache));
1028
+ for (bool use_async_io : {false,
1029
+ #ifdef ROCKSDB_IOURING_PRESENT
1030
+ true
1031
+ #endif
1032
+ }) {
1033
+ SCOPED_TRACE(std::string("use_async_io=") + std::to_string(use_async_io));
1034
+ Options options;
1035
+ options.statistics = CreateDBStatistics();
1036
+ options.comparator = comparator_;
1037
+ std::shared_ptr<FileSystem> fs = options.env->GetFileSystem();
1038
+ ReadOptions read_opts;
1039
+ read_opts.fill_cache = fill_cache;
1040
+ size_t ts_sz = options.comparator->timestamp_size();
1041
+ std::vector<std::pair<std::string, std::string>> kv =
1042
+ BlockBasedTableReaderBaseTest::GenerateKVMap(
1043
+ 100 /* num_block */,
1044
+ true /* mixed_with_human_readable_string_value */, ts_sz,
1045
+ same_key_diff_ts_, comparator_);
1046
+ std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
1047
+ CompressionTypeToString(compression_type_) +
1048
+ "_async" + std::to_string(use_async_io);
1049
+ ImmutableOptions ioptions(options);
1050
+ CreateTable(table_name, ioptions, compression_type_, kv,
1051
+ compression_parallel_threads_, compression_dict_bytes_);
1052
+
1053
+ std::unique_ptr<BlockBasedTable> table;
1054
+ FileOptions foptions;
1055
+ foptions.use_direct_reads = use_direct_reads_;
1056
+ InternalKeyComparator comparator(options.comparator);
1057
+ NewBlockBasedTableReader(
1058
+ foptions, ioptions, comparator, table_name, &table,
1059
+ true /* bool prefetch_index_and_filter_in_cache */,
1060
+ nullptr /* status */, persist_udt_);
1061
+
1062
+ // 1. Should coalesce into a single I/O
1063
+ std::unique_ptr<InternalIterator> iter;
1064
+ iter.reset(table->NewIterator(
1065
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1066
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1067
+
1068
+ MultiScanArgs scan_options(comparator_);
1069
+ scan_options.use_async_io = use_async_io;
1070
+ scan_options.insert(ExtractUserKey(kv[0].first),
1071
+ ExtractUserKey(kv[kEntriesPerBlock].first));
1072
+ scan_options.insert(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
1073
+ ExtractUserKey(kv[3 * kEntriesPerBlock].first));
1074
+ auto read_count_before =
1075
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1076
+
1077
+ iter->Prepare(&scan_options);
1078
+ iter->Seek(kv[0].first);
1079
+ for (size_t i = 0; i < kEntriesPerBlock + 1; ++i) {
1080
+ ASSERT_TRUE(iter->status().ok()) << iter->status().ToString();
1081
+ ASSERT_TRUE(iter->Valid()) << i;
1082
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1083
+ iter->Next();
1084
+ }
1085
+ // Iter may still be valid after scan range. Upper layer (DBIter) handles
1086
+ // exact upper bound checking. So we don't check !iter->Valid() here.
1087
+ ASSERT_OK(iter->status());
1088
+ iter->Seek(kv[2 * kEntriesPerBlock].first);
1089
+ for (size_t i = 2 * kEntriesPerBlock; i < 3 * kEntriesPerBlock; ++i) {
1090
+ ASSERT_TRUE(iter->Valid());
1091
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1092
+ iter->Next();
1093
+ }
1094
+ ASSERT_OK(iter->status());
1095
+ auto read_count_after =
1096
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1097
+ ASSERT_EQ(read_count_before + 1, read_count_after);
1098
+
1099
+ // 2. No IO coalesce, should do MultiRead/ReadAsync with 2 read requests.
1100
+ iter.reset(table->NewIterator(
1101
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1102
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1103
+ scan_options = MultiScanArgs(comparator_);
1104
+ scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
1105
+ ExtractUserKey(kv[75 * kEntriesPerBlock].first));
1106
+ scan_options.insert(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
1107
+ ExtractUserKey(kv[95 * kEntriesPerBlock].first));
1108
+
1109
+ read_count_before =
1110
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1111
+ iter->Prepare(&scan_options);
1112
+
1113
+ iter->Seek(kv[70 * kEntriesPerBlock].first);
1114
+ for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
1115
+ ASSERT_TRUE(iter->Valid());
1116
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1117
+ iter->Next();
1118
+ }
1119
+ ASSERT_OK(iter->status());
1120
+ iter->Seek(kv[90 * kEntriesPerBlock].first);
1121
+ for (size_t i = 90 * kEntriesPerBlock; i < 95 * kEntriesPerBlock; ++i) {
1122
+ ASSERT_TRUE(iter->Valid());
1123
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1124
+ iter->Next();
1125
+ }
1126
+ ASSERT_OK(iter->status());
1127
+
1128
+ read_count_after =
1129
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1130
+ ASSERT_EQ(read_count_before + 2, read_count_after);
1131
+
1132
+ iter.reset(table->NewIterator(
1133
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1134
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1135
+
1136
+ // 3. Tests I/O excludes blocks already in cache.
1137
+ // Reading blocks from 50-99
1138
+ // From reads above, blocks 70-75 and 90-95 already in cache
1139
+ // So we should read 50-70 76-89 96-99 in three I/Os.
1140
+ // If fill_cache is false, then we'll do one giant I/O.
1141
+ scan_options = MultiScanArgs(comparator_);
1142
+ scan_options.use_async_io = use_async_io;
1143
+ scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first));
1144
+ read_count_before =
1145
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1146
+ iter->Prepare(&scan_options);
1147
+ read_count_after =
1148
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1149
+ if (!use_async_io) {
1150
+ if (!fill_cache) {
1151
+ ASSERT_EQ(read_count_before + 1, read_count_after);
1152
+ } else {
1153
+ ASSERT_EQ(read_count_before + 3, read_count_after);
1154
+ }
1155
+ } else {
1156
+ // stat is recorded in async callback which happens in Poll(), and
1157
+ // Poll() happens during scanning.
1158
+ ASSERT_EQ(read_count_before, read_count_after);
1159
+ }
1160
+
1161
+ iter->Seek(kv[50 * kEntriesPerBlock].first);
1162
+ for (size_t i = 50 * kEntriesPerBlock; i < 100 * kEntriesPerBlock; ++i) {
1163
+ ASSERT_TRUE(iter->Valid());
1164
+ ASSERT_EQ(iter->key().ToString(), kv[i].first);
1165
+ iter->Next();
1166
+ }
1167
+ ASSERT_FALSE(iter->Valid());
1168
+ ASSERT_OK(iter->status());
1169
+ read_count_after =
1170
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1171
+ if (!fill_cache) {
1172
+ ASSERT_EQ(read_count_before + 1, read_count_after);
1173
+ } else {
1174
+ ASSERT_EQ(read_count_before + 3, read_count_after);
1175
+ }
1176
+ }
1177
+ }
1178
+ }
1179
+
1180
+ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
1181
+ if (compression_type_ != kNoCompression) {
1182
+ // This test relies on block sizes to be close to what's set in option.
1183
+ ROCKSDB_GTEST_BYPASS("This test assumes no compression.");
1184
+ return;
1185
+ }
997
1186
  Options options;
998
- options.statistics = CreateDBStatistics();
1187
+ options.comparator = comparator_;
999
1188
  ReadOptions read_opts;
1000
1189
  size_t ts_sz = options.comparator->timestamp_size();
1190
+
1191
+ // Generate data that spans multiple blocks
1001
1192
  std::vector<std::pair<std::string, std::string>> kv =
1002
1193
  BlockBasedTableReaderBaseTest::GenerateKVMap(
1003
- 100 /* num_block */,
1004
- true /* mixed_with_human_readable_string_value */, ts_sz);
1194
+ 20 /* num_block */, true /* mixed_with_human_readable_string_value */,
1195
+ ts_sz, same_key_diff_ts_, comparator_);
1005
1196
 
1006
- std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
1197
+ std::string table_name = "BlockBasedTableReaderTest_PrefetchSizeLimit" +
1007
1198
  CompressionTypeToString(compression_type_);
1008
1199
 
1009
1200
  ImmutableOptions ioptions(options);
@@ -1012,159 +1203,257 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
1012
1203
 
1013
1204
  std::unique_ptr<BlockBasedTable> table;
1014
1205
  FileOptions foptions;
1015
- foptions.use_direct_reads = true;
1206
+ foptions.use_direct_reads = use_direct_reads_;
1016
1207
  InternalKeyComparator comparator(options.comparator);
1017
1208
  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
1018
1209
  true /* bool prefetch_index_and_filter_in_cache */,
1019
1210
  nullptr /* status */, persist_udt_);
1020
1211
 
1021
- std::unique_ptr<InternalIterator> iter;
1022
- iter.reset(table->NewIterator(
1023
- read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1024
- /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1212
+ // Default block size is 4KB
1213
+ //
1214
+ // Tests when no block is loaded
1215
+ {
1216
+ std::unique_ptr<InternalIterator> iter;
1217
+ iter.reset(table->NewIterator(
1218
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1219
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1220
+
1221
+ MultiScanArgs scan_options(comparator_);
1222
+ scan_options.max_prefetch_size = 1024; // less than block size
1223
+ scan_options.insert(ExtractUserKey(kv[0].first),
1224
+ ExtractUserKey(kv[5].first));
1225
+
1226
+ iter->Prepare(&scan_options);
1227
+
1228
+ // Should be able to scan the first block, but not more
1229
+ iter->Seek(kv[0].first);
1230
+ ASSERT_FALSE(iter->Valid());
1231
+ ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
1232
+ }
1025
1233
 
1026
- // Should coalesce into a single I/O
1027
- std::vector<ScanOptions> scan_options(
1028
- {ScanOptions(ExtractUserKey(kv[0].first),
1029
- ExtractUserKey(kv[kEntriesPerBlock].first)),
1030
- ScanOptions(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
1031
- ExtractUserKey(kv[3 * kEntriesPerBlock].first))});
1234
+ // Some blocks are loaded
1235
+ {
1236
+ std::unique_ptr<InternalIterator> iter;
1237
+ iter.reset(table->NewIterator(
1238
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1239
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1240
+
1241
+ MultiScanArgs scan_options(comparator_);
1242
+ scan_options.max_prefetch_size = 9 * 1024; // 9KB - 2 blocks with buffer
1243
+ scan_options.insert(ExtractUserKey(kv[1 * kEntriesPerBlock].first),
1244
+ ExtractUserKey(kv[8 * kEntriesPerBlock].first));
1245
+
1246
+ iter->Prepare(&scan_options);
1247
+ iter->Seek(kv[1 * kEntriesPerBlock].first);
1248
+ size_t scanned_keys = 0;
1249
+
1250
+ // Should be able to scan up to 2 blocks worth of data
1251
+ while (iter->Valid()) {
1252
+ ASSERT_EQ(iter->key().ToString(),
1253
+ kv[scanned_keys + 1 * kEntriesPerBlock].first);
1254
+ iter->Next();
1255
+ scanned_keys++;
1256
+ }
1032
1257
 
1033
- auto read_count_before =
1034
- options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1035
- iter->Prepare(&scan_options);
1036
- auto read_count_after =
1037
- options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1038
- ASSERT_EQ(read_count_before + 1, read_count_after);
1039
- iter->Seek(kv[0].first);
1040
- for (size_t i = 0; i < kEntriesPerBlock + 1; ++i) {
1041
- ASSERT_TRUE(iter->Valid());
1042
- ASSERT_EQ(iter->key().ToString(), kv[i].first);
1043
- iter->Next();
1258
+ ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
1259
+ ASSERT_EQ(scanned_keys, 2 * kEntriesPerBlock);
1044
1260
  }
1045
- // Iter may still be valid after scan range. Upper layer (DBIter) handles
1046
- // exact upper bound checking. So we don't check !iter->Valid() here.
1047
- ASSERT_OK(iter->status());
1048
- iter->Seek(kv[2 * kEntriesPerBlock].first);
1049
- for (size_t i = 2 * kEntriesPerBlock; i < 3 * kEntriesPerBlock; ++i) {
1050
- ASSERT_TRUE(iter->Valid());
1051
- ASSERT_EQ(iter->key().ToString(), kv[i].first);
1052
- iter->Next();
1053
- }
1054
- ASSERT_OK(iter->status());
1055
1261
 
1056
- iter.reset(table->NewIterator(
1057
- read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1058
- /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1059
- // No IO coalesce, should do MultiRead with 2 read requests.
1060
- scan_options = {ScanOptions(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
1061
- ExtractUserKey(kv[75 * kEntriesPerBlock].first)),
1062
- ScanOptions(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
1063
- ExtractUserKey(kv[95 * kEntriesPerBlock].first))};
1064
- read_count_before =
1065
- options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1066
- iter->Prepare(&scan_options);
1067
- read_count_after =
1068
- options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1069
- ASSERT_EQ(read_count_before + 2, read_count_after);
1070
-
1071
- iter->Seek(kv[70 * kEntriesPerBlock].first);
1072
- for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
1073
- ASSERT_TRUE(iter->Valid());
1074
- ASSERT_EQ(iter->key().ToString(), kv[i].first);
1075
- iter->Next();
1262
+ // Tests with some block loaded in cache already:
1263
+ // Blocks 1 and 2 are already in cache by the above test.
1264
+ // Here we try blocks 0 - 5, with prefetch limit to 3 blocks, and expect to
1265
+ // read 3 blocks.
1266
+ {
1267
+ std::unique_ptr<InternalIterator> iter;
1268
+ iter.reset(table->NewIterator(
1269
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1270
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1271
+
1272
+ MultiScanArgs scan_options(comparator_);
1273
+ scan_options.max_prefetch_size = 3 * 4 * 1024 + 1024; // 3 blocks + 1KB
1274
+ scan_options.insert(ExtractUserKey(kv[0].first),
1275
+ ExtractUserKey(kv[5 * kEntriesPerBlock].first));
1276
+
1277
+ iter->Prepare(&scan_options);
1278
+ iter->Seek(kv[0].first);
1279
+ size_t scanned_keys = 0;
1280
+ // Should only read 3 blocks (blocks 0, 1, 2)
1281
+ // already cached.
1282
+ while (iter->Valid()) {
1283
+ ASSERT_EQ(iter->key().ToString(), kv[scanned_keys].first);
1284
+ iter->Next();
1285
+ scanned_keys++;
1286
+ }
1287
+ ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
1288
+ ASSERT_EQ(scanned_keys, 3 * kEntriesPerBlock);
1076
1289
  }
1077
- ASSERT_OK(iter->status());
1078
- iter->Seek(kv[90 * kEntriesPerBlock].first);
1079
- for (size_t i = 90 * kEntriesPerBlock; i < 95 * kEntriesPerBlock; ++i) {
1080
- ASSERT_TRUE(iter->Valid());
1081
- ASSERT_EQ(iter->key().ToString(), kv[i].first);
1082
- iter->Next();
1290
+
1291
+ // Multiple scan ranges with prefetch limit
1292
+ {
1293
+ std::unique_ptr<InternalIterator> iter;
1294
+ iter.reset(table->NewIterator(
1295
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1296
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1297
+
1298
+ MultiScanArgs scan_options(comparator_);
1299
+ scan_options.max_prefetch_size = 5 * 4 * 1024 + 1024; // 5 blocks + 1KB
1300
+ // Will read 5 entries from first scan range, and 4 blocks from the second
1301
+ // scan range
1302
+ scan_options.insert(ExtractUserKey(kv[0].first),
1303
+ ExtractUserKey(kv[5].first));
1304
+ scan_options.insert(ExtractUserKey(kv[12 * kEntriesPerBlock].first),
1305
+ ExtractUserKey(kv[17 * kEntriesPerBlock].first));
1306
+ scan_options.insert(ExtractUserKey(kv[18 * kEntriesPerBlock].first),
1307
+ ExtractUserKey(kv[19 * kEntriesPerBlock].first));
1308
+
1309
+ iter->Prepare(&scan_options);
1310
+
1311
+ iter->Seek(kv[0].first);
1312
+ size_t scanned_keys = 0;
1313
+ size_t key_idx = 0;
1314
+ while (iter->Valid()) {
1315
+ ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
1316
+ iter->Next();
1317
+ scanned_keys++;
1318
+ key_idx++;
1319
+ if (key_idx == 5) {
1320
+ iter->Seek(kv[12 * kEntriesPerBlock].first);
1321
+ key_idx = 12 * kEntriesPerBlock;
1322
+ }
1323
+ }
1324
+ ASSERT_EQ(scanned_keys, 5 + 4 * kEntriesPerBlock);
1325
+ ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
1083
1326
  }
1084
- ASSERT_OK(iter->status());
1085
1327
 
1086
- iter.reset(table->NewIterator(
1087
- read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1088
- /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1089
- // Should do two I/Os since blocks 80-81 and 90-95 are already in block cache,
1090
- // reads from blocks 50-79 and 82-.. are co
1091
- scan_options = {ScanOptions(ExtractUserKey(kv[50 * kEntriesPerBlock].first))};
1092
- read_count_before =
1093
- options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1094
- iter->Prepare(&scan_options);
1095
- read_count_after =
1096
- options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
1097
- ASSERT_EQ(read_count_before + 3, read_count_after);
1098
- iter->Seek(kv[50 * kEntriesPerBlock].first);
1099
- for (size_t i = 50 * kEntriesPerBlock; i < 100 * kEntriesPerBlock; ++i) {
1100
- ASSERT_TRUE(iter->Valid());
1101
- ASSERT_EQ(iter->key().ToString(), kv[i].first);
1102
- iter->Next();
1328
+ // Prefetch limit is big enough for all scan ranges.
1329
+ {
1330
+ std::unique_ptr<InternalIterator> iter;
1331
+ iter.reset(table->NewIterator(
1332
+ read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1333
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1334
+
1335
+ MultiScanArgs scan_options(comparator_);
1336
+ scan_options.max_prefetch_size = 10 * 1024 * 1024; // 10MB
1337
+ scan_options.insert(ExtractUserKey(kv[0].first),
1338
+ ExtractUserKey(kv[5].first));
1339
+ scan_options.insert(ExtractUserKey(kv[8 * kEntriesPerBlock].first),
1340
+ ExtractUserKey(kv[12 * kEntriesPerBlock].first));
1341
+ scan_options.insert(ExtractUserKey(kv[18 * kEntriesPerBlock].first),
1342
+ ExtractUserKey(kv[19 * kEntriesPerBlock].first));
1343
+
1344
+ iter->Prepare(&scan_options);
1345
+
1346
+ iter->Seek(kv[0].first);
1347
+ size_t scanned_keys = 0;
1348
+ size_t key_idx = 0;
1349
+ // Scan first range
1350
+ while (iter->Valid() && key_idx < 5) {
1351
+ ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
1352
+ iter->Next();
1353
+ scanned_keys++;
1354
+ key_idx++;
1355
+ }
1356
+ // Move to second range
1357
+ iter->Seek(kv[8 * kEntriesPerBlock].first);
1358
+ key_idx = 8 * kEntriesPerBlock;
1359
+ while (iter->Valid() && key_idx < 12 * kEntriesPerBlock) {
1360
+ ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
1361
+ iter->Next();
1362
+ scanned_keys++;
1363
+ key_idx++;
1364
+ }
1365
+ // Move to third range
1366
+ iter->Seek(kv[18 * kEntriesPerBlock].first);
1367
+ key_idx = 18 * kEntriesPerBlock;
1368
+ while (iter->Valid() && key_idx < 19 * kEntriesPerBlock) {
1369
+ ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
1370
+ iter->Next();
1371
+ scanned_keys++;
1372
+ key_idx++;
1373
+ }
1374
+ // Should not hit prefetch limit
1375
+ ASSERT_OK(iter->status());
1376
+ ASSERT_EQ(scanned_keys, 5 + 4 * kEntriesPerBlock + 1 * kEntriesPerBlock);
1103
1377
  }
1104
- ASSERT_FALSE(iter->Valid());
1105
- ASSERT_OK(iter->status());
1378
+ }
1379
+
1380
+ TEST_P(BlockBasedTableReaderTest, MultiScanUnpinPreviousBlocks) {
1381
+ std::vector<std::pair<std::string, std::string>> kv =
1382
+ BlockBasedTableReaderBaseTest::GenerateKVMap(
1383
+ 30 /* num_block */,
1384
+ true /* mixed_with_human_readable_string_value */);
1385
+ std::string table_name = "BlockBasedTableReaderTest_UnpinPreviousBlocks" +
1386
+ CompressionTypeToString(compression_type_);
1387
+ ImmutableOptions ioptions(options_);
1388
+ CreateTable(table_name, ioptions, compression_type_, kv,
1389
+ compression_parallel_threads_, compression_dict_bytes_);
1390
+
1391
+ std::unique_ptr<BlockBasedTable> table;
1392
+ FileOptions foptions;
1393
+ foptions.use_direct_reads = use_direct_reads_;
1394
+ InternalKeyComparator comparator(options_.comparator);
1395
+ NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
1396
+ true /* bool prefetch_index_and_filter_in_cache */,
1397
+ nullptr /* status */, persist_udt_);
1106
1398
 
1107
- // Check cases when Seek key does not match start key in ScanOptions
1399
+ ReadOptions read_opts;
1400
+ std::unique_ptr<InternalIterator> iter;
1108
1401
  iter.reset(table->NewIterator(
1109
1402
  read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1110
1403
  /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1111
- scan_options = {ScanOptions(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
1112
- ExtractUserKey(kv[20 * kEntriesPerBlock].first)),
1113
- ScanOptions(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
1114
- ExtractUserKey(kv[40 * kEntriesPerBlock].first))};
1404
+
1405
+ MultiScanArgs scan_options(BytewiseComparator());
1406
+ // Range 1: block 0-4, Range 2: block 4-4, Range 3: block 5-15
1407
+ scan_options.insert(ExtractUserKey(kv[0 * kEntriesPerBlock].first),
1408
+ ExtractUserKey(kv[5 * kEntriesPerBlock - 5].first));
1409
+ scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock - 4].first),
1410
+ ExtractUserKey(kv[5 * kEntriesPerBlock - 3].first));
1411
+ scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock - 2].first),
1412
+ ExtractUserKey(kv[15 * kEntriesPerBlock - 1].first));
1413
+
1115
1414
  iter->Prepare(&scan_options);
1116
- // Match start key
1117
- iter->Seek(kv[10 * kEntriesPerBlock].first);
1118
- for (size_t i = 10 * kEntriesPerBlock; i < 20 * kEntriesPerBlock; ++i) {
1119
- ASSERT_TRUE(iter->Valid());
1120
- ASSERT_EQ(iter->key().ToString(), kv[i].first);
1121
- iter->Next();
1415
+ auto* bbiter = dynamic_cast<BlockBasedTableIterator*>(iter.get());
1416
+ ASSERT_TRUE(bbiter);
1417
+ for (int block = 0; block < 15; ++block) {
1418
+ ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
1122
1419
  }
1420
+
1421
+ // MultiScan require seeks to be called in scan_option order
1422
+ iter->Seek(kv[0 * kEntriesPerBlock].first);
1423
+ ASSERT_TRUE(iter->Valid());
1123
1424
  ASSERT_OK(iter->status());
1124
- // Does not match start key of the second ScanOptions.
1125
- iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
1126
- for (size_t i = 50 * kEntriesPerBlock + 1; i < 100 * kEntriesPerBlock; ++i) {
1127
- ASSERT_TRUE(iter->Valid());
1128
- ASSERT_EQ(iter->key().ToString(), kv[i].first);
1129
- iter->Next();
1130
- }
1131
- ASSERT_FALSE(iter->Valid());
1425
+
1426
+ // Seek to second range - should unpin blocks from first range
1427
+ iter->Seek(kv[5 * kEntriesPerBlock - 4].first);
1428
+ ASSERT_TRUE(iter->Valid());
1132
1429
  ASSERT_OK(iter->status());
1430
+ ASSERT_EQ(iter->key(), kv[5 * kEntriesPerBlock - 4].first);
1431
+ ASSERT_EQ(iter->value(), kv[5 * kEntriesPerBlock - 4].second);
1133
1432
 
1134
- iter.reset(table->NewIterator(
1135
- read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
1136
- /*skip_filters=*/false, TableReaderCaller::kUncategorized));
1137
- scan_options = {ScanOptions(ExtractUserKey(kv[10 * kEntriesPerBlock].first)),
1138
- ScanOptions(ExtractUserKey(kv[11 * kEntriesPerBlock].first))};
1139
- iter->Prepare(&scan_options);
1140
- // Does not match the first ScanOptions.
1141
- iter->SeekToFirst();
1142
- for (size_t i = 0; i < kEntriesPerBlock; ++i) {
1143
- ASSERT_TRUE(iter->Valid());
1144
- ASSERT_EQ(iter->key().ToString(), kv[i].first);
1145
- iter->Next();
1433
+ // The last block (block 4) is shared with the second range, so
1434
+ // it's not unpinned yet.
1435
+ for (int block = 0; block < 4; ++block) {
1436
+ ASSERT_FALSE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
1146
1437
  }
1147
- ASSERT_OK(iter->status());
1148
- iter->Seek(kv[10 * kEntriesPerBlock].first);
1149
- for (size_t i = 10 * kEntriesPerBlock; i < 12 * kEntriesPerBlock; ++i) {
1150
- ASSERT_TRUE(iter->Valid());
1151
- ASSERT_EQ(iter->key().ToString(), kv[i].first);
1152
- iter->Next();
1438
+ // Blocks from second range still in cache.
1439
+ // We skip block 4 here since it's ownership is moved to the actual data
1440
+ // block iter.
1441
+ for (int block = 5; block < 15; ++block) {
1442
+ ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
1153
1443
  }
1444
+
1445
+ iter->Seek(kv[5 * kEntriesPerBlock - 2].first);
1446
+ ASSERT_TRUE(iter->Valid());
1154
1447
  ASSERT_OK(iter->status());
1448
+ ASSERT_EQ(iter->key(), kv[5 * kEntriesPerBlock - 2].first);
1449
+ ASSERT_EQ(iter->value(), kv[5 * kEntriesPerBlock - 2].second);
1450
+
1451
+ // Still pinned
1452
+ for (int block = 5; block < 15; ++block) {
1453
+ ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
1454
+ }
1155
1455
  }
1156
1456
 
1157
- // Param 1: compression type
1158
- // Param 2: whether to use direct reads
1159
- // Param 3: Block Based Table Index type, partitioned filters are also enabled
1160
- // when index type is kTwoLevelIndexSearch
1161
- // Param 4: BBTO no_block_cache option
1162
- // Param 5: test mode for the user-defined timestamp feature
1163
- // Param 6: number of parallel compression threads
1164
- // Param 7: CompressionOptions.max_dict_bytes and
1165
- // CompressionOptions.max_dict_buffer_bytes. This enable/disables
1166
- // compression dictionary.
1167
- // Param 8: test mode to specify the pattern for generating key / value pairs.
1168
1457
  INSTANTIATE_TEST_CASE_P(
1169
1458
  BlockBasedTableReaderTest, BlockBasedTableReaderTest,
1170
1459
  ::testing::Combine(
@@ -1176,7 +1465,8 @@ INSTANTIATE_TEST_CASE_P(
1176
1465
  BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey),
1177
1466
  ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
1178
1467
  ::testing::Values(1, 2), ::testing::Values(0, 4096),
1179
- ::testing::Values(false)));
1468
+ ::testing::Values(false),
1469
+ ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
1180
1470
  INSTANTIATE_TEST_CASE_P(
1181
1471
  BlockBasedTableReaderGetTest, BlockBasedTableReaderGetTest,
1182
1472
  ::testing::Combine(
@@ -1188,7 +1478,8 @@ INSTANTIATE_TEST_CASE_P(
1188
1478
  BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey),
1189
1479
  ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
1190
1480
  ::testing::Values(1, 2), ::testing::Values(0, 4096),
1191
- ::testing::Values(false, true)));
1481
+ ::testing::Values(false, true),
1482
+ ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
1192
1483
  INSTANTIATE_TEST_CASE_P(
1193
1484
  StrictCapacityLimitReaderTest, StrictCapacityLimitReaderTest,
1194
1485
  ::testing::Combine(
@@ -1197,7 +1488,8 @@ INSTANTIATE_TEST_CASE_P(
1197
1488
  BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch),
1198
1489
  ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
1199
1490
  ::testing::Values(1, 2), ::testing::Values(0),
1200
- ::testing::Values(false, true)));
1491
+ ::testing::Values(false, true),
1492
+ ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
1201
1493
  INSTANTIATE_TEST_CASE_P(
1202
1494
  VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum,
1203
1495
  ::testing::Combine(
@@ -1206,8 +1498,8 @@ INSTANTIATE_TEST_CASE_P(
1206
1498
  ::testing::Values(
1207
1499
  BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch),
1208
1500
  ::testing::Values(true), ::testing::ValuesIn(test::GetUDTTestModes()),
1209
- ::testing::Values(1, 2), ::testing::Values(0),
1210
- ::testing::Values(false)));
1501
+ ::testing::Values(1, 2), ::testing::Values(0), ::testing::Values(false),
1502
+ ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
1211
1503
 
1212
1504
  } // namespace ROCKSDB_NAMESPACE
1213
1505