@nxtedition/rocksdb 8.2.7 → 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359) hide show
  1. package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -1
  2. package/deps/rocksdb/rocksdb/Makefile +22 -19
  3. package/deps/rocksdb/rocksdb/TARGETS +8 -0
  4. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +157 -61
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +43 -92
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +632 -455
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +244 -149
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +41 -13
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +11 -1
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +216 -17
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +7 -5
  12. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +279 -199
  13. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +2 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +159 -8
  15. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +28 -2
  16. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +1 -1
  17. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +8 -0
  18. package/deps/rocksdb/rocksdb/crash_test.mk +14 -0
  19. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +1 -1
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -1
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +2 -2
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +1 -1
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +18 -21
  25. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +1 -2
  26. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +1 -1
  27. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +2 -3
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +1 -1
  29. package/deps/rocksdb/rocksdb/db/builder.cc +32 -7
  30. package/deps/rocksdb/rocksdb/db/c.cc +169 -6
  31. package/deps/rocksdb/rocksdb/db/c_test.c +104 -6
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +98 -47
  33. package/deps/rocksdb/rocksdb/db/column_family.h +25 -2
  34. package/deps/rocksdb/rocksdb/db/column_family_test.cc +213 -2
  35. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -1
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +93 -23
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +33 -9
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -6
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -6
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +2 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +107 -43
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -4
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -0
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +4 -2
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +25 -17
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -4
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -11
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +29 -4
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +24 -31
  50. package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +3 -1
  51. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +19 -19
  52. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +2 -1
  53. package/deps/rocksdb/rocksdb/db/convenience.cc +20 -3
  54. package/deps/rocksdb/rocksdb/db/convenience_impl.h +15 -0
  55. package/deps/rocksdb/rocksdb/db/corruption_test.cc +17 -0
  56. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +1 -0
  57. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +17 -3
  58. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +5 -0
  59. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +15 -15
  60. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +666 -44
  61. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +2 -29
  62. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +274 -1
  63. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +40 -19
  64. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +6 -5
  65. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +250 -116
  66. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +51 -23
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +354 -96
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +6 -3
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +2 -1
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +5 -0
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +50 -21
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +26 -13
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +13 -5
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +61 -21
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +8 -87
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +7 -1
  77. package/deps/rocksdb/rocksdb/db/db_iter.cc +2 -2
  78. package/deps/rocksdb/rocksdb/db/db_iter.h +1 -0
  79. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +4 -11
  80. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +6 -6
  81. package/deps/rocksdb/rocksdb/db/db_options_test.cc +39 -29
  82. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +26 -36
  83. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +106 -0
  84. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +12 -3
  85. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +1 -1
  86. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +1 -0
  87. package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +279 -166
  88. package/deps/rocksdb/rocksdb/db/db_test.cc +48 -21
  89. package/deps/rocksdb/rocksdb/db/db_test2.cc +81 -12
  90. package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
  91. package/deps/rocksdb/rocksdb/db/db_test_util.h +40 -0
  92. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +13 -1
  93. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +233 -0
  94. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +143 -0
  95. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +6 -6
  96. package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -2
  97. package/deps/rocksdb/rocksdb/db/dbformat.cc +36 -0
  98. package/deps/rocksdb/rocksdb/db/dbformat.h +169 -20
  99. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +129 -0
  100. package/deps/rocksdb/rocksdb/db/error_handler.cc +16 -0
  101. package/deps/rocksdb/rocksdb/db/error_handler.h +6 -3
  102. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +4 -4
  103. package/deps/rocksdb/rocksdb/db/event_helpers.cc +4 -0
  104. package/deps/rocksdb/rocksdb/db/experimental.cc +2 -1
  105. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +4 -4
  106. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +17 -8
  107. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +86 -4
  108. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +1 -1
  109. package/deps/rocksdb/rocksdb/db/file_indexer.cc +2 -4
  110. package/deps/rocksdb/rocksdb/db/flush_job.cc +101 -11
  111. package/deps/rocksdb/rocksdb/db/flush_job.h +24 -1
  112. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +88 -11
  113. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +2 -3
  114. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +159 -91
  115. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +19 -10
  116. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +143 -0
  117. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -1
  118. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  119. package/deps/rocksdb/rocksdb/db/listener_test.cc +2 -1
  120. package/deps/rocksdb/rocksdb/db/log_reader.h +3 -2
  121. package/deps/rocksdb/rocksdb/db/log_test.cc +17 -21
  122. package/deps/rocksdb/rocksdb/db/log_writer.cc +1 -1
  123. package/deps/rocksdb/rocksdb/db/log_writer.h +3 -2
  124. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +4 -3
  125. package/deps/rocksdb/rocksdb/db/memtable.cc +52 -13
  126. package/deps/rocksdb/rocksdb/db/memtable.h +45 -1
  127. package/deps/rocksdb/rocksdb/db/memtable_list.cc +44 -10
  128. package/deps/rocksdb/rocksdb/db/memtable_list.h +32 -1
  129. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +90 -4
  130. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +2 -2
  131. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +1 -0
  132. package/deps/rocksdb/rocksdb/db/repair.cc +21 -4
  133. package/deps/rocksdb/rocksdb/db/repair_test.cc +143 -2
  134. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -4
  135. package/deps/rocksdb/rocksdb/db/table_cache.cc +44 -35
  136. package/deps/rocksdb/rocksdb/db/table_cache.h +6 -6
  137. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
  138. package/deps/rocksdb/rocksdb/db/version_builder.cc +0 -1
  139. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +236 -204
  140. package/deps/rocksdb/rocksdb/db/version_edit.cc +66 -4
  141. package/deps/rocksdb/rocksdb/db/version_edit.h +48 -6
  142. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +80 -8
  143. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +12 -0
  144. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +86 -17
  145. package/deps/rocksdb/rocksdb/db/version_set.cc +136 -41
  146. package/deps/rocksdb/rocksdb/db/version_set.h +28 -7
  147. package/deps/rocksdb/rocksdb/db/version_set_test.cc +25 -15
  148. package/deps/rocksdb/rocksdb/db/write_batch.cc +11 -0
  149. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +3 -0
  150. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +16 -0
  151. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -3
  152. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +2 -0
  153. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +42 -0
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +32 -3
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +7 -0
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +247 -120
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +9 -4
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +13 -6
  159. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +2 -0
  160. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +15 -27
  161. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +264 -69
  162. package/deps/rocksdb/rocksdb/env/env.cc +1 -2
  163. package/deps/rocksdb/rocksdb/env/env_encryption.cc +11 -165
  164. package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +0 -17
  165. package/deps/rocksdb/rocksdb/env/env_posix.cc +6 -2
  166. package/deps/rocksdb/rocksdb/env/env_test.cc +86 -2
  167. package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
  168. package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +78 -0
  169. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +34 -0
  170. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -0
  171. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +15 -4
  172. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +52 -43
  173. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +34 -18
  174. package/deps/rocksdb/rocksdb/file/file_util.cc +10 -5
  175. package/deps/rocksdb/rocksdb/file/file_util.h +13 -1
  176. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +724 -79
  177. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +64 -33
  178. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +3 -16
  179. package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +23 -12
  180. package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +3 -0
  181. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +2 -1
  182. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +153 -88
  183. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +70 -2
  184. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +50 -11
  185. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -0
  186. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +16 -2
  187. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +1 -1
  188. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +55 -8
  189. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +32 -4
  190. package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +9 -109
  191. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +90 -13
  192. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +3 -0
  193. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +85 -17
  194. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +13 -1
  195. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +2 -1
  196. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +5 -1
  197. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +21 -2
  198. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +7 -1
  199. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +6 -0
  200. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +5 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +33 -2
  202. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +14 -0
  203. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +33 -2
  204. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +0 -3
  205. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  206. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +3 -0
  207. package/deps/rocksdb/rocksdb/memory/arena_test.cc +18 -11
  208. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +2 -1
  209. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +69 -34
  210. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +16 -1
  211. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +10 -0
  212. package/deps/rocksdb/rocksdb/options/cf_options.cc +19 -0
  213. package/deps/rocksdb/rocksdb/options/cf_options.h +10 -2
  214. package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -1
  215. package/deps/rocksdb/rocksdb/options/db_options.cc +7 -0
  216. package/deps/rocksdb/rocksdb/options/db_options.h +1 -0
  217. package/deps/rocksdb/rocksdb/options/options.cc +15 -1
  218. package/deps/rocksdb/rocksdb/options/options_helper.cc +6 -0
  219. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -3
  220. package/deps/rocksdb/rocksdb/options/options_test.cc +8 -0
  221. package/deps/rocksdb/rocksdb/port/mmap.h +20 -0
  222. package/deps/rocksdb/rocksdb/port/stack_trace.cc +27 -12
  223. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -1
  224. package/deps/rocksdb/rocksdb/src.mk +3 -0
  225. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  226. package/deps/rocksdb/rocksdb/table/block_based/block.cc +48 -22
  227. package/deps/rocksdb/rocksdb/table/block_based/block.h +60 -12
  228. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +115 -42
  229. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +6 -5
  230. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +60 -2
  231. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +2 -0
  232. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +62 -44
  233. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +36 -14
  234. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -15
  235. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +219 -51
  236. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +41 -8
  237. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -1
  238. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +50 -21
  239. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +11 -4
  240. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +195 -55
  241. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
  242. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +31 -16
  243. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +97 -58
  244. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
  245. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +6 -0
  246. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +27 -12
  247. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +3 -1
  248. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +114 -70
  249. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +1 -2
  250. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +9 -6
  251. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +15 -3
  252. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +6 -3
  253. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +11 -11
  254. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -0
  255. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +1 -0
  256. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +6 -2
  257. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +1 -2
  258. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +2 -3
  259. package/deps/rocksdb/rocksdb/table/format.cc +175 -33
  260. package/deps/rocksdb/rocksdb/table/format.h +63 -10
  261. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +10 -2
  262. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +12 -4
  263. package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -0
  264. package/deps/rocksdb/rocksdb/table/mock_table.cc +8 -3
  265. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +10 -5
  266. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +10 -1
  267. package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +1 -2
  268. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +3 -3
  269. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +12 -3
  270. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +26 -1
  271. package/deps/rocksdb/rocksdb/table/table_builder.h +6 -2
  272. package/deps/rocksdb/rocksdb/table/table_properties.cc +6 -0
  273. package/deps/rocksdb/rocksdb/table/table_test.cc +52 -22
  274. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +19 -7
  275. package/deps/rocksdb/rocksdb/test_util/sync_point.h +3 -1
  276. package/deps/rocksdb/rocksdb/test_util/testutil.cc +29 -0
  277. package/deps/rocksdb/rocksdb/test_util/testutil.h +19 -0
  278. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +65 -26
  279. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +8 -5
  280. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -0
  281. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +1 -0
  282. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +0 -1
  283. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +4 -0
  284. package/deps/rocksdb/rocksdb/unreleased_history/README.txt +73 -0
  285. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +27 -0
  286. package/deps/rocksdb/rocksdb/unreleased_history/behavior_changes/.gitkeep +0 -0
  287. package/deps/rocksdb/rocksdb/unreleased_history/bug_fixes/.gitkeep +0 -0
  288. package/deps/rocksdb/rocksdb/unreleased_history/new_features/.gitkeep +0 -0
  289. package/deps/rocksdb/rocksdb/unreleased_history/performance_improvements/.gitkeep +0 -0
  290. package/deps/rocksdb/rocksdb/unreleased_history/public_api_changes/.gitkeep +0 -0
  291. package/deps/rocksdb/rocksdb/unreleased_history/release.sh +104 -0
  292. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +5 -0
  293. package/deps/rocksdb/rocksdb/util/bloom_impl.h +3 -3
  294. package/deps/rocksdb/rocksdb/util/cast_util.h +14 -0
  295. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -0
  296. package/deps/rocksdb/rocksdb/util/comparator.cc +29 -7
  297. package/deps/rocksdb/rocksdb/util/compression.cc +4 -4
  298. package/deps/rocksdb/rocksdb/util/compression.h +110 -32
  299. package/deps/rocksdb/rocksdb/util/core_local.h +2 -1
  300. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +4 -4
  301. package/deps/rocksdb/rocksdb/util/filelock_test.cc +3 -0
  302. package/deps/rocksdb/rocksdb/util/hash.h +7 -3
  303. package/deps/rocksdb/rocksdb/util/hash_test.cc +44 -0
  304. package/deps/rocksdb/rocksdb/util/math.h +58 -6
  305. package/deps/rocksdb/rocksdb/util/math128.h +29 -7
  306. package/deps/rocksdb/rocksdb/util/mutexlock.h +35 -27
  307. package/deps/rocksdb/rocksdb/util/single_thread_executor.h +1 -0
  308. package/deps/rocksdb/rocksdb/util/stop_watch.h +1 -1
  309. package/deps/rocksdb/rocksdb/util/thread_operation.h +8 -1
  310. package/deps/rocksdb/rocksdb/util/udt_util.cc +343 -0
  311. package/deps/rocksdb/rocksdb/util/udt_util.h +173 -1
  312. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +447 -0
  313. package/deps/rocksdb/rocksdb/util/write_batch_util.cc +25 -0
  314. package/deps/rocksdb/rocksdb/util/write_batch_util.h +80 -0
  315. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -4
  316. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +69 -25
  317. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +7 -6
  318. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +1 -1
  319. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +2 -3
  320. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +6 -11
  321. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -2
  322. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +4 -5
  323. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +1 -1
  324. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +2 -2
  325. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +2 -1
  326. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +3 -3
  327. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +1 -2
  328. package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +2 -3
  329. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +2 -2
  330. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
  331. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +23 -8
  332. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +9 -6
  333. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +37 -12
  334. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +231 -33
  335. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +0 -1
  336. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +76 -20
  337. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +18 -9
  338. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +40 -23
  339. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +13 -12
  340. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +7 -0
  341. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -1
  342. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +41 -11
  343. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +6 -3
  344. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +71 -24
  345. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +19 -4
  346. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +60 -107
  347. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +39 -11
  348. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +6 -3
  349. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +14 -8
  350. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.h +1 -1
  351. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +10 -5
  352. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  353. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +1 -1
  354. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +2 -1
  355. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +6 -6
  356. package/deps/rocksdb/rocksdb.gyp +2 -0
  357. package/package.json +1 -1
  358. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  359. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -25,10 +25,12 @@ class MockFS;
25
25
  class MockRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
26
26
  public:
27
27
  MockRandomAccessFile(std::unique_ptr<FSRandomAccessFile>& file,
28
- bool support_prefetch, std::atomic_int& prefetch_count)
28
+ bool support_prefetch, std::atomic_int& prefetch_count,
29
+ bool small_buffer_alignment = false)
29
30
  : FSRandomAccessFileOwnerWrapper(std::move(file)),
30
31
  support_prefetch_(support_prefetch),
31
- prefetch_count_(prefetch_count) {}
32
+ prefetch_count_(prefetch_count),
33
+ small_buffer_alignment_(small_buffer_alignment) {}
32
34
 
33
35
  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
34
36
  IODebugContext* dbg) override {
@@ -40,16 +42,25 @@ class MockRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
40
42
  }
41
43
  }
42
44
 
45
+ size_t GetRequiredBufferAlignment() const override {
46
+ return small_buffer_alignment_
47
+ ? 1
48
+ : FSRandomAccessFileOwnerWrapper::GetRequiredBufferAlignment();
49
+ }
50
+
43
51
  private:
44
52
  const bool support_prefetch_;
45
53
  std::atomic_int& prefetch_count_;
54
+ const bool small_buffer_alignment_;
46
55
  };
47
56
 
48
57
  class MockFS : public FileSystemWrapper {
49
58
  public:
50
59
  explicit MockFS(const std::shared_ptr<FileSystem>& wrapped,
51
- bool support_prefetch)
52
- : FileSystemWrapper(wrapped), support_prefetch_(support_prefetch) {}
60
+ bool support_prefetch, bool small_buffer_alignment = false)
61
+ : FileSystemWrapper(wrapped),
62
+ support_prefetch_(support_prefetch),
63
+ small_buffer_alignment_(small_buffer_alignment) {}
53
64
 
54
65
  static const char* kClassName() { return "MockFS"; }
55
66
  const char* Name() const override { return kClassName(); }
@@ -61,8 +72,8 @@ class MockFS : public FileSystemWrapper {
61
72
  std::unique_ptr<FSRandomAccessFile> file;
62
73
  IOStatus s;
63
74
  s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
64
- result->reset(
65
- new MockRandomAccessFile(file, support_prefetch_, prefetch_count_));
75
+ result->reset(new MockRandomAccessFile(
76
+ file, support_prefetch_, prefetch_count_, small_buffer_alignment_));
66
77
  return s;
67
78
  }
68
79
 
@@ -76,6 +87,7 @@ class MockFS : public FileSystemWrapper {
76
87
 
77
88
  private:
78
89
  const bool support_prefetch_;
90
+ const bool small_buffer_alignment_;
79
91
  std::atomic_int prefetch_count_{0};
80
92
  };
81
93
 
@@ -85,7 +97,8 @@ class PrefetchTest
85
97
  public:
86
98
  PrefetchTest() : DBTestBase("prefetch_test", true) {}
87
99
 
88
- void SetGenericOptions(Env* env, bool use_direct_io, Options& options) {
100
+ virtual void SetGenericOptions(Env* env, bool use_direct_io,
101
+ Options& options) {
89
102
  options = CurrentOptions();
90
103
  options.write_buffer_size = 1024;
91
104
  options.create_if_missing = true;
@@ -115,7 +128,14 @@ std::string BuildKey(int num, std::string postfix = "") {
115
128
  return "my_key_" + std::to_string(num) + postfix;
116
129
  }
117
130
 
118
- // This test verifies the basic functionality of prefetching.
131
+ // This test verifies the following basic functionalities of prefetching:
132
+ // (1) If the underlying file system supports prefetch, and directIO is not enabled,
133
+ // make sure prefetch() is called and FilePrefetchBuffer is not used.
134
+ // (2) If the underlying file system doesn't support prefetch, or directIO is
135
+ // enabled, make sure prefetch() is not called and FilePrefetchBuffer is
136
+ // used.
137
+ // (3) Measure read bytes, hit and miss of SST's tail prefetching during table
138
+ // open.
119
139
  TEST_P(PrefetchTest, Basic) {
120
140
  // First param is if the mockFS support_prefetch or not
121
141
  bool support_prefetch =
@@ -152,6 +172,7 @@ TEST_P(PrefetchTest, Basic) {
152
172
  ASSERT_OK(batch.Put(BuildKey(i), "value for range 1 key"));
153
173
  }
154
174
  ASSERT_OK(db_->Write(WriteOptions(), &batch));
175
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
155
176
 
156
177
  // create second key range
157
178
  batch.Clear();
@@ -159,6 +180,7 @@ TEST_P(PrefetchTest, Basic) {
159
180
  ASSERT_OK(batch.Put(BuildKey(i, "key2"), "value for range 2 key"));
160
181
  }
161
182
  ASSERT_OK(db_->Write(WriteOptions(), &batch));
183
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
162
184
 
163
185
  // delete second key range
164
186
  batch.Clear();
@@ -166,6 +188,20 @@ TEST_P(PrefetchTest, Basic) {
166
188
  ASSERT_OK(batch.Delete(BuildKey(i, "key2")));
167
189
  }
168
190
  ASSERT_OK(db_->Write(WriteOptions(), &batch));
191
+ ASSERT_OK(db_->Flush(FlushOptions()));
192
+
193
+ // To verify SST file tail prefetch (once per file) during flush output
194
+ // verification
195
+ if (support_prefetch && !use_direct_io) {
196
+ ASSERT_TRUE(fs->IsPrefetchCalled());
197
+ ASSERT_EQ(3, fs->GetPrefetchCount());
198
+ ASSERT_EQ(0, buff_prefetch_count);
199
+ fs->ClearPrefetchCount();
200
+ } else {
201
+ ASSERT_FALSE(fs->IsPrefetchCalled());
202
+ ASSERT_EQ(buff_prefetch_count, 3);
203
+ buff_prefetch_count = 0;
204
+ }
169
205
 
170
206
  // compact database
171
207
  std::string start_key = BuildKey(0);
@@ -192,25 +228,27 @@ TEST_P(PrefetchTest, Basic) {
192
228
  const uint64_t cur_table_open_prefetch_tail_hit =
193
229
  options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT);
194
230
 
231
+ // To verify prefetch during compaction input read
195
232
  if (support_prefetch && !use_direct_io) {
196
- // If underline file system supports prefetch, and directIO is not enabled
197
- // make sure prefetch() is called and FilePrefetchBuffer is not used.
198
233
  ASSERT_TRUE(fs->IsPrefetchCalled());
199
- fs->ClearPrefetchCount();
234
+ // To rule out false positive by the SST file tail prefetch during
235
+ // compaction output verification
236
+ ASSERT_GT(fs->GetPrefetchCount(), 1);
200
237
  ASSERT_EQ(0, buff_prefetch_count);
238
+ fs->ClearPrefetchCount();
201
239
  } else {
202
- // If underline file system doesn't support prefetch, or directIO is
203
- // enabled, make sure prefetch() is not called and FilePrefetchBuffer is
204
- // used.
205
240
  ASSERT_FALSE(fs->IsPrefetchCalled());
206
- ASSERT_GT(buff_prefetch_count, 0);
241
+ // To rule out false positive by the SST file tail prefetch during
242
+ // compaction output verification
243
+ ASSERT_GT(buff_prefetch_count, 1);
244
+ buff_prefetch_count = 0;
245
+
207
246
  ASSERT_GT(cur_table_open_prefetch_tail_read.count,
208
247
  prev_table_open_prefetch_tail_read.count);
209
248
  ASSERT_GT(cur_table_open_prefetch_tail_hit,
210
249
  prev_table_open_prefetch_tail_hit);
211
250
  ASSERT_GE(cur_table_open_prefetch_tail_miss,
212
251
  prev_table_open_prefetch_tail_miss);
213
- buff_prefetch_count = 0;
214
252
  }
215
253
 
216
254
  // count the keys
@@ -223,7 +261,7 @@ TEST_P(PrefetchTest, Basic) {
223
261
  (void)num_keys;
224
262
  }
225
263
 
226
- // Make sure prefetch is called only if file system support prefetch.
264
+ // To verify prefetch during user scan
227
265
  if (support_prefetch && !use_direct_io) {
228
266
  ASSERT_TRUE(fs->IsPrefetchCalled());
229
267
  fs->ClearPrefetchCount();
@@ -236,30 +274,79 @@ TEST_P(PrefetchTest, Basic) {
236
274
  Close();
237
275
  }
238
276
 
239
- TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) {
240
- const bool support_prefetch =
241
- std::get<0>(GetParam()) &&
242
- test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
243
- // Second param is if directIO is enabled or not
244
- const bool use_direct_io = std::get<1>(GetParam());
245
- const bool use_file_prefetch_buffer = !support_prefetch || use_direct_io;
277
+ class PrefetchTailTest : public PrefetchTest {
278
+ public:
279
+ bool SupportPrefetch() const {
280
+ return std::get<0>(GetParam()) &&
281
+ test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
282
+ }
246
283
 
247
- std::shared_ptr<MockFS> fs =
248
- std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
249
- std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
284
+ bool UseDirectIO() const { return std::get<1>(GetParam()); }
285
+
286
+ bool UseFilePrefetchBuffer() const {
287
+ return !SupportPrefetch() || UseDirectIO();
288
+ }
289
+
290
+ Env* GetEnv(bool small_buffer_alignment = false) const {
291
+ std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
292
+ env_->GetFileSystem(), SupportPrefetch(), small_buffer_alignment);
293
+
294
+ return new CompositeEnvWrapper(env_, fs);
295
+ }
296
+
297
+ void SetGenericOptions(Env* env, bool use_direct_io,
298
+ Options& options) override {
299
+ PrefetchTest::SetGenericOptions(env, use_direct_io, options);
300
+ options.statistics = CreateDBStatistics();
301
+ }
250
302
 
303
+ void SetBlockBasedTableOptions(
304
+ BlockBasedTableOptions& table_options, bool partition_filters = true,
305
+ uint64_t metadata_block_size =
306
+ BlockBasedTableOptions().metadata_block_size,
307
+ bool use_small_cache = false) {
308
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
309
+ table_options.partition_filters = partition_filters;
310
+ if (table_options.partition_filters) {
311
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
312
+ }
313
+ table_options.metadata_block_size = metadata_block_size;
314
+
315
+ if (use_small_cache) {
316
+ LRUCacheOptions co;
317
+ co.capacity = 1;
318
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
319
+ table_options.block_cache = cache;
320
+ }
321
+ }
322
+
323
+ int64_t GetNumIndexPartition() const {
324
+ int64_t index_partition_counts = 0;
325
+ TablePropertiesCollection all_table_props;
326
+ assert(db_->GetPropertiesOfAllTables(&all_table_props).ok());
327
+ for (const auto& name_and_table_props : all_table_props) {
328
+ const auto& table_props = name_and_table_props.second;
329
+ index_partition_counts += table_props->index_partitions;
330
+ }
331
+ return index_partition_counts;
332
+ }
333
+ };
334
+
335
+ INSTANTIATE_TEST_CASE_P(PrefetchTailTest, PrefetchTailTest,
336
+ ::testing::Combine(::testing::Bool(),
337
+ ::testing::Bool()));
338
+
339
+ TEST_P(PrefetchTailTest, Basic) {
340
+ std::unique_ptr<Env> env(GetEnv());
251
341
  Options options;
252
- SetGenericOptions(env.get(), use_direct_io, options);
253
- options.statistics = CreateDBStatistics();
342
+ SetGenericOptions(env.get(), UseDirectIO(), options);
254
343
 
255
344
  BlockBasedTableOptions bbto;
256
- bbto.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
257
- bbto.partition_filters = true;
258
- bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
345
+ SetBlockBasedTableOptions(bbto);
259
346
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
260
347
 
261
348
  Status s = TryReopen(options);
262
- if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
349
+ if (UseDirectIO() && (s.IsNotSupported() || s.IsInvalidArgument())) {
263
350
  // If direct IO is not supported, skip the test
264
351
  ROCKSDB_GTEST_BYPASS("Direct IO is not supported");
265
352
  return;
@@ -276,7 +363,7 @@ TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) {
276
363
  HistogramData post_flush_file_read;
277
364
  options.statistics->histogramData(FILE_READ_FLUSH_MICROS,
278
365
  &post_flush_file_read);
279
- if (use_file_prefetch_buffer) {
366
+ if (UseFilePrefetchBuffer()) {
280
367
  // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()`
281
368
  // should read from the prefetched tail in file prefetch buffer instead of
282
369
  // initiating extra SST reads. Therefore `BlockBasedTable::PrefetchTail()`
@@ -300,7 +387,7 @@ TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) {
300
387
  HistogramData post_compaction_file_read;
301
388
  options.statistics->histogramData(FILE_READ_COMPACTION_MICROS,
302
389
  &post_compaction_file_read);
303
- if (use_file_prefetch_buffer) {
390
+ if (UseFilePrefetchBuffer()) {
304
391
  // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()`
305
392
  // should read from the prefetched tail in file prefetch buffer instead of
306
393
  // initiating extra SST reads.
@@ -323,6 +410,85 @@ TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) {
323
410
  Close();
324
411
  }
325
412
 
413
+ TEST_P(PrefetchTailTest, UpgradeToTailSizeInManifest) {
414
+ if (!UseFilePrefetchBuffer()) {
415
+ ROCKSDB_GTEST_BYPASS(
416
+ "Upgrade to tail size in manifest is only relevant when RocksDB file "
417
+ "prefetch buffer is used.");
418
+ }
419
+ if (UseDirectIO()) {
420
+ ROCKSDB_GTEST_BYPASS(
421
+ "To simplify testing logics with setting file's buffer alignment to "
422
+ "be "
423
+ "1, direct IO is required to be disabled.");
424
+ }
425
+
426
+ std::unique_ptr<Env> env(GetEnv(true /* small_buffer_alignment */));
427
+ Options options;
428
+ SetGenericOptions(env.get(), false /* use_direct_io*/, options);
429
+ options.max_open_files = -1;
430
+ options.write_buffer_size = 1024 * 1024;
431
+
432
+ BlockBasedTableOptions table_options;
433
+ SetBlockBasedTableOptions(table_options, false /* partition_filters */,
434
+ 1 /* metadata_block_size*/,
435
+ true /* use_small_cache */);
436
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
437
+
438
+ SyncPoint::GetInstance()->EnableProcessing();
439
+ // To simulate a pre-upgrade DB where file tail size is not recorded in
440
+ // manifest
441
+ SyncPoint::GetInstance()->SetCallBack(
442
+ "FileMetaData::FileMetaData", [&](void* arg) {
443
+ FileMetaData* meta = static_cast<FileMetaData*>(arg);
444
+ meta->tail_size = 0;
445
+ });
446
+
447
+ ASSERT_OK(TryReopen(options));
448
+ for (int i = 0; i < 10000; ++i) {
449
+ ASSERT_OK(Put("k" + std::to_string(i), "v"));
450
+ }
451
+ ASSERT_OK(Flush());
452
+
453
+ SyncPoint::GetInstance()->ClearAllCallBacks();
454
+
455
+ // To simulate a DB undergoing the upgrade where tail size to prefetch is
456
+ // inferred to be a small number for files with no tail size recorded in
457
+ // manifest.
458
+ // "1" is chosen to be such number so that with `small_buffer_alignment ==
459
+ // true` and `use_small_cache == true`, it would have caused one file read
460
+ // per index partition during db open if the upgrade is done wrong.
461
+ SyncPoint::GetInstance()->SetCallBack(
462
+ "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
463
+ std::pair<size_t*, size_t*>* prefetch_off_len_pair =
464
+ static_cast<std::pair<size_t*, size_t*>*>(arg);
465
+ size_t* prefetch_off = prefetch_off_len_pair->first;
466
+ size_t* tail_size = prefetch_off_len_pair->second;
467
+ const size_t file_size = *prefetch_off + *tail_size;
468
+
469
+ *tail_size = 1;
470
+ *prefetch_off = file_size - (*tail_size);
471
+ });
472
+
473
+ ASSERT_OK(TryReopen(options));
474
+
475
+ SyncPoint::GetInstance()->ClearAllCallBacks();
476
+ SyncPoint::GetInstance()->DisableProcessing();
477
+
478
+ HistogramData db_open_file_read;
479
+ options.statistics->histogramData(FILE_READ_DB_OPEN_MICROS,
480
+ &db_open_file_read);
481
+
482
+ int64_t num_index_partition = GetNumIndexPartition();
483
+ // If the upgrade is done right, db open will prefetch all the index
484
+ // partitions at once, instead of doing one read per partition.
485
+ // That is, together with `metadata_block_size == 1`, there will be more
486
+ // index partitions than number of non index partitions reads.
487
+ ASSERT_LT(db_open_file_read.count, num_index_partition);
488
+
489
+ Close();
490
+ }
491
+
326
492
  // This test verifies BlockBasedTableOptions.max_auto_readahead_size is
327
493
  // configured dynamically.
328
494
  TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
@@ -385,7 +551,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
385
551
  }
386
552
  Close();
387
553
  std::vector<int> buff_prefectch_level_count = {0, 0, 0};
388
- TryReopen(options);
554
+ ASSERT_OK(TryReopen(options));
389
555
  {
390
556
  auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
391
557
  fs->ClearPrefetchCount();
@@ -513,7 +679,7 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) {
513
679
  }
514
680
  Close();
515
681
 
516
- TryReopen(options);
682
+ ASSERT_OK(TryReopen(options));
517
683
  {
518
684
  auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
519
685
  fs->ClearPrefetchCount();
@@ -530,8 +696,8 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) {
530
696
  "{initial_auto_readahead_size=0;}"}}));
531
697
  break;
532
698
  case 1:
533
- // intial_auto_readahead_size and max_auto_readahead_size are set same
534
- // so readahead_size remains same.
699
+ // initial_auto_readahead_size and max_auto_readahead_size are set
700
+ // same so readahead_size remains same.
535
701
  ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
536
702
  "{initial_auto_readahead_size=4096;max_"
537
703
  "auto_readahead_size=4096;}"}}));
@@ -628,7 +794,7 @@ TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) {
628
794
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
629
795
 
630
796
  Close();
631
- TryReopen(options);
797
+ ASSERT_OK(TryReopen(options));
632
798
 
633
799
  fs->ClearPrefetchCount();
634
800
  buff_prefetch_count = 0;
@@ -638,8 +804,9 @@ TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) {
638
804
  /*
639
805
  * Reseek keys from sequential Data Blocks within same partitioned
640
806
  * index. It will prefetch the data block at the first seek since
641
- * num_file_reads_for_auto_readahead = 0. Data Block size is nearly 4076 so
642
- * readahead will fetch 8 * 1024 data more initially (2 more data blocks).
807
+ * num_file_reads_for_auto_readahead = 0. Data Block size is nearly 4076
808
+ * so readahead will fetch 8 * 1024 data more initially (2 more data
809
+ * blocks).
643
810
  */
644
811
  iter->Seek(BuildKey(0)); // Prefetch data + index block since
645
812
  // num_file_reads_for_auto_readahead = 0.
@@ -737,8 +904,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
737
904
  /*
738
905
  * Reseek keys from sequential Data Blocks within same partitioned
739
906
  * index. After 2 sequential reads it will prefetch the data block.
740
- * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more
741
- * initially (2 more data blocks).
907
+ * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data
908
+ * more initially (2 more data blocks).
742
909
  */
743
910
  iter->Seek(BuildKey(0));
744
911
  ASSERT_TRUE(iter->Valid());
@@ -815,9 +982,9 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
815
982
  {
816
983
  /*
817
984
  * Reseek keys from sequential data blocks to set implicit auto readahead
818
- * and prefetch data but after that iterate over different (non sequential)
819
- * data blocks which won't prefetch any data further. So buff_prefetch_count
820
- * will be 1 for the first one.
985
+ * and prefetch data but after that iterate over different (non
986
+ * sequential) data blocks which won't prefetch any data further. So
987
+ * buff_prefetch_count will be 1 for the first one.
821
988
  */
822
989
  auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
823
990
  iter->Seek(BuildKey(0));
@@ -844,8 +1011,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
844
1011
  buff_prefetch_count = 0;
845
1012
  }
846
1013
 
847
- // Read sequentially to confirm readahead_size is reset to initial value (2
848
- // more data blocks)
1014
+ // Read sequentially to confirm readahead_size is reset to initial value
1015
+ // (2 more data blocks)
849
1016
  iter->Seek(BuildKey(1011));
850
1017
  ASSERT_TRUE(iter->Valid());
851
1018
  iter->Seek(BuildKey(1015));
@@ -895,8 +1062,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
895
1062
  }
896
1063
  {
897
1064
  /*
898
- * Reseek over different keys from different blocks. buff_prefetch_count is
899
- * set 0.
1065
+ * Reseek over different keys from different blocks. buff_prefetch_count
1066
+ * is set 0.
900
1067
  */
901
1068
  auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
902
1069
  int i = 0;
@@ -1000,8 +1167,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) {
1000
1167
  /*
1001
1168
  * Reseek keys from sequential Data Blocks within same partitioned
1002
1169
  * index. After 2 sequential reads it will prefetch the data block.
1003
- * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more
1004
- * initially (2 more data blocks).
1170
+ * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data
1171
+ * more initially (2 more data blocks).
1005
1172
  */
1006
1173
  auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
1007
1174
  // Warm up the cache
@@ -1028,8 +1195,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) {
1028
1195
  ASSERT_TRUE(iter->Valid());
1029
1196
  iter->Seek(BuildKey(1004)); // Prefetch data (not in cache).
1030
1197
  ASSERT_TRUE(iter->Valid());
1031
- // Missed one sequential block but next is in already in buffer so readahead
1032
- // will not be reset.
1198
+ // Missed one sequential block but next is already in buffer so
1199
+ // readahead will not be reset.
1033
1200
  iter->Seek(BuildKey(1011));
1034
1201
  ASSERT_TRUE(iter->Valid());
1035
1202
  // Prefetch data but blocks are in cache so no prefetch and reset.
@@ -1164,10 +1331,14 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) {
1164
1331
  // This test verifies the functionality of ReadOptions.adaptive_readahead when
1165
1332
  // async_io is enabled.
1166
1333
  TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) {
1334
+ if (mem_env_ || encrypted_env_) {
1335
+ ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment");
1336
+ return;
1337
+ }
1167
1338
  const int kNumKeys = 1000;
1168
1339
  // Set options
1169
1340
  std::shared_ptr<MockFS> fs =
1170
- std::make_shared<MockFS>(env_->GetFileSystem(), false);
1341
+ std::make_shared<MockFS>(FileSystem::Default(), false);
1171
1342
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
1172
1343
 
1173
1344
  bool use_direct_io = std::get<0>(GetParam());
@@ -1201,16 +1372,26 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) {
1201
1372
  }
1202
1373
  MoveFilesToLevel(2);
1203
1374
  int buff_async_prefetch_count = 0;
1375
+ int buff_prefetch_count = 0;
1204
1376
  int readahead_carry_over_count = 0;
1205
1377
  int num_sst_files = NumTableFilesAtLevel(2);
1206
1378
  size_t current_readahead_size = 0;
1379
+ bool read_async_called = false;
1207
1380
 
1208
1381
  // Test - Iterate over the keys sequentially.
1209
1382
  {
1383
+ SyncPoint::GetInstance()->SetCallBack(
1384
+ "FilePrefetchBuffer::Prefetch:Start",
1385
+ [&](void*) { buff_prefetch_count++; });
1386
+
1210
1387
  SyncPoint::GetInstance()->SetCallBack(
1211
1388
  "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
1212
1389
  [&](void*) { buff_async_prefetch_count++; });
1213
1390
 
1391
+ SyncPoint::GetInstance()->SetCallBack(
1392
+ "UpdateResults::io_uring_result",
1393
+ [&](void* /*arg*/) { read_async_called = true; });
1394
+
1214
1395
  // The callback checks, since reads are sequential, readahead_size doesn't
1215
1396
  // start from 8KB when iterator moves to next file and its called
1216
1397
  // num_sst_files-1 times (excluding for first file).
@@ -1253,15 +1434,18 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) {
1253
1434
  } else {
1254
1435
  ASSERT_EQ(readahead_carry_over_count, 0);
1255
1436
  }
1256
- ASSERT_GT(buff_async_prefetch_count, 0);
1257
1437
 
1258
1438
  // Check stats to make sure async prefetch is done.
1259
1439
  {
1260
1440
  HistogramData async_read_bytes;
1261
1441
  options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
1262
- if (ro.async_io) {
1442
+ // Not all platforms support io_uring. In that case, ReadAsync in posix
1443
+ // won't submit async requests.
1444
+ if (read_async_called) {
1445
+ ASSERT_GT(buff_async_prefetch_count, 0);
1263
1446
  ASSERT_GT(async_read_bytes.count, 0);
1264
1447
  } else {
1448
+ ASSERT_GT(buff_prefetch_count, 0);
1265
1449
  ASSERT_EQ(async_read_bytes.count, 0);
1266
1450
  }
1267
1451
  }
@@ -1294,6 +1478,7 @@ TEST_P(PrefetchTest, DBIterAsyncIONoIOUring) {
1294
1478
  Status s = TryReopen(options);
1295
1479
  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
1296
1480
  // If direct IO is not supported, skip the test
1481
+ enable_io_uring = true;
1297
1482
  return;
1298
1483
  } else {
1299
1484
  ASSERT_OK(s);
@@ -1375,7 +1560,8 @@ class PrefetchTest1 : public DBTestBase,
1375
1560
  public:
1376
1561
  PrefetchTest1() : DBTestBase("prefetch_test1", true) {}
1377
1562
 
1378
- void SetGenericOptions(Env* env, bool use_direct_io, Options& options) {
1563
+ virtual void SetGenericOptions(Env* env, bool use_direct_io,
1564
+ Options& options) {
1379
1565
  options = CurrentOptions();
1380
1566
  options.write_buffer_size = 1024;
1381
1567
  options.create_if_missing = true;
@@ -1399,6 +1585,106 @@ class PrefetchTest1 : public DBTestBase,
1399
1585
 
1400
1586
  INSTANTIATE_TEST_CASE_P(PrefetchTest1, PrefetchTest1, ::testing::Bool());
1401
1587
 
1588
+ TEST_P(PrefetchTest1, SeekWithExtraPrefetchAsyncIO) {
1589
+ const int kNumKeys = 2000;
1590
+ // Set options
1591
+ std::shared_ptr<MockFS> fs =
1592
+ std::make_shared<MockFS>(env_->GetFileSystem(), false);
1593
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
1594
+
1595
+ Options options;
1596
+ SetGenericOptions(env.get(), GetParam(), options);
1597
+ options.statistics = CreateDBStatistics();
1598
+ BlockBasedTableOptions table_options;
1599
+ SetBlockBasedTableOptions(table_options);
1600
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
1601
+
1602
+ Status s = TryReopen(options);
1603
+ if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
1604
+ // If direct IO is not supported, skip the test
1605
+ return;
1606
+ } else {
1607
+ ASSERT_OK(s);
1608
+ }
1609
+
1610
+ WriteBatch batch;
1611
+ Random rnd(309);
1612
+ for (int i = 0; i < kNumKeys; i++) {
1613
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
1614
+ }
1615
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
1616
+
1617
+ std::string start_key = BuildKey(0);
1618
+ std::string end_key = BuildKey(kNumKeys - 1);
1619
+ Slice least(start_key.data(), start_key.size());
1620
+ Slice greatest(end_key.data(), end_key.size());
1621
+
1622
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
1623
+ Close();
1624
+
1625
+ for (size_t i = 0; i < 3; i++) {
1626
+ table_options.num_file_reads_for_auto_readahead = i;
1627
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
1628
+
1629
+ s = TryReopen(options);
1630
+ ASSERT_OK(s);
1631
+
1632
+ int buff_prefetch_count = 0;
1633
+ int extra_prefetch_buff_cnt = 0;
1634
+ SyncPoint::GetInstance()->SetCallBack(
1635
+ "FilePrefetchBuffer::PrefetchAsync:ExtraPrefetching",
1636
+ [&](void*) { extra_prefetch_buff_cnt++; });
1637
+
1638
+ SyncPoint::GetInstance()->SetCallBack(
1639
+ "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
1640
+ [&](void*) { buff_prefetch_count++; });
1641
+
1642
+ SyncPoint::GetInstance()->EnableProcessing();
1643
+
1644
+ ReadOptions ro;
1645
+ ro.async_io = true;
1646
+ {
1647
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
1648
+ // First Seek
1649
+ iter->Seek(BuildKey(
1650
+ 0)); // Prefetch data on seek because of seek parallelization.
1651
+ ASSERT_TRUE(iter->Valid());
1652
+
1653
+ // Do extra prefetching in Seek only if
1654
+ // num_file_reads_for_auto_readahead = 0.
1655
+ ASSERT_EQ(extra_prefetch_buff_cnt, (i == 0 ? 1 : 0));
1656
+ // buff_prefetch_count is 2 because of index block when
1657
+ // num_file_reads_for_auto_readahead = 0.
1658
+ // If num_file_reads_for_auto_readahead > 0, index block isn't
1659
+ // prefetched.
1660
+ ASSERT_EQ(buff_prefetch_count, i == 0 ? 2 : 1);
1661
+
1662
+ extra_prefetch_buff_cnt = 0;
1663
+ buff_prefetch_count = 0;
1664
+ // Reset all values of FilePrefetchBuffer on new seek.
1665
+ iter->Seek(
1666
+ BuildKey(22)); // Prefetch data because of seek parallelization.
1667
+ ASSERT_TRUE(iter->Valid());
1668
+ // Do extra prefetching in Seek only if
1669
+ // num_file_reads_for_auto_readahead = 0.
1670
+ ASSERT_EQ(extra_prefetch_buff_cnt, (i == 0 ? 1 : 0));
1671
+ ASSERT_EQ(buff_prefetch_count, 1);
1672
+
1673
+ extra_prefetch_buff_cnt = 0;
1674
+ buff_prefetch_count = 0;
1675
+ // Reset all values of FilePrefetchBuffer on new seek.
1676
+ iter->Seek(
1677
+ BuildKey(33)); // Prefetch data because of seek parallelization.
1678
+ ASSERT_TRUE(iter->Valid());
1679
+ // Do extra prefetching in Seek only if
1680
+ // num_file_reads_for_auto_readahead = 0.
1681
+ ASSERT_EQ(extra_prefetch_buff_cnt, (i == 0 ? 1 : 0));
1682
+ ASSERT_EQ(buff_prefetch_count, 1);
1683
+ }
1684
+ Close();
1685
+ }
1686
+ }
1687
+
1402
1688
  // This test verifies the functionality of ReadOptions.adaptive_readahead when
1403
1689
  // reads are not sequential.
1404
1690
  TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) {
@@ -1482,8 +1768,8 @@ TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) {
1482
1768
  Close();
1483
1769
  }
1484
1770
 
1485
- // This test verifies the functionality of adaptive_readaheadsize with cache and
1486
- // if block is found in cache, decrease the readahead_size if
1771
+ // This test verifies the functionality of adaptive_readaheadsize with cache
1772
+ // and if block is found in cache, decrease the readahead_size if
1487
1773
  // - its enabled internally by RocksDB (implicit_auto_readahead_) and,
1488
1774
  // - readahead_size is greater than 0 and,
1489
1775
  // - the block would have called prefetch API if not found in cache for
@@ -1605,8 +1891,8 @@ TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) {
1605
1891
  ASSERT_TRUE(iter->Valid());
1606
1892
 
1607
1893
  // Prefetch data (not in buffer) but found in cache. So decrease
1608
- // readahead_size. Since it will 0 after decrementing so readahead_size will
1609
- // be set to initial value.
1894
+ // readahead_size. Since it will be 0 after decrementing, readahead_size
1895
+ // will be set to initial value.
1610
1896
  iter->Seek(BuildKey(1019));
1611
1897
  ASSERT_TRUE(iter->Valid());
1612
1898
  expected_current_readahead_size = std::max(
@@ -1629,10 +1915,14 @@ TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) {
1629
1915
  // This test verifies the basic functionality of seek parallelization for
1630
1916
  // async_io.
1631
1917
  TEST_P(PrefetchTest1, SeekParallelizationTest) {
1918
+ if (mem_env_ || encrypted_env_) {
1919
+ ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment");
1920
+ return;
1921
+ }
1632
1922
  const int kNumKeys = 2000;
1633
1923
  // Set options
1634
- std::shared_ptr<MockFS> fs =
1635
- std::make_shared<MockFS>(env_->GetFileSystem(), false);
1924
+ std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
1925
+ FileSystem::Default(), /*support_prefetch=*/false);
1636
1926
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
1637
1927
 
1638
1928
  Options options;
@@ -1665,10 +1955,19 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) {
1665
1955
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
1666
1956
 
1667
1957
  int buff_prefetch_count = 0;
1958
+ int buff_prefetch_async_count = 0;
1668
1959
 
1669
1960
  SyncPoint::GetInstance()->SetCallBack(
1670
1961
  "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
1671
- [&](void*) { buff_prefetch_count++; });
1962
+ [&](void*) { buff_prefetch_async_count++; });
1963
+
1964
+ SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
1965
+ [&](void*) { buff_prefetch_count++; });
1966
+
1967
+ bool read_async_called = false;
1968
+ SyncPoint::GetInstance()->SetCallBack(
1969
+ "UpdateResults::io_uring_result",
1970
+ [&](void* /*arg*/) { read_async_called = true; });
1672
1971
 
1673
1972
  SyncPoint::GetInstance()->EnableProcessing();
1674
1973
  ReadOptions ro;
@@ -1703,17 +2002,276 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) {
1703
2002
  iter->Next();
1704
2003
  ASSERT_TRUE(iter->Valid());
1705
2004
 
1706
- ASSERT_EQ(buff_prefetch_count, 2);
1707
-
1708
- // Check stats to make sure async prefetch is done.
1709
- {
1710
- HistogramData async_read_bytes;
1711
- options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
2005
+ HistogramData async_read_bytes;
2006
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
2007
+ // not all platforms support io_uring. In that case it'll fallback to
2008
+ // normal prefetching without async_io.
2009
+ if (read_async_called) {
2010
+ ASSERT_EQ(buff_prefetch_async_count, 2);
1712
2011
  ASSERT_GT(async_read_bytes.count, 0);
1713
2012
  ASSERT_GT(get_perf_context()->number_async_seek, 0);
2013
+ } else {
2014
+ ASSERT_EQ(buff_prefetch_count, 1);
2015
+ }
2016
+ }
2017
+ Close();
2018
+ }
2019
+
2020
+ // This test checks if readahead_size is trimmed when upper_bound is reached.
2021
+ // It tests with different combinations of async_io disabled/enabled,
2022
+ // readahead_size (implicit and explicit), and num_file_reads_for_auto_readahead
2023
+ // from 0 to 2.
2024
+ TEST_P(PrefetchTest, IterReadAheadSizeWithUpperBound) {
2025
+ if (mem_env_ || encrypted_env_) {
2026
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
2027
+ return;
2028
+ }
2029
+
2030
+ // MockFS is created with support_prefetch = false.
2031
+ std::shared_ptr<MockFS> fs =
2032
+ std::make_shared<MockFS>(FileSystem::Default(), false);
2033
+
2034
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
2035
+ Options options;
2036
+ SetGenericOptions(env.get(), /*use_direct_io=*/false, options);
2037
+ options.statistics = CreateDBStatistics();
2038
+ BlockBasedTableOptions table_options;
2039
+ SetBlockBasedTableOptions(table_options);
2040
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
2041
+
2042
+ Status s = TryReopen(options);
2043
+ ASSERT_OK(s);
2044
+
2045
+ Random rnd(309);
2046
+ WriteBatch batch;
2047
+
2048
+ for (int i = 0; i < 26; i++) {
2049
+ std::string key = "my_key_";
2050
+
2051
+ for (int j = 0; j < 10; j++) {
2052
+ key += char('a' + i);
2053
+ ASSERT_OK(batch.Put(key, rnd.RandomString(1000)));
1714
2054
  }
2055
+ }
2056
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
1715
2057
 
2058
+ std::string start_key = "my_key_a";
2059
+
2060
+ std::string end_key = "my_key_";
2061
+ for (int j = 0; j < 10; j++) {
2062
+ end_key += char('a' + 25);
2063
+ }
2064
+
2065
+ Slice least(start_key.data(), start_key.size());
2066
+ Slice greatest(end_key.data(), end_key.size());
2067
+
2068
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
2069
+
2070
+ int buff_prefetch_count = 0;
2071
+
2072
+ // Try with different num_file_reads_for_auto_readahead from 0 to 2.
2073
+ for (size_t i = 0; i < 3; i++) {
2074
+ table_options.num_file_reads_for_auto_readahead = i;
2075
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
2076
+
2077
+ s = TryReopen(options);
2078
+ ASSERT_OK(s);
2079
+
2080
+ int buff_count_with_tuning = 0, buff_count_without_tuning = 0;
2081
+ int keys_with_tuning = 0, keys_without_tuning = 0;
1716
2082
  buff_prefetch_count = 0;
2083
+
2084
+ SyncPoint::GetInstance()->SetCallBack(
2085
+ "FilePrefetchBuffer::Prefetch:Start",
2086
+ [&](void*) { buff_prefetch_count++; });
2087
+
2088
+ SyncPoint::GetInstance()->SetCallBack(
2089
+ "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
2090
+ [&](void*) { buff_prefetch_count++; });
2091
+
2092
+ SyncPoint::GetInstance()->EnableProcessing();
2093
+
2094
+ ReadOptions ropts;
2095
+ if (std::get<0>(GetParam())) {
2096
+ ropts.readahead_size = 32768;
2097
+ }
2098
+ if (std::get<1>(GetParam())) {
2099
+ ropts.async_io = true;
2100
+ }
2101
+
2102
+ Slice ub = Slice("my_key_uuu");
2103
+ ropts.iterate_upper_bound = &ub;
2104
+ Slice seek_key = Slice("my_key_aaa");
2105
+
2106
+ // With tuning readahead_size.
2107
+ {
2108
+ ASSERT_OK(options.statistics->Reset());
2109
+ ropts.auto_readahead_size = true;
2110
+
2111
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ropts));
2112
+
2113
+ iter->Seek(seek_key);
2114
+
2115
+ while (iter->Valid()) {
2116
+ keys_with_tuning++;
2117
+ iter->Next();
2118
+ }
2119
+
2120
+ uint64_t readhahead_trimmed =
2121
+ options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED);
2122
+ ASSERT_GT(readhahead_trimmed, 0);
2123
+ buff_count_with_tuning = buff_prefetch_count;
2124
+ }
2125
+
2126
+ // Without tuning readahead_size
2127
+ {
2128
+ buff_prefetch_count = 0;
2129
+ ASSERT_OK(options.statistics->Reset());
2130
+ ropts.auto_readahead_size = false;
2131
+
2132
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ropts));
2133
+
2134
+ iter->Seek(seek_key);
2135
+
2136
+ while (iter->Valid()) {
2137
+ keys_without_tuning++;
2138
+ iter->Next();
2139
+ }
2140
+ buff_count_without_tuning = buff_prefetch_count;
2141
+ uint64_t readhahead_trimmed =
2142
+ options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED);
2143
+ ASSERT_EQ(readhahead_trimmed, 0);
2144
+ }
2145
+
2146
+ {
2147
+ // Verify results with and without tuning.
2148
+ if (std::get<1>(GetParam())) {
2149
+ // In case of async_io.
2150
+ ASSERT_GE(buff_count_with_tuning, buff_count_without_tuning);
2151
+ } else {
2152
+ ASSERT_EQ(buff_count_without_tuning, buff_count_with_tuning);
2153
+ }
2154
+ // Prefetching should happen.
2155
+ ASSERT_GT(buff_count_without_tuning, 0);
2156
+ ASSERT_GT(buff_count_with_tuning, 0);
2157
+ // No of keys should be equal.
2158
+ ASSERT_EQ(keys_without_tuning, keys_with_tuning);
2159
+ }
2160
+ Close();
2161
+ }
2162
+ }
2163
+
2164
+ // This test checks if readahead_size is trimmed when upper_bound is reached
2165
+ // during Seek in async_io and it goes for polling without any extra
2166
+ // prefetching.
2167
+ TEST_P(PrefetchTest, IterReadAheadSizeWithUpperBoundSeekOnly) {
2168
+ if (mem_env_ || encrypted_env_) {
2169
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
2170
+ return;
2171
+ }
2172
+
2173
+ // First param is if the mockFS support_prefetch or not
2174
+ std::shared_ptr<MockFS> fs =
2175
+ std::make_shared<MockFS>(FileSystem::Default(), false);
2176
+
2177
+ bool use_direct_io = false;
2178
+ if (std::get<0>(GetParam())) {
2179
+ use_direct_io = true;
2180
+ }
2181
+
2182
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
2183
+ Options options;
2184
+ SetGenericOptions(env.get(), use_direct_io, options);
2185
+ options.statistics = CreateDBStatistics();
2186
+ BlockBasedTableOptions table_options;
2187
+ SetBlockBasedTableOptions(table_options);
2188
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
2189
+
2190
+ Status s = TryReopen(options);
2191
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
2192
+ // If direct IO is not supported, skip the test
2193
+ return;
2194
+ } else {
2195
+ ASSERT_OK(s);
2196
+ }
2197
+
2198
+ Random rnd(309);
2199
+ WriteBatch batch;
2200
+
2201
+ for (int i = 0; i < 26; i++) {
2202
+ std::string key = "my_key_";
2203
+
2204
+ for (int j = 0; j < 10; j++) {
2205
+ key += char('a' + i);
2206
+ ASSERT_OK(batch.Put(key, rnd.RandomString(1000)));
2207
+ }
2208
+ }
2209
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
2210
+
2211
+ std::string start_key = "my_key_a";
2212
+
2213
+ std::string end_key = "my_key_";
2214
+ for (int j = 0; j < 10; j++) {
2215
+ end_key += char('a' + 25);
2216
+ }
2217
+
2218
+ Slice least(start_key.data(), start_key.size());
2219
+ Slice greatest(end_key.data(), end_key.size());
2220
+
2221
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
2222
+
2223
+ s = TryReopen(options);
2224
+ ASSERT_OK(s);
2225
+
2226
+ int buff_count_with_tuning = 0;
2227
+
2228
+ SyncPoint::GetInstance()->SetCallBack(
2229
+ "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
2230
+ [&](void*) { buff_count_with_tuning++; });
2231
+
2232
+ bool read_async_called = false;
2233
+ SyncPoint::GetInstance()->SetCallBack(
2234
+ "UpdateResults::io_uring_result",
2235
+ [&](void* /*arg*/) { read_async_called = true; });
2236
+
2237
+ SyncPoint::GetInstance()->EnableProcessing();
2238
+
2239
+ SyncPoint::GetInstance()->EnableProcessing();
2240
+
2241
+ ReadOptions ropts;
2242
+ if (std::get<1>(GetParam())) {
2243
+ ropts.readahead_size = 32768;
2244
+ }
2245
+ ropts.async_io = true;
2246
+
2247
+ Slice ub = Slice("my_key_aaa");
2248
+ ropts.iterate_upper_bound = &ub;
2249
+ Slice seek_key = Slice("my_key_aaa");
2250
+
2251
+ // With tuning readahead_size.
2252
+ {
2253
+ ASSERT_OK(options.statistics->Reset());
2254
+ ropts.auto_readahead_size = true;
2255
+
2256
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ropts));
2257
+
2258
+ iter->Seek(seek_key);
2259
+
2260
+ ASSERT_OK(iter->status());
2261
+
2262
+ // Verify results.
2263
+ uint64_t readhahead_trimmed =
2264
+ options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED);
2265
+ // Readahead got trimmed.
2266
+ if (read_async_called) {
2267
+ ASSERT_GT(readhahead_trimmed, 0);
2268
+ // Seek called PrefetchAsync to poll the data.
2269
+ ASSERT_EQ(1, buff_count_with_tuning);
2270
+ } else {
2271
+ // async_io disabled.
2272
+ ASSERT_GE(readhahead_trimmed, 0);
2273
+ ASSERT_EQ(0, buff_count_with_tuning);
2274
+ }
1717
2275
  }
1718
2276
  Close();
1719
2277
  }
@@ -2294,7 +2852,7 @@ TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) {
2294
2852
  std::unique_ptr<RandomAccessFileReader> r;
2295
2853
  Read(fname, opts, &r);
2296
2854
 
2297
- FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, fs());
2855
+ FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, 0, fs());
2298
2856
  Slice result;
2299
2857
  // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings,
2300
2858
  // it will do two reads of 4096+8192 and 8192
@@ -2310,8 +2868,93 @@ TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) {
2310
2868
  fpb.UpdateReadPattern(0, 4096, false);
2311
2869
  // Now read some data that straddles the two prefetch buffers - offset 8192 to
2312
2870
  // 16384
2313
- ASSERT_TRUE(fpb.TryReadFromCacheAsync(IOOptions(), r.get(), 8192, 8192,
2314
- &result, &s, Env::IOPriority::IO_LOW));
2871
+ IOOptions io_opts;
2872
+ io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
2873
+ ASSERT_TRUE(
2874
+ fpb.TryReadFromCacheAsync(io_opts, r.get(), 8192, 8192, &result, &s));
2875
+ }
2876
+
2877
+ // Test to ensure when PrefetchAsync is called during seek, it doesn't do any
2878
+ // alignment or prefetch extra if readahead is not enabled during seek.
2879
+ TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) {
2880
+ std::string fname = "seek-wwithout-alignment";
2881
+ Random rand(0);
2882
+ std::string content = rand.RandomString(32768);
2883
+ Write(fname, content);
2884
+
2885
+ FileOptions opts;
2886
+ std::unique_ptr<RandomAccessFileReader> r;
2887
+ Read(fname, opts, &r);
2888
+
2889
+ size_t alignment = r->file()->GetRequiredBufferAlignment();
2890
+ size_t n = alignment / 2;
2891
+
2892
+ int read_async_called = 0;
2893
+ SyncPoint::GetInstance()->SetCallBack(
2894
+ "FilePrefetchBuffer::ReadAsync",
2895
+ [&](void* /*arg*/) { read_async_called++; });
2896
+ SyncPoint::GetInstance()->EnableProcessing();
2897
+
2898
+ // Without readahead enabled, there will be no alignment and offset of buffer
2899
+ // will be n.
2900
+ {
2901
+ FilePrefetchBuffer fpb(
2902
+ /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true,
2903
+ /*track_min_offset=*/false, /*implicit_auto_readahead=*/true,
2904
+ /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2,
2905
+ /*upper_bound_offset=*/0, fs());
2906
+
2907
+ Slice result;
2908
+ // Simulate a seek of half of alignment bytes at offset n. Due to the
2909
+ // readahead settings, it won't prefetch extra or do any alignment and
2910
+ // offset of buffer will be n.
2911
+ Status s = fpb.PrefetchAsync(IOOptions(), r.get(), n, n, &result);
2912
+
2913
+ // Platforms that don't have IO uring may not support async IO.
2914
+ if (s.IsNotSupported()) {
2915
+ return;
2916
+ }
2917
+
2918
+ ASSERT_TRUE(s.IsTryAgain());
2919
+
2920
+ IOOptions io_opts;
2921
+ io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
2922
+ ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), n, n, &result, &s));
2923
+
2924
+ if (read_async_called) {
2925
+ ASSERT_EQ(fpb.GetPrefetchOffset(), n);
2926
+ }
2927
+ }
2928
+
2929
+ // With readahead enabled, it will do the alignment and prefetch and offset of
2930
+ // buffer will be 0.
2931
+ {
2932
+ read_async_called = false;
2933
+ FilePrefetchBuffer fpb(
2934
+ /*readahead_size=*/16384, /*max_readahead_size=*/16384, /*enable=*/true,
2935
+ /*track_min_offset=*/false, /*implicit_auto_readahead=*/false,
2936
+ /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2,
2937
+ /*upper_bound_offset=*/0, fs());
2938
+
2939
+ Slice result;
2940
+ // Simulate a seek of half of alignment bytes at offset n.
2941
+ Status s = fpb.PrefetchAsync(IOOptions(), r.get(), n, n, &result);
2942
+
2943
+ // Platforms that don't have IO uring may not support async IO.
2944
+ if (s.IsNotSupported()) {
2945
+ return;
2946
+ }
2947
+
2948
+ ASSERT_TRUE(s.IsTryAgain());
2949
+
2950
+ IOOptions io_opts;
2951
+ io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
2952
+ ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), n, n, &result, &s));
2953
+
2954
+ if (read_async_called) {
2955
+ ASSERT_EQ(fpb.GetPrefetchOffset(), 0);
2956
+ }
2957
+ }
2315
2958
  }
2316
2959
 
2317
2960
  TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) {
@@ -2327,7 +2970,8 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) {
2327
2970
  FilePrefetchBuffer fpb(
2328
2971
  /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true,
2329
2972
  /*track_min_offset=*/false, /*implicit_auto_readahead=*/false,
2330
- /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, fs());
2973
+ /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0,
2974
+ /*upper_bound_offset=*/0, fs());
2331
2975
 
2332
2976
  int read_async_called = 0;
2333
2977
  SyncPoint::GetInstance()->SetCallBack(
@@ -2346,9 +2990,10 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) {
2346
2990
  }
2347
2991
 
2348
2992
  ASSERT_TRUE(s.IsTryAgain());
2349
- ASSERT_TRUE(fpb.TryReadFromCacheAsync(IOOptions(), r.get(), /*offset=*/3000,
2350
- /*length=*/4000, &async_result, &s,
2351
- Env::IOPriority::IO_LOW));
2993
+ IOOptions io_opts;
2994
+ io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
2995
+ ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), /*offset=*/3000,
2996
+ /*length=*/4000, &async_result, &s));
2352
2997
  // No sync call should be made.
2353
2998
  HistogramData sst_read_micros;
2354
2999
  stats()->histogramData(SST_READ_MICROS, &sst_read_micros);