@nxtedition/rocksdb 13.1.4 → 13.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/binding.cc +43 -16
  2. package/deps/rocksdb/rocksdb/{TARGETS → BUCK} +27 -0
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -1
  4. package/deps/rocksdb/rocksdb/Makefile +2 -2
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +3 -1
  6. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
  7. package/deps/rocksdb/rocksdb/db/attribute_group_iterator_impl.h +34 -9
  8. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +7 -6
  9. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +5 -1
  10. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +22 -14
  11. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
  12. package/deps/rocksdb/rocksdb/db/builder.cc +13 -24
  13. package/deps/rocksdb/rocksdb/db/coalescing_iterator.h +35 -10
  14. package/deps/rocksdb/rocksdb/db/column_family.cc +21 -10
  15. package/deps/rocksdb/rocksdb/db/column_family.h +15 -8
  16. package/deps/rocksdb/rocksdb/db/column_family_test.cc +98 -7
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +126 -16
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +51 -5
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +2 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -8
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +24 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +52 -22
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +9 -7
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +36 -9
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +6 -0
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +30 -17
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +26 -23
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +43 -33
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +6 -5
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +19 -9
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +6 -5
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +632 -411
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +171 -51
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +7 -5
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +37 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +51 -11
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +10 -3
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +350 -154
  39. package/deps/rocksdb/rocksdb/db/convenience.cc +1 -1
  40. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +62 -27
  41. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +68 -1
  42. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +91 -0
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +134 -70
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +71 -23
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -16
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +47 -33
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +27 -19
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +38 -25
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -3
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +7 -4
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +258 -42
  52. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +161 -9
  53. package/deps/rocksdb/rocksdb/db/db_iter.cc +118 -86
  54. package/deps/rocksdb/rocksdb/db/db_iter.h +44 -17
  55. package/deps/rocksdb/rocksdb/db/db_options_test.cc +27 -6
  56. package/deps/rocksdb/rocksdb/db/db_test.cc +48 -16
  57. package/deps/rocksdb/rocksdb/db/db_test2.cc +60 -15
  58. package/deps/rocksdb/rocksdb/db/db_test_util.cc +97 -44
  59. package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -1
  60. package/deps/rocksdb/rocksdb/db/dbformat.cc +15 -5
  61. package/deps/rocksdb/rocksdb/db/dbformat.h +137 -55
  62. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  63. package/deps/rocksdb/rocksdb/db/experimental.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +663 -8
  65. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +152 -91
  66. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +134 -11
  67. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +55 -9
  68. package/deps/rocksdb/rocksdb/db/flush_job.cc +52 -29
  69. package/deps/rocksdb/rocksdb/db/flush_job.h +5 -3
  70. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +18 -12
  71. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +23 -29
  72. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +3 -2
  73. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +2 -0
  74. package/deps/rocksdb/rocksdb/db/internal_stats.cc +9 -6
  75. package/deps/rocksdb/rocksdb/db/internal_stats.h +54 -0
  76. package/deps/rocksdb/rocksdb/db/job_context.h +1 -1
  77. package/deps/rocksdb/rocksdb/db/log_reader.cc +6 -7
  78. package/deps/rocksdb/rocksdb/db/manifest_ops.cc +47 -0
  79. package/deps/rocksdb/rocksdb/db/manifest_ops.h +20 -0
  80. package/deps/rocksdb/rocksdb/db/memtable.cc +165 -64
  81. package/deps/rocksdb/rocksdb/db/memtable.h +422 -243
  82. package/deps/rocksdb/rocksdb/db/memtable_list.cc +99 -68
  83. package/deps/rocksdb/rocksdb/db/memtable_list.h +63 -38
  84. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +28 -25
  85. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +118 -60
  86. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_test.cc +344 -89
  87. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +2 -3
  88. package/deps/rocksdb/rocksdb/db/repair.cc +15 -14
  89. package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -13
  90. package/deps/rocksdb/rocksdb/db/snapshot_checker.h +7 -0
  91. package/deps/rocksdb/rocksdb/db/table_cache.cc +62 -65
  92. package/deps/rocksdb/rocksdb/db/table_cache.h +70 -76
  93. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +5 -6
  94. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
  95. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +8 -7
  96. package/deps/rocksdb/rocksdb/db/version_builder.cc +17 -19
  97. package/deps/rocksdb/rocksdb/db/version_builder.h +13 -12
  98. package/deps/rocksdb/rocksdb/db/version_edit.h +30 -0
  99. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +3 -5
  100. package/deps/rocksdb/rocksdb/db/version_set.cc +89 -129
  101. package/deps/rocksdb/rocksdb/db/version_set.h +12 -4
  102. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -2
  103. package/deps/rocksdb/rocksdb/db/version_set_test.cc +12 -8
  104. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +0 -15
  105. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -2
  106. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +9 -7
  107. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +0 -8
  108. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +28 -2
  109. package/deps/rocksdb/rocksdb/db/write_batch.cc +32 -10
  110. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +9 -0
  111. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
  112. package/deps/rocksdb/rocksdb/db/write_thread.cc +3 -1
  113. package/deps/rocksdb/rocksdb/db/write_thread.h +6 -2
  114. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +15 -0
  115. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +7 -0
  116. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  117. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -2
  118. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +100 -22
  119. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -4
  120. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +34 -8
  121. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +223 -78
  122. package/deps/rocksdb/rocksdb/env/file_system.cc +6 -1
  123. package/deps/rocksdb/rocksdb/env/fs_posix.cc +53 -0
  124. package/deps/rocksdb/rocksdb/env/io_posix.cc +63 -17
  125. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  126. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +132 -48
  127. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +92 -24
  128. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +727 -109
  129. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +3 -4
  130. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +1 -1
  131. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
  132. package/deps/rocksdb/rocksdb/include/rocksdb/attribute_groups.h +20 -1
  133. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +9 -0
  134. package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +9 -5
  135. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +2 -0
  136. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +10 -2
  137. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +7 -0
  139. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +34 -37
  140. package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +21 -0
  141. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +56 -28
  142. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -0
  143. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +36 -28
  144. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +11 -0
  145. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
  146. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +84 -60
  147. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +102 -0
  148. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +89 -2
  149. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +32 -0
  150. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +30 -1
  151. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +23 -2
  152. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  153. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -0
  154. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +79 -21
  155. package/deps/rocksdb/rocksdb/memtable/skiplist.h +41 -18
  156. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +1 -5
  157. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +169 -0
  158. package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +400 -0
  159. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -0
  160. package/deps/rocksdb/rocksdb/options/cf_options.cc +137 -82
  161. package/deps/rocksdb/rocksdb/options/cf_options.h +18 -6
  162. package/deps/rocksdb/rocksdb/options/configurable.cc +31 -17
  163. package/deps/rocksdb/rocksdb/options/configurable_helper.h +7 -6
  164. package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -8
  165. package/deps/rocksdb/rocksdb/options/options_parser.cc +74 -54
  166. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +89 -0
  167. package/deps/rocksdb/rocksdb/options/options_test.cc +112 -26
  168. package/deps/rocksdb/rocksdb/port/port.h +5 -9
  169. package/deps/rocksdb/rocksdb/src.mk +8 -0
  170. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +4 -0
  171. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -7
  172. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -0
  173. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +62 -80
  174. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +13 -3
  175. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +16 -5
  176. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +38 -7
  177. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -4
  178. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -1
  179. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +4 -1
  180. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +204 -1
  181. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -3
  182. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
  183. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +4 -0
  184. package/deps/rocksdb/rocksdb/table/format.cc +3 -3
  185. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +4 -1
  186. package/deps/rocksdb/rocksdb/table/mock_table.cc +0 -50
  187. package/deps/rocksdb/rocksdb/table/mock_table.h +53 -0
  188. package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +4 -0
  189. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  190. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -5
  191. package/deps/rocksdb/rocksdb/table/table_builder.h +3 -1
  192. package/deps/rocksdb/rocksdb/table/table_properties.cc +181 -0
  193. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +5 -5
  194. package/deps/rocksdb/rocksdb/table/table_test.cc +71 -64
  195. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +45 -45
  196. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +35 -35
  197. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +43 -43
  198. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +41 -4
  199. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +1 -0
  200. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +1 -1
  201. package/deps/rocksdb/rocksdb/unreleased_history/add.sh +13 -0
  202. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +24 -5
  203. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +7 -0
  204. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +0 -52
  205. package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +1 -10
  206. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +92 -0
  207. package/deps/rocksdb/rocksdb/util/thread_operation.h +1 -0
  208. package/deps/rocksdb/rocksdb/util/udt_util.cc +50 -4
  209. package/deps/rocksdb/rocksdb/util/udt_util.h +24 -11
  210. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +26 -13
  211. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +1 -16
  212. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +2 -0
  213. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +214 -0
  214. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.h +60 -0
  215. package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +124 -0
  216. package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +441 -0
  217. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +34 -3
  218. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +7 -2
  219. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +437 -0
  220. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +34 -11
  221. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +14 -7
  222. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +7 -1
  223. package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +17 -0
  224. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +69 -0
  225. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +20 -0
  226. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1290 -0
  227. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +324 -0
  228. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -1
  229. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +8 -1
  230. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +57 -12
  231. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +32 -3
  232. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +33 -2
  233. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +721 -9
  234. package/deps/rocksdb/rocksdb.gyp +2 -0
  235. package/package.json +1 -1
  236. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  237. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -51,6 +51,7 @@
51
51
  #include "env/io_posix.h"
52
52
  #include "monitoring/iostats_context_imp.h"
53
53
  #include "monitoring/thread_status_updater.h"
54
+ #include "options/db_options.h"
54
55
  #include "port/lang.h"
55
56
  #include "port/port.h"
56
57
  #include "rocksdb/options.h"
@@ -930,6 +931,28 @@ class PosixFileSystem : public FileSystem {
930
931
  optimized.fallocate_with_keep_size = true;
931
932
  return optimized;
932
933
  }
934
+
935
+ FileOptions OptimizeForCompactionTableRead(
936
+ const FileOptions& file_options,
937
+ const ImmutableDBOptions& db_options) const override {
938
+ FileOptions fo = FileOptions(file_options);
939
+ #ifdef OS_LINUX
940
+ // To fix https://github.com/facebook/rocksdb/issues/12038
941
+ if (!file_options.use_direct_reads &&
942
+ file_options.compaction_readahead_size > 0) {
943
+ size_t system_limit =
944
+ GetCompactionReadaheadSizeSystemLimit(db_options.db_paths);
945
+ if (system_limit > 0 &&
946
+ file_options.compaction_readahead_size > system_limit) {
947
+ fo.compaction_readahead_size = system_limit;
948
+ }
949
+ }
950
+ #else
951
+ (void)db_options;
952
+ #endif
953
+ return fo;
954
+ }
955
+
933
956
  #ifdef OS_LINUX
934
957
  Status RegisterDbPaths(const std::vector<std::string>& paths) override {
935
958
  return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths);
@@ -942,6 +965,36 @@ class PosixFileSystem : public FileSystem {
942
965
  private:
943
966
  bool forceMmapOff_ = false; // do we override Env options?
944
967
 
968
+ #ifdef OS_LINUX
969
+ // Get the minimum "linux system limit" (i.e., the largest I/O size that the OS
970
+ // can issue to block devices under a directory, also known as
971
+ // "max_sectors_kb") among `db_paths`.
972
+ // Return 0 if no limit can be found or there is an error in
973
+ // retrieving such limit.
974
+ static size_t GetCompactionReadaheadSizeSystemLimit(
975
+ const std::vector<DbPath>& db_paths) {
976
+ Status s;
977
+ size_t limit_kb = 0;
978
+
979
+ for (const auto& db_path : db_paths) {
980
+ size_t dir_max_sectors_kb = 0;
981
+ s = PosixHelper::GetMaxSectorsKBOfDirectory(db_path.path,
982
+ &dir_max_sectors_kb);
983
+ if (!s.ok()) {
984
+ break;
985
+ }
986
+
987
+ limit_kb = (limit_kb == 0) ? dir_max_sectors_kb
988
+ : std::min(limit_kb, dir_max_sectors_kb);
989
+ }
990
+
991
+ if (s.ok()) {
992
+ return limit_kb * 1024;
993
+ } else {
994
+ return 0;
995
+ }
996
+ }
997
+ #endif
945
998
  // Returns true iff the named directory exists and is a directory.
946
999
  virtual bool DirExists(const std::string& dname) {
947
1000
  struct stat statbuf;
@@ -28,7 +28,7 @@
28
28
  #include <cstdio>
29
29
  #include <cstdlib>
30
30
  #include <cstring>
31
- #ifdef OS_LINUX
31
+ #if defined(OS_LINUX) || defined(OS_ANDROID)
32
32
  #include <sys/statfs.h>
33
33
  #include <sys/sysmacros.h>
34
34
  #endif
@@ -455,38 +455,71 @@ size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
455
455
 
456
456
  Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
457
457
  size_t* size) {
458
+ return GetQueueSysfsFileValueofDirectory(directory,
459
+ GetLogicalBlockSizeFileName(), size);
460
+ }
461
+
462
+ Status PosixHelper::GetMaxSectorsKBOfDirectory(const std::string& directory,
463
+ size_t* kb) {
464
+ return GetQueueSysfsFileValueofDirectory(directory, GetMaxSectorsKBFileName(),
465
+ kb);
466
+ }
467
+
468
+ Status PosixHelper::GetQueueSysfsFileValueofDirectory(
469
+ const std::string& directory, const std::string& file_name, size_t* value) {
458
470
  int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
459
471
  if (fd == -1) {
460
472
  return Status::IOError("Cannot open directory " + directory);
461
473
  }
462
- *size = PosixHelper::GetLogicalBlockSizeOfFd(fd);
474
+ if (file_name == PosixHelper::GetLogicalBlockSizeFileName()) {
475
+ *value = PosixHelper::GetLogicalBlockSizeOfFd(fd);
476
+ } else if (file_name == PosixHelper::GetMaxSectorsKBFileName()) {
477
+ *value = PosixHelper::GetMaxSectorsKBOfFd(fd);
478
+ } else {
479
+ assert(false);
480
+ }
463
481
  close(fd);
464
482
  return Status::OK();
465
483
  }
466
484
 
467
485
  size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
486
+ return GetQueueSysfsFileValueOfFd(fd, GetLogicalBlockSizeFileName(),
487
+ kDefaultPageSize);
488
+ }
489
+
490
+ size_t PosixHelper::GetMaxSectorsKBOfFd(int fd) {
491
+ return GetQueueSysfsFileValueOfFd(fd, GetMaxSectorsKBFileName(),
492
+ kDefaultMaxSectorsKB);
493
+ }
494
+
495
+ size_t PosixHelper::GetQueueSysfsFileValueOfFd(
496
+ int fd, const std::string& file_name, const size_t default_return_value) {
468
497
  #ifdef OS_LINUX
469
498
  struct stat buf;
470
499
  int result = fstat(fd, &buf);
471
500
  if (result == -1) {
472
- return kDefaultPageSize;
501
+ return default_return_value;
473
502
  }
503
+
504
+ // Get device number
474
505
  if (major(buf.st_dev) == 0) {
475
506
  // Unnamed devices (e.g. non-device mounts), reserved as null device number.
476
507
  // These don't have an entry in /sys/dev/block/. Return a sensible default.
477
- return kDefaultPageSize;
508
+ return default_return_value;
478
509
  }
479
510
 
480
- // Reading queue/logical_block_size does not require special permissions.
511
+ // Get device path
481
512
  const int kBufferSize = 100;
482
513
  char path[kBufferSize];
483
514
  char real_path[PATH_MAX + 1];
484
515
  snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
485
516
  minor(buf.st_dev));
486
517
  if (realpath(path, real_path) == nullptr) {
487
- return kDefaultPageSize;
518
+ return default_return_value;
488
519
  }
489
520
  std::string device_dir(real_path);
521
+
522
+ // Get the queue sysfs file path
490
523
  if (!device_dir.empty() && device_dir.back() == '/') {
491
524
  device_dir.pop_back();
492
525
  }
@@ -500,11 +533,11 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
500
533
  // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
501
534
  size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
502
535
  if (parent_end == std::string::npos) {
503
- return kDefaultPageSize;
536
+ return default_return_value;
504
537
  }
505
538
  size_t parent_begin = device_dir.rfind('/', parent_end - 1);
506
539
  if (parent_begin == std::string::npos) {
507
- return kDefaultPageSize;
540
+ return default_return_value;
508
541
  }
509
542
  std::string parent =
510
543
  device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
@@ -513,25 +546,37 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
513
546
  (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
514
547
  device_dir = device_dir.substr(0, parent_end);
515
548
  }
516
- std::string fname = device_dir + "/queue/logical_block_size";
549
+ std::string fname = device_dir + "/queue/" + file_name;
550
+
551
+ // Get value in the queue sysfs file
517
552
  FILE* fp;
518
- size_t size = 0;
553
+ size_t value = 0;
519
554
  fp = fopen(fname.c_str(), "r");
520
555
  if (fp != nullptr) {
521
556
  char* line = nullptr;
522
557
  size_t len = 0;
523
558
  if (getline(&line, &len, fp) != -1) {
524
- sscanf(line, "%zu", &size);
559
+ sscanf(line, "%zu", &value);
525
560
  }
526
561
  free(line);
527
562
  fclose(fp);
528
563
  }
529
- if (size != 0 && (size & (size - 1)) == 0) {
530
- return size;
564
+
565
+ if (file_name == GetLogicalBlockSizeFileName()) {
566
+ if (value != 0 && (value & (value - 1)) == 0) {
567
+ return value;
568
+ }
569
+ } else if (file_name == GetMaxSectorsKBFileName()) {
570
+ if (value != 0) {
571
+ return value;
572
+ }
573
+ } else {
574
+ assert(false);
531
575
  }
532
576
  #endif
533
577
  (void)fd;
534
- return kDefaultPageSize;
578
+ (void)file_name;
579
+ return default_return_value;
535
580
  }
536
581
 
537
582
  /*
@@ -1376,9 +1421,10 @@ IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
1376
1421
  // After ftruncate, we check whether ftruncate has the correct behavior.
1377
1422
  // If not, we should hack it with FALLOC_FL_PUNCH_HOLE
1378
1423
  if (result == 0 &&
1379
- (file_stats.st_size + file_stats.st_blksize - 1) /
1380
- file_stats.st_blksize !=
1381
- file_stats.st_blocks / (file_stats.st_blksize / 512)) {
1424
+ static_cast<size_t>((file_stats.st_size + file_stats.st_blksize - 1) /
1425
+ file_stats.st_blksize) !=
1426
+ static_cast<size_t>(file_stats.st_blocks /
1427
+ (file_stats.st_blksize / 512))) {
1382
1428
  IOSTATS_TIMER_GUARD(allocate_nanos);
1383
1429
  if (allow_fallocate_) {
1384
1430
  fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
@@ -30,7 +30,7 @@
30
30
  // For non linux platform, the following macros are used only as place
31
31
  // holder.
32
32
  #if !(defined OS_LINUX) && !(defined OS_FREEBSD) && !(defined CYGWIN) && \
33
- !(defined OS_AIX)
33
+ !(defined OS_AIX) && !(defined OS_ANDROID)
34
34
  #define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
35
35
  #define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
36
36
  #define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
@@ -53,10 +53,39 @@ IOStatus IOError(const std::string& context, const std::string& file_name,
53
53
 
54
54
  class PosixHelper {
55
55
  public:
56
+ static const std::string& GetLogicalBlockSizeFileName() {
57
+ static const std::string kLogicalBlockSizeFileName = "logical_block_size";
58
+ return kLogicalBlockSizeFileName;
59
+ }
60
+ static const std::string& GetMaxSectorsKBFileName() {
61
+ static const std::string kMaxSectorsKBFileName = "max_sectors_kb";
62
+ return kMaxSectorsKBFileName;
63
+ }
56
64
  static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
57
65
  static size_t GetLogicalBlockSizeOfFd(int fd);
58
66
  static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
59
67
  size_t* size);
68
+
69
+ static Status GetMaxSectorsKBOfDirectory(const std::string& directory,
70
+ size_t* kb);
71
+
72
+ private:
73
+ static const size_t kDefaultMaxSectorsKB = 2 * 1024;
74
+
75
+ static size_t GetMaxSectorsKBOfFd(int fd);
76
+
77
+ // Return the value in the specified `file_name` under
78
+ // `/sys/block/xxx/queue/` for the device where the file of `fd` is on.
79
+ // If not found, then return the specified `default_return_value`
80
+ static size_t GetQueueSysfsFileValueOfFd(int fd, const std::string& file_name,
81
+ size_t default_return_value);
82
+
83
+ // Return the value in the specified `file_name` under
84
+ // `/sys/block/xxx/queue/` for the device where `directory` is on.
85
+ // On success `*value` holds that value, or the per-file default if it is not found; returns an IOError if `directory` cannot be opened.
86
+ static Status GetQueueSysfsFileValueofDirectory(const std::string& directory,
87
+ const std::string& file_name,
88
+ size_t* value);
60
89
  };
61
90
 
62
91
  /*
@@ -22,11 +22,9 @@
22
22
 
23
23
  namespace ROCKSDB_NAMESPACE {
24
24
 
25
- void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
26
- uint64_t offset,
27
- size_t roundup_len,
28
- bool refit_tail,
29
- uint64_t& aligned_useful_len) {
25
+ void FilePrefetchBuffer::PrepareBufferForRead(
26
+ BufferInfo* buf, size_t alignment, uint64_t offset, size_t roundup_len,
27
+ bool refit_tail, bool use_fs_buffer, uint64_t& aligned_useful_len) {
30
28
  uint64_t aligned_useful_offset_in_buf = 0;
31
29
  bool copy_data_to_new_buffer = false;
32
30
  // Check if requested bytes are in the existing buffer_.
@@ -39,6 +37,9 @@ void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
39
37
  // new buffer is created.
40
38
  aligned_useful_offset_in_buf =
41
39
  Rounddown(static_cast<size_t>(offset - buf->offset_), alignment);
40
+ // aligned_useful_len is passed by reference and used to calculate how much
41
+ // data needs to be read, so it is needed regardless of whether
42
+ // use_fs_buffer is true
42
43
  aligned_useful_len = static_cast<uint64_t>(buf->CurrentSize()) -
43
44
  aligned_useful_offset_in_buf;
44
45
  assert(aligned_useful_offset_in_buf % alignment == 0);
@@ -53,6 +54,16 @@ void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
53
54
  }
54
55
  }
55
56
 
57
+ // The later buffer allocation / tail refitting does not apply when
58
+ // use_fs_buffer is true. If we allocate a new buffer, we end up throwing it
59
+ // away later when we reuse the file system allocated buffer. If we refit
60
+ // the tail in the main buffer, we don't have a place to put the next chunk of
61
+ // data provided by the file system (without performing another copy, which we
62
+ // are trying to avoid in the first place)
63
+ if (use_fs_buffer) {
64
+ return;
65
+ }
66
+
56
67
  // Create a new buffer only if current capacity is not sufficient, and memcopy
57
68
  // bytes from old buffer if needed (i.e., if aligned_useful_len is greater
58
69
  // than 0).
@@ -62,8 +73,8 @@ void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
62
73
  static_cast<size_t>(roundup_len), copy_data_to_new_buffer,
63
74
  aligned_useful_offset_in_buf, static_cast<size_t>(aligned_useful_len));
64
75
  } else if (aligned_useful_len > 0 && refit_tail) {
65
- // New buffer not needed. But memmove bytes from tail to the beginning since
66
- // aligned_useful_len is greater than 0.
76
+ // New buffer not needed. But memmove bytes from tail to the beginning
77
+ // since aligned_useful_len is greater than 0.
67
78
  buf->buffer_.RefitTail(static_cast<size_t>(aligned_useful_offset_in_buf),
68
79
  static_cast<size_t>(aligned_useful_len));
69
80
  } else if (aligned_useful_len > 0) {
@@ -82,11 +93,19 @@ void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
82
93
  Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
83
94
  RandomAccessFileReader* reader,
84
95
  uint64_t read_len, uint64_t aligned_useful_len,
85
- uint64_t start_offset) {
96
+ uint64_t start_offset, bool use_fs_buffer) {
86
97
  Slice result;
87
- char* to_buf = buf->buffer_.BufferStart() + aligned_useful_len;
88
- Status s = reader->Read(opts, start_offset + aligned_useful_len, read_len,
89
- &result, to_buf, /*aligned_buf=*/nullptr);
98
+ Status s;
99
+ char* to_buf = nullptr;
100
+ if (use_fs_buffer) {
101
+ s = FSBufferDirectRead(reader, buf, opts, start_offset + aligned_useful_len,
102
+ read_len, result);
103
+ } else {
104
+ to_buf = buf->buffer_.BufferStart() + aligned_useful_len;
105
+ s = reader->Read(opts, start_offset + aligned_useful_len, read_len, &result,
106
+ to_buf, /*aligned_buf=*/nullptr);
107
+ }
108
+
90
109
  #ifndef NDEBUG
91
110
  if (result.size() < read_len) {
92
111
  // Fake an IO error to force db_stress fault injection to ignore
@@ -97,7 +116,7 @@ Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
97
116
  if (!s.ok()) {
98
117
  return s;
99
118
  }
100
- if (result.data() != to_buf) {
119
+ if (!use_fs_buffer && result.data() != to_buf) {
101
120
  // If the read is coming from some other buffer already in memory (such as
102
121
  // mmap) then it would be inefficient to create another copy in this
103
122
  // FilePrefetchBuffer. The caller is expected to exclude this case.
@@ -108,8 +127,11 @@ Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
108
127
  if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
109
128
  RecordTick(stats_, PREFETCH_BYTES, read_len);
110
129
  }
111
- // Update the buffer size.
112
- buf->buffer_.Size(static_cast<size_t>(aligned_useful_len) + result.size());
130
+ if (!use_fs_buffer) {
131
+ // Update the buffer size.
132
+ // (Skipped when reusing the FS buffer: its size was already set explicitly.)
133
+ buf->buffer_.Size(static_cast<size_t>(aligned_useful_len) + result.size());
134
+ }
113
135
  return s;
114
136
  }
115
137
 
@@ -157,33 +179,42 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts,
157
179
  return Status::OK();
158
180
  }
159
181
 
160
- size_t alignment = reader->file()->GetRequiredBufferAlignment();
182
+ size_t alignment = GetRequiredBufferAlignment(reader);
161
183
  uint64_t rounddown_offset = offset, roundup_end = 0, aligned_useful_len = 0;
162
184
  size_t read_len = 0;
185
+ // TODO: Enable file system buffer reuse optimization. Need to incorporate
186
+ // overlap buffer logic here (similar to what is done in PrefetchInternal).
187
+ // Currently, if we attempt to use the optimization, it results in an
188
+ // unsigned integer overflow because the returned buffer's offset ends up
189
+ // higher than the requested offset.
190
+ bool use_fs_buffer = false;
163
191
 
164
192
  ReadAheadSizeTuning(buf, /*read_curr_block=*/true,
165
- /*refit_tail=*/true, rounddown_offset, alignment, 0, n,
166
- rounddown_offset, roundup_end, read_len,
193
+ /*refit_tail=*/true, use_fs_buffer, rounddown_offset,
194
+ alignment, 0, n, rounddown_offset, roundup_end, read_len,
167
195
  aligned_useful_len);
168
196
 
169
197
  Status s;
170
198
  if (read_len > 0) {
171
- s = Read(buf, opts, reader, read_len, aligned_useful_len, rounddown_offset);
199
+ s = Read(buf, opts, reader, read_len, aligned_useful_len, rounddown_offset,
200
+ use_fs_buffer);
172
201
  }
173
202
 
174
203
  if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) {
175
204
  RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len);
176
205
  }
206
+ assert(buf->offset_ <= offset);
177
207
  return s;
178
208
  }
179
209
 
180
210
  // Copy data from src to overlap_buf_.
181
- void FilePrefetchBuffer::CopyDataToBuffer(BufferInfo* src, uint64_t& offset,
182
- size_t& length) {
211
+ void FilePrefetchBuffer::CopyDataToOverlapBuffer(BufferInfo* src,
212
+ uint64_t& offset,
213
+ size_t& length) {
183
214
  if (length == 0) {
184
215
  return;
185
216
  }
186
-
217
+ assert(src->IsOffsetInBuffer(offset));
187
218
  uint64_t copy_offset = (offset - src->offset_);
188
219
  size_t copy_len = 0;
189
220
  if (src->IsDataBlockInBuffer(offset, length)) {
@@ -194,10 +225,8 @@ void FilePrefetchBuffer::CopyDataToBuffer(BufferInfo* src, uint64_t& offset,
194
225
  }
195
226
 
196
227
  BufferInfo* dst = overlap_buf_;
197
- memcpy(dst->buffer_.BufferStart() + dst->CurrentSize(),
198
- src->buffer_.BufferStart() + copy_offset, copy_len);
199
-
200
- dst->buffer_.Size(dst->CurrentSize() + copy_len);
228
+ assert(copy_len <= dst->buffer_.Capacity() - dst->buffer_.CurrentSize());
229
+ dst->buffer_.Append(src->buffer_.BufferStart() + copy_offset, copy_len);
201
230
 
202
231
  // Update offset and length.
203
232
  offset += copy_len;
@@ -208,6 +237,7 @@ void FilePrefetchBuffer::CopyDataToBuffer(BufferInfo* src, uint64_t& offset,
208
237
  if (length > 0) {
209
238
  FreeFrontBuffer();
210
239
  }
240
+ TEST_SYNC_POINT("FilePrefetchBuffer::CopyDataToOverlapBuffer:Complete");
211
241
  }
212
242
 
213
243
  // Clear the buffers if it contains outdated data. Outdated data can be because
@@ -355,7 +385,7 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
355
385
  // of ReadAsync to make sure it doesn't read anything from
356
386
  // previous buffer which is already prefetched.
357
387
  void FilePrefetchBuffer::ReadAheadSizeTuning(
358
- BufferInfo* buf, bool read_curr_block, bool refit_tail,
388
+ BufferInfo* buf, bool read_curr_block, bool refit_tail, bool use_fs_buffer,
359
389
  uint64_t prev_buf_end_offset, size_t alignment, size_t length,
360
390
  size_t readahead_size, uint64_t& start_offset, uint64_t& end_offset,
361
391
  size_t& read_len, uint64_t& aligned_useful_len) {
@@ -408,7 +438,7 @@ void FilePrefetchBuffer::ReadAheadSizeTuning(
408
438
  uint64_t roundup_len = end_offset - start_offset;
409
439
 
410
440
  PrepareBufferForRead(buf, alignment, start_offset, roundup_len, refit_tail,
411
- aligned_useful_len);
441
+ use_fs_buffer, aligned_useful_len);
412
442
  assert(roundup_len >= aligned_useful_len);
413
443
 
414
444
  // Update the buffer offset.
@@ -422,11 +452,43 @@ void FilePrefetchBuffer::ReadAheadSizeTuning(
422
452
  (end_offset - start_offset));
423
453
  }
424
454
 
455
+ // This is for when num_buffers_ = 1.
456
+ // If we are reusing the file system allocated buffer, and only some of the
457
+ // requested data is in the buffer, we copy the relevant data to overlap_buf_.
458
+ void FilePrefetchBuffer::HandleOverlappingSyncData(uint64_t offset,
459
+ size_t length,
460
+ uint64_t& tmp_offset,
461
+ size_t& tmp_length,
462
+ bool& use_overlap_buffer) {
463
+ if (IsBufferQueueEmpty()) {
464
+ return;
465
+ }
466
+ BufferInfo* buf = GetFirstBuffer();
467
+ // We should only be calling this when num_buffers_ = 1, so there should
468
+ // not be any async reads.
469
+ assert(!buf->async_read_in_progress_);
470
+
471
+ if (!buf->async_read_in_progress_ && buf->DoesBufferContainData() &&
472
+ buf->IsOffsetInBuffer(offset) &&
473
+ buf->offset_ + buf->CurrentSize() < offset + length) {
474
+ // Allocated overlap_buf_ is just enough to hold the result for the user.
475
+ // Alignment does not matter here.
476
+ use_overlap_buffer = true;
477
+ overlap_buf_->ClearBuffer();
478
+ overlap_buf_->buffer_.Alignment(1);
479
+ overlap_buf_->buffer_.AllocateNewBuffer(length);
480
+ overlap_buf_->offset_ = offset;
481
+ CopyDataToOverlapBuffer(buf, tmp_offset, tmp_length);
482
+ UpdateStats(/*found_in_buffer=*/false, overlap_buf_->CurrentSize());
483
+ }
484
+ }
485
+
486
+ // This is for when num_buffers_ > 1.
425
487
  // If data is overlapping between two buffers then during this call:
426
488
  // - data from first buffer is copied into overlapping buffer,
427
489
  // - first is removed from bufs_ and freed so that it can be used for async
428
490
  // prefetching of further data.
429
- Status FilePrefetchBuffer::HandleOverlappingData(
491
+ Status FilePrefetchBuffer::HandleOverlappingAsyncData(
430
492
  const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
431
493
  size_t length, size_t readahead_size, bool& copy_to_overlap_buffer,
432
494
  uint64_t& tmp_offset, size_t& tmp_length) {
@@ -436,7 +498,7 @@ Status FilePrefetchBuffer::HandleOverlappingData(
436
498
  }
437
499
 
438
500
  Status s;
439
- size_t alignment = reader->file()->GetRequiredBufferAlignment();
501
+ size_t alignment = GetRequiredBufferAlignment(reader);
440
502
 
441
503
  BufferInfo* buf = GetFirstBuffer();
442
504
 
@@ -470,7 +532,7 @@ Status FilePrefetchBuffer::HandleOverlappingData(
470
532
  overlap_buf_->offset_ = offset;
471
533
  copy_to_overlap_buffer = true;
472
534
 
473
- CopyDataToBuffer(buf, tmp_offset, tmp_length);
535
+ CopyDataToOverlapBuffer(buf, tmp_offset, tmp_length);
474
536
  UpdateStats(/*found_in_buffer=*/false, overlap_buf_->CurrentSize());
475
537
 
476
538
  // Call async prefetching on freed buffer since data has been consumed
@@ -495,8 +557,8 @@ Status FilePrefetchBuffer::HandleOverlappingData(
495
557
  uint64_t end_offset = start_offset, aligned_useful_len = 0;
496
558
 
497
559
  ReadAheadSizeTuning(new_buf, /*read_curr_block=*/false,
498
- /*refit_tail=*/false, next_buf->offset_ + second_size,
499
- alignment,
560
+ /*refit_tail=*/false, /*use_fs_buffer=*/false,
561
+ next_buf->offset_ + second_size, alignment,
500
562
  /*length=*/0, readahead_size, start_offset,
501
563
  end_offset, read_len, aligned_useful_len);
502
564
  if (read_len > 0) {
@@ -537,7 +599,7 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
537
599
 
538
600
  TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start");
539
601
 
540
- size_t alignment = reader->file()->GetRequiredBufferAlignment();
602
+ size_t alignment = GetRequiredBufferAlignment(reader);
541
603
  Status s;
542
604
  uint64_t tmp_offset = offset;
543
605
  size_t tmp_length = length;
@@ -550,12 +612,20 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
550
612
  }
551
613
  ClearOutdatedData(offset, length);
552
614
 
553
- // Handle overlapping data over two buffers.
554
- s = HandleOverlappingData(opts, reader, offset, length, readahead_size,
555
- copy_to_overlap_buffer, tmp_offset, tmp_length);
615
+ // Handle overlapping data over two buffers (async prefetching case).
616
+ s = HandleOverlappingAsyncData(opts, reader, offset, length, readahead_size,
617
+ copy_to_overlap_buffer, tmp_offset,
618
+ tmp_length);
556
619
  if (!s.ok()) {
557
620
  return s;
558
621
  }
622
+ // Handle partially available data when reusing the file system buffer
623
+ // and num_buffers_ = 1 (sync prefetching case)
624
+ bool use_fs_buffer = UseFSBuffer(reader);
625
+ if (!copy_to_overlap_buffer && use_fs_buffer) {
626
+ HandleOverlappingSyncData(offset, length, tmp_offset, tmp_length,
627
+ copy_to_overlap_buffer);
628
+ }
559
629
 
560
630
  AllocateBufferIfEmpty();
561
631
  BufferInfo* buf = GetFirstBuffer();
@@ -586,8 +656,18 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
586
656
  if (copy_to_overlap_buffer) {
587
657
  // Data is overlapping i.e. some of the data has been copied to overlap
588
658
  // buffer and remaining will be updated below.
659
+ // Note: why do we not end up performing a duplicate copy when we already
660
+ // copy to the overlap buffer in HandleOverlappingAsyncData /
661
+ // HandleOverlappingSyncData? The reason is that when we call
662
+ // CopyDataToOverlapBuffer, if the buffer is only a "partial hit", then we
663
+ // clear it out since it does not have any more useful data once we copy
664
+ // to the overlap buffer. Once we reallocate a fresh buffer, that buffer
665
+ // will have no data, and it will be the "first" buffer when num_buffers_
666
+ // = 1. When num_buffers_ > 1, we call ClearOutdatedData() so we know
667
+ // that, if we get to this point in the control flow, the "front" buffer
668
+ // has to have the data we need.
589
669
  size_t initial_buf_size = overlap_buf_->CurrentSize();
590
- CopyDataToBuffer(buf, offset, length);
670
+ CopyDataToOverlapBuffer(buf, offset, length);
591
671
  UpdateStats(
592
672
  /*found_in_buffer=*/false,
593
673
  overlap_buf_->CurrentSize() - initial_buf_size);
@@ -636,10 +716,10 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
636
716
  UpdateStats(/*found_in_buffer=*/false,
637
717
  (buf->offset_ + buf->CurrentSize() - offset));
638
718
  }
639
- ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail*/
640
- true, start_offset1, alignment, length, readahead_size,
641
- start_offset1, end_offset1, read_len1,
642
- aligned_useful_len1);
719
+ ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail=*/
720
+ true, /*use_fs_buffer=*/use_fs_buffer, start_offset1,
721
+ alignment, length, readahead_size, start_offset1,
722
+ end_offset1, read_len1, aligned_useful_len1);
643
723
  } else {
644
724
  UpdateStats(/*found_in_buffer=*/true, original_length);
645
725
  }
@@ -654,7 +734,8 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
654
734
  }
655
735
 
656
736
  if (read_len1 > 0) {
657
- s = Read(buf, opts, reader, read_len1, aligned_useful_len1, start_offset1);
737
+ s = Read(buf, opts, reader, read_len1, aligned_useful_len1, start_offset1,
738
+ use_fs_buffer);
658
739
  if (!s.ok()) {
659
740
  AbortAllIOs();
660
741
  FreeAllBuffers();
@@ -662,10 +743,10 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
662
743
  }
663
744
  }
664
745
 
665
- // Copy remaining requested bytes to overlap_buffer. No need to update stats
666
- // as data is prefetched during this call.
746
+ // Copy remaining requested bytes to overlap_buf_. No need to
747
+ // update stats as data is prefetched during this call.
667
748
  if (copy_to_overlap_buffer && length > 0) {
668
- CopyDataToBuffer(buf, offset, length);
749
+ CopyDataToOverlapBuffer(buf, offset, length);
669
750
  }
670
751
  return s;
671
752
  }
@@ -782,6 +863,7 @@ bool FilePrefetchBuffer::TryReadFromCacheUntracked(
782
863
  if (copy_to_overlap_buffer) {
783
864
  buf = overlap_buf_;
784
865
  }
866
+ assert(buf->offset_ <= offset);
785
867
  uint64_t offset_in_buffer = offset - buf->offset_;
786
868
  *result = Slice(buf->buffer_.BufferStart() + offset_in_buffer, n);
787
869
  if (prefetched) {
@@ -892,7 +974,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
892
974
  std::string msg;
893
975
 
894
976
  Status s;
895
- size_t alignment = reader->file()->GetRequiredBufferAlignment();
977
+ size_t alignment = GetRequiredBufferAlignment(reader);
896
978
  size_t readahead_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0;
897
979
  size_t offset_to_read = static_cast<size_t>(offset);
898
980
  uint64_t start_offset1 = offset, end_offset1 = 0, aligned_useful_len1 = 0;
@@ -915,6 +997,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
915
997
  // Prefetch full data + readahead_size in the first buffer.
916
998
  if (is_eligible_for_prefetching || reader->use_direct_io()) {
917
999
  ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail=*/false,
1000
+ /*use_fs_buffer=*/false,
918
1001
  /*prev_buf_end_offset=*/start_offset1, alignment, n,
919
1002
  readahead_size, start_offset1, end_offset1, read_len1,
920
1003
  aligned_useful_len1);
@@ -923,7 +1006,8 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
923
1006
  start_offset1 = offset_to_read;
924
1007
  end_offset1 = offset_to_read + n;
925
1008
  roundup_len1 = end_offset1 - start_offset1;
926
- PrepareBufferForRead(buf, alignment, start_offset1, roundup_len1, false,
1009
+ PrepareBufferForRead(buf, alignment, start_offset1, roundup_len1,
1010
+ /*refit_tail=*/false, /*use_fs_buffer=*/false,
927
1011
  aligned_useful_len1);
928
1012
  assert(aligned_useful_len1 == 0);
929
1013
  assert(roundup_len1 >= aligned_useful_len1);
@@ -970,7 +1054,7 @@ Status FilePrefetchBuffer::PrefetchRemBuffers(const IOOptions& opts,
970
1054
  uint64_t end_offset2 = start_offset2, aligned_useful_len2 = 0;
971
1055
  size_t read_len2 = 0;
972
1056
  ReadAheadSizeTuning(new_buf, /*read_curr_block=*/false,
973
- /*refit_tail=*/false,
1057
+ /*refit_tail=*/false, /*use_fs_buffer=*/false,
974
1058
  /*prev_buf_end_offset=*/end_offset1, alignment,
975
1059
  /*length=*/0, readahead_size, start_offset2,
976
1060
  end_offset2, read_len2, aligned_useful_len2);