@nxtedition/rocksdb 7.1.14 → 7.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. package/binding.cc +1 -0
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +72 -18
  3. package/deps/rocksdb/rocksdb/Makefile +91 -11
  4. package/deps/rocksdb/rocksdb/TARGETS +8 -4
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +5 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +13 -8
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
  8. package/deps/rocksdb/rocksdb/cache/cache_test.cc +116 -57
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +958 -459
  10. package/deps/rocksdb/rocksdb/cache/clock_cache.h +407 -622
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +104 -40
  12. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +23 -8
  13. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +350 -184
  14. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +12 -2
  15. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +2 -0
  16. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +130 -43
  17. package/deps/rocksdb/rocksdb/cache/lru_cache.h +24 -2
  18. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +423 -98
  19. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +19 -2
  20. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +10 -7
  21. package/deps/rocksdb/rocksdb/crash_test.mk +2 -2
  22. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +46 -26
  23. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +9 -3
  24. package/deps/rocksdb/rocksdb/db/blob/blob_contents.cc +90 -0
  25. package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +56 -0
  26. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -10
  27. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +64 -59
  28. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +11 -8
  29. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +92 -62
  30. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +159 -136
  31. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +13 -13
  32. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +129 -57
  33. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +81 -3
  34. package/deps/rocksdb/rocksdb/db/c.cc +29 -0
  35. package/deps/rocksdb/rocksdb/db/column_family.cc +10 -1
  36. package/deps/rocksdb/rocksdb/db/column_family_test.cc +21 -0
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +42 -36
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +344 -102
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +163 -28
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +52 -17
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +35 -30
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -3
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +167 -11
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +8 -8
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +10 -13
  46. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +0 -117
  47. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +6 -49
  48. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +29 -4
  49. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +18 -11
  50. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +4 -10
  51. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +1 -1
  52. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +12 -0
  53. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +144 -93
  54. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +28 -32
  55. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +1 -1
  56. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -9
  57. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +2 -33
  58. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -5
  59. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +11 -0
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -2
  61. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +8 -0
  62. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +2 -1
  63. package/deps/rocksdb/rocksdb/db/db_iter.cc +76 -138
  64. package/deps/rocksdb/rocksdb/db/db_iter.h +26 -23
  65. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +1 -1
  66. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +931 -0
  67. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +2 -2
  68. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -0
  69. package/deps/rocksdb/rocksdb/db/db_test2.cc +44 -22
  70. package/deps/rocksdb/rocksdb/db/db_test_util.cc +6 -14
  71. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +155 -0
  72. package/deps/rocksdb/rocksdb/db/db_write_test.cc +45 -0
  73. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -1
  74. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +8 -0
  75. package/deps/rocksdb/rocksdb/db/experimental.cc +5 -1
  76. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +24 -12
  77. package/deps/rocksdb/rocksdb/db/internal_stats.cc +7 -1
  78. package/deps/rocksdb/rocksdb/db/internal_stats.h +3 -0
  79. package/deps/rocksdb/rocksdb/db/memtable.cc +79 -18
  80. package/deps/rocksdb/rocksdb/db/memtable.h +5 -0
  81. package/deps/rocksdb/rocksdb/db/memtable_list.cc +26 -4
  82. package/deps/rocksdb/rocksdb/db/memtable_list.h +2 -1
  83. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +113 -0
  84. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +110 -0
  85. package/deps/rocksdb/rocksdb/db/{periodic_work_scheduler_test.cc → periodic_task_scheduler_test.cc} +33 -39
  86. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +12 -20
  87. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +6 -5
  88. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +12 -8
  89. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc +20 -5
  90. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +14 -0
  91. package/deps/rocksdb/rocksdb/db/repair.cc +17 -8
  92. package/deps/rocksdb/rocksdb/db/repair_test.cc +2 -1
  93. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +49 -66
  94. package/deps/rocksdb/rocksdb/db/table_cache.cc +92 -63
  95. package/deps/rocksdb/rocksdb/db/table_cache.h +16 -9
  96. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
  97. package/deps/rocksdb/rocksdb/db/table_properties_collector.cc +2 -2
  98. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -3
  99. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
  100. package/deps/rocksdb/rocksdb/db/version_builder.cc +1 -1
  101. package/deps/rocksdb/rocksdb/db/version_edit.h +1 -2
  102. package/deps/rocksdb/rocksdb/db/version_set.cc +379 -145
  103. package/deps/rocksdb/rocksdb/db/version_set.h +26 -24
  104. package/deps/rocksdb/rocksdb/db/version_set_test.cc +9 -9
  105. package/deps/rocksdb/rocksdb/db/version_util.h +3 -2
  106. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +10 -2
  107. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +2 -0
  108. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +5 -8
  109. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +5 -8
  110. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress.cc +2 -0
  111. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +71 -0
  112. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +14 -0
  113. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +23 -0
  114. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +26 -1
  115. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +105 -34
  116. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +16 -8
  117. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +6 -0
  118. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +4 -8
  119. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -8
  120. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +282 -25
  121. package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
  122. package/deps/rocksdb/rocksdb/env/io_posix.cc +3 -1
  123. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +367 -177
  124. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +144 -56
  125. package/deps/rocksdb/rocksdb/file/filename.cc +3 -3
  126. package/deps/rocksdb/rocksdb/file/filename.h +4 -2
  127. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +415 -0
  128. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +2 -0
  129. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +36 -45
  130. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +21 -3
  131. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +11 -11
  132. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +15 -1
  133. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +163 -68
  134. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +26 -12
  135. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +23 -5
  136. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +21 -17
  137. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +17 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/persistent_cache.h +3 -3
  139. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +17 -6
  140. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +3 -0
  141. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -0
  142. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +3 -3
  143. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/option_change_migration.h +4 -0
  144. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  145. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +3 -0
  146. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -1
  147. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch_base.h +2 -1
  148. package/deps/rocksdb/rocksdb/logging/env_logger.h +2 -2
  149. package/deps/rocksdb/rocksdb/monitoring/histogram.cc +4 -2
  150. package/deps/rocksdb/rocksdb/monitoring/histogram.h +2 -0
  151. package/deps/rocksdb/rocksdb/monitoring/histogram_test.cc +15 -1
  152. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.cc +17 -0
  153. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +14 -3
  154. package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +3 -0
  155. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +50 -0
  156. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +1 -0
  157. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +31 -32
  158. package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -1
  159. package/deps/rocksdb/rocksdb/options/options.cc +2 -2
  160. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -1
  161. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +1 -0
  162. package/deps/rocksdb/rocksdb/src.mk +4 -2
  163. package/deps/rocksdb/rocksdb/table/block_based/block.h +9 -8
  164. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +110 -99
  165. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +12 -10
  166. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +11 -2
  167. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +138 -83
  168. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +25 -24
  169. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +31 -30
  170. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +16 -13
  171. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +4 -4
  172. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -3
  173. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +3 -3
  174. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +17 -19
  175. package/deps/rocksdb/rocksdb/table/block_fetcher.h +1 -1
  176. package/deps/rocksdb/rocksdb/table/format.cc +26 -29
  177. package/deps/rocksdb/rocksdb/table/format.h +44 -26
  178. package/deps/rocksdb/rocksdb/table/get_context.cc +17 -12
  179. package/deps/rocksdb/rocksdb/table/internal_iterator.h +7 -0
  180. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +4 -0
  181. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +950 -104
  182. package/deps/rocksdb/rocksdb/table/merging_iterator.h +28 -1
  183. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +3 -2
  184. package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -1
  185. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.cc +10 -9
  186. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.h +22 -20
  187. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +1 -1
  188. package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +1 -1
  189. package/deps/rocksdb/rocksdb/table/table_builder.h +9 -21
  190. package/deps/rocksdb/rocksdb/table/table_test.cc +12 -12
  191. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +4 -4
  192. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +1 -0
  193. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +116 -34
  194. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +6 -1
  195. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +1 -1
  196. package/deps/rocksdb/rocksdb/util/autovector.h +12 -0
  197. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +3 -2
  198. package/deps/rocksdb/rocksdb/util/stderr_logger.cc +30 -0
  199. package/deps/rocksdb/rocksdb/util/stderr_logger.h +5 -18
  200. package/deps/rocksdb/rocksdb/util/timer.h +2 -3
  201. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +9 -2
  202. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +1 -1
  203. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +1 -1
  204. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +34 -53
  205. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +9 -14
  206. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -4
  207. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +4 -0
  208. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +1 -1
  209. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +4 -3
  210. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +3 -1
  211. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +26 -8
  212. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +114 -16
  213. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc +1 -1
  214. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +59 -0
  215. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +3 -0
  216. package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +39 -0
  217. package/deps/rocksdb/rocksdb.gyp +0 -1
  218. package/index.js +6 -10
  219. package/package.json +1 -1
  220. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  221. package/prebuilds/linux-x64/node.napi.node +0 -0
  222. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +0 -168
  223. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +0 -90
@@ -10,6 +10,7 @@
10
10
  #include "file/file_prefetch_buffer.h"
11
11
 
12
12
  #include <algorithm>
13
+ #include <cassert>
13
14
 
14
15
  #include "file/random_access_file_reader.h"
15
16
  #include "monitoring/histogram.h"
@@ -23,8 +24,8 @@ namespace ROCKSDB_NAMESPACE {
23
24
 
24
25
  void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment,
25
26
  uint64_t offset,
26
- size_t roundup_len, size_t index,
27
- bool refit_tail,
27
+ size_t roundup_len,
28
+ uint32_t index, bool refit_tail,
28
29
  uint64_t& chunk_len) {
29
30
  uint64_t chunk_offset_in_buffer = 0;
30
31
  bool copy_data_to_new_buffer = false;
@@ -32,9 +33,7 @@ void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment,
32
33
  // If only a few bytes exist -- reuse them & read only what is really needed.
33
34
  // This is typically the case of incremental reading of data.
34
35
  // If no bytes exist in buffer -- full pread.
35
- if (bufs_[index].buffer_.CurrentSize() > 0 &&
36
- offset >= bufs_[index].offset_ &&
37
- offset <= bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()) {
36
+ if (DoesBufferContainData(index) && IsOffsetInBuffer(offset, index)) {
38
37
  // Only a few requested bytes are in the buffer. memmove those chunk of
39
38
  // bytes to the beginning, and memcpy them back into the new buffer if a
40
39
  // new buffer is created.
@@ -43,7 +42,7 @@ void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment,
43
42
  chunk_len = static_cast<uint64_t>(bufs_[index].buffer_.CurrentSize()) -
44
43
  chunk_offset_in_buffer;
45
44
  assert(chunk_offset_in_buffer % alignment == 0);
46
- // assert(chunk_len % alignment == 0);
45
+ assert(chunk_len % alignment == 0);
47
46
  assert(chunk_offset_in_buffer + chunk_len <=
48
47
  bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
49
48
  if (chunk_len > 0) {
@@ -108,7 +107,7 @@ Status FilePrefetchBuffer::Read(const IOOptions& opts,
108
107
 
109
108
  Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts,
110
109
  RandomAccessFileReader* reader,
111
- uint64_t read_len, uint64_t chunk_len,
110
+ uint64_t read_len,
112
111
  uint64_t rounddown_start, uint32_t index) {
113
112
  // callback for async read request.
114
113
  auto fp = std::bind(&FilePrefetchBuffer::PrefetchAsyncCallback, this,
@@ -116,15 +115,18 @@ Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts,
116
115
  FSReadRequest req;
117
116
  Slice result;
118
117
  req.len = read_len;
119
- req.offset = rounddown_start + chunk_len;
118
+ req.offset = rounddown_start;
120
119
  req.result = result;
121
- req.scratch = bufs_[index].buffer_.BufferStart() + chunk_len;
122
- Status s = reader->ReadAsync(req, opts, fp,
123
- /*cb_arg=*/nullptr, &io_handle_, &del_fn_,
124
- /*aligned_buf=*/nullptr);
120
+ req.scratch = bufs_[index].buffer_.BufferStart();
121
+ bufs_[index].async_req_len_ = req.len;
122
+
123
+ Status s =
124
+ reader->ReadAsync(req, opts, fp, &(bufs_[index].pos_),
125
+ &(bufs_[index].io_handle_), &(bufs_[index].del_fn_),
126
+ /*aligned_buf=*/nullptr);
125
127
  req.status.PermitUncheckedError();
126
128
  if (s.ok()) {
127
- async_read_in_progress_ = true;
129
+ bufs_[index].async_read_in_progress_ = true;
128
130
  }
129
131
  return s;
130
132
  }
@@ -170,8 +172,7 @@ void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset,
170
172
  }
171
173
  uint64_t copy_offset = (offset - bufs_[src].offset_);
172
174
  size_t copy_len = 0;
173
- if (offset + length <=
174
- bufs_[src].offset_ + bufs_[src].buffer_.CurrentSize()) {
175
+ if (IsDataBlockInBuffer(offset, length, src)) {
175
176
  // All the bytes are in src.
176
177
  copy_len = length;
177
178
  } else {
@@ -194,65 +195,121 @@ void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset,
194
195
  }
195
196
  }
196
197
 
197
- void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
198
- if (async_read_in_progress_ && fs_ != nullptr) {
199
- // Wait for prefetch data to complete.
200
- // No mutex is needed as PrefetchAsyncCallback updates the result in second
201
- // buffer and FilePrefetchBuffer should wait for Poll before accessing the
202
- // second buffer.
203
- std::vector<void*> handles;
204
- handles.emplace_back(io_handle_);
205
- StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
206
- fs_->Poll(handles, 1).PermitUncheckedError();
207
- }
208
-
209
- // Reset and Release io_handle_ after the Poll API as request has been
210
- // completed.
211
- async_read_in_progress_ = false;
212
- if (io_handle_ != nullptr && del_fn_ != nullptr) {
213
- del_fn_(io_handle_);
214
- io_handle_ = nullptr;
215
- del_fn_ = nullptr;
216
- }
217
-
218
- // Index of second buffer.
198
+ // Clear the buffers if it contains outdated data. Outdated data can be
199
+ // because previous sequential reads were read from the cache instead of these
200
+ // buffer. In that case outdated IOs should be aborted.
201
+ void FilePrefetchBuffer::AbortIOIfNeeded(uint64_t offset) {
219
202
  uint32_t second = curr_ ^ 1;
203
+ std::vector<void*> handles;
204
+ autovector<uint32_t> buf_pos;
205
+ if (IsBufferOutdatedWithAsyncProgress(offset, curr_)) {
206
+ handles.emplace_back(bufs_[curr_].io_handle_);
207
+ buf_pos.emplace_back(curr_);
208
+ }
209
+ if (IsBufferOutdatedWithAsyncProgress(offset, second)) {
210
+ handles.emplace_back(bufs_[second].io_handle_);
211
+ buf_pos.emplace_back(second);
212
+ }
213
+ if (!handles.empty()) {
214
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
215
+ Status s = fs_->AbortIO(handles);
216
+ assert(s.ok());
217
+ }
220
218
 
221
- // First clear the buffers if it contains outdated data. Outdated data can be
222
- // because previous sequential reads were read from the cache instead of these
223
- // buffer.
224
- {
225
- if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
226
- offset >= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
227
- bufs_[curr_].buffer_.Clear();
228
- }
229
- if (bufs_[second].buffer_.CurrentSize() > 0 &&
230
- offset >= bufs_[second].offset_ + bufs_[second].buffer_.CurrentSize()) {
231
- bufs_[second].buffer_.Clear();
219
+ for (auto& pos : buf_pos) {
220
+ // Release io_handle.
221
+ DestroyAndClearIOHandle(pos);
222
+ }
223
+
224
+ if (bufs_[second].io_handle_ == nullptr) {
225
+ bufs_[second].async_read_in_progress_ = false;
226
+ }
227
+
228
+ if (bufs_[curr_].io_handle_ == nullptr &&
229
+ bufs_[curr_].async_read_in_progress_) {
230
+ bufs_[curr_].async_read_in_progress_ = false;
231
+ curr_ = curr_ ^ 1;
232
+ }
233
+ }
234
+
235
+ void FilePrefetchBuffer::AbortAllIOs() {
236
+ uint32_t second = curr_ ^ 1;
237
+ std::vector<void*> handles;
238
+ for (uint32_t i = 0; i < 2; i++) {
239
+ if (bufs_[i].async_read_in_progress_ && bufs_[i].io_handle_ != nullptr) {
240
+ handles.emplace_back(bufs_[i].io_handle_);
232
241
  }
233
242
  }
243
+ if (!handles.empty()) {
244
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
245
+ Status s = fs_->AbortIO(handles);
246
+ assert(s.ok());
247
+ }
234
248
 
235
- // If data is in second buffer, make it curr_. Second buffer can be either
236
- // partial filled or full.
237
- if (bufs_[second].buffer_.CurrentSize() > 0 &&
238
- offset >= bufs_[second].offset_ &&
239
- offset < bufs_[second].offset_ + bufs_[second].buffer_.CurrentSize()) {
249
+ // Release io_handles.
250
+ if (bufs_[curr_].io_handle_ != nullptr && bufs_[curr_].del_fn_ != nullptr) {
251
+ DestroyAndClearIOHandle(curr_);
252
+ }
253
+
254
+ if (bufs_[second].io_handle_ != nullptr && bufs_[second].del_fn_ != nullptr) {
255
+ DestroyAndClearIOHandle(second);
256
+ }
257
+ }
258
+
259
+ // Clear the buffers if it contains outdated data. Outdated data can be
260
+ // because previous sequential reads were read from the cache instead of these
261
+ // buffer.
262
+ void FilePrefetchBuffer::UpdateBuffersIfNeeded(uint64_t offset) {
263
+ uint32_t second = curr_ ^ 1;
264
+ if (IsBufferOutdated(offset, curr_)) {
265
+ bufs_[curr_].buffer_.Clear();
266
+ }
267
+ if (IsBufferOutdated(offset, second)) {
268
+ bufs_[second].buffer_.Clear();
269
+ }
270
+
271
+ // If data starts from second buffer, make it curr_. Second buffer can be
272
+ // either partial filled or full.
273
+ if (!bufs_[second].async_read_in_progress_ && DoesBufferContainData(second) &&
274
+ IsOffsetInBuffer(offset, second)) {
240
275
  // Clear the curr_ as buffers have been swapped and curr_ contains the
241
276
  // outdated data and switch the buffers.
242
- bufs_[curr_].buffer_.Clear();
277
+ if (!bufs_[curr_].async_read_in_progress_) {
278
+ bufs_[curr_].buffer_.Clear();
279
+ }
243
280
  curr_ = curr_ ^ 1;
244
281
  }
245
282
  }
246
283
 
247
- // If async_read = true:
248
- // async_read is enabled in case of sequential reads. So when
249
- // buffers are switched, we clear the curr_ buffer as we assume the data has
250
- // been consumed because of sequential reads.
284
+ void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
285
+ if (bufs_[curr_].async_read_in_progress_ && fs_ != nullptr) {
286
+ if (bufs_[curr_].io_handle_ != nullptr) {
287
+ // Wait for prefetch data to complete.
288
+ // No mutex is needed as async_read_in_progress behaves as mutex and is
289
+ // updated by main thread only.
290
+ std::vector<void*> handles;
291
+ handles.emplace_back(bufs_[curr_].io_handle_);
292
+ StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
293
+ fs_->Poll(handles, 1).PermitUncheckedError();
294
+ }
295
+
296
+ // Reset and Release io_handle after the Poll API as request has been
297
+ // completed.
298
+ DestroyAndClearIOHandle(curr_);
299
+ }
300
+ UpdateBuffersIfNeeded(offset);
301
+ }
302
+
303
+ // If async_io is enabled in case of sequential reads, PrefetchAsyncInternal is
304
+ // called. When buffers are switched, we clear the curr_ buffer as we assume the
305
+ // data has been consumed because of sequential reads.
306
+ // Data in buffers will always be sequential with curr_ following second and
307
+ // not vice versa.
251
308
  //
252
309
  // Scenarios for prefetching asynchronously:
253
- // Case1: If both buffers are empty, prefetch n bytes
254
- // synchronously in curr_
255
- // and prefetch readahead_size_/2 async in second buffer.
310
+ // Case1: If both buffers are empty, prefetch n + readahead_size_/2 bytes
311
+ // synchronously in curr_ and prefetch readahead_size_/2 async in second
312
+ // buffer.
256
313
  // Case2: If second buffer has partial or full data, make it current and
257
314
  // prefetch readahead_size_/2 async in second buffer. In case of
258
315
  // partial data, prefetch remaining bytes from size n synchronously to
@@ -260,9 +317,10 @@ void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
260
317
  // Case3: If curr_ has partial data, prefetch remaining bytes from size n
261
318
  // synchronously in curr_ to fulfill the requested bytes request and
262
319
  // prefetch readahead_size_/2 bytes async in second buffer.
263
- // Case4: If data is in both buffers, copy requested data from curr_ and second
264
- // buffer to third buffer. If all requested bytes have been copied, do
265
- // the asynchronous prefetching in second buffer.
320
+ // Case4: (Special case) If data is in both buffers, copy requested data from
321
+ // curr_, send async request on curr_, wait for poll to fill second
322
+ // buffer (if any), and copy remaining data from second buffer to third
323
+ // buffer.
266
324
  Status FilePrefetchBuffer::PrefetchAsyncInternal(
267
325
  const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
268
326
  size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority,
@@ -273,39 +331,30 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(
273
331
 
274
332
  TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start");
275
333
 
276
- PollAndUpdateBuffersIfNeeded(offset);
277
-
278
- // If all the requested bytes are in curr_, it will go for async prefetching
279
- // only.
280
- if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
281
- offset + length <=
282
- bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
283
- offset += length;
284
- length = 0;
334
+ size_t alignment = reader->file()->GetRequiredBufferAlignment();
335
+ Status s;
336
+ uint64_t tmp_offset = offset;
337
+ size_t tmp_length = length;
285
338
 
286
- // Since async request was submitted directly by calling PrefetchAsync in
287
- // last call, we don't need to prefetch further as this call is to poll the
288
- // data submitted in previous call.
289
- if (async_request_submitted_) {
290
- return Status::OK();
339
+ // 1. Abort IO and swap buffers if needed to point curr_ to first buffer with
340
+ // data.
341
+ {
342
+ if (!explicit_prefetch_submitted_) {
343
+ AbortIOIfNeeded(offset);
291
344
  }
345
+ UpdateBuffersIfNeeded(offset);
292
346
  }
293
-
294
- async_request_submitted_ = false;
295
-
296
- Status s;
297
- size_t prefetch_size = length + readahead_size;
298
- size_t alignment = reader->file()->GetRequiredBufferAlignment();
299
- // Index of second buffer.
300
347
  uint32_t second = curr_ ^ 1;
301
348
 
302
- // Data is overlapping i.e. some of the data is in curr_ buffer and remaining
303
- // in second buffer.
304
- if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
305
- bufs_[second].buffer_.CurrentSize() > 0 &&
306
- offset >= bufs_[curr_].offset_ &&
307
- offset < bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() &&
308
- offset + length > bufs_[second].offset_) {
349
+ // 2. If data is overlapping over two buffers, copy the data from curr_ and
350
+ // call ReadAsync on curr_.
351
+ if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
352
+ IsOffsetInBuffer(offset, curr_) &&
353
+ (/*Data extends over curr_ buffer and second buffer either has data or in
354
+ process of population=*/
355
+ (offset + length > bufs_[second].offset_) &&
356
+ (bufs_[second].async_read_in_progress_ ||
357
+ DoesBufferContainData(second)))) {
309
358
  // Allocate new buffer to third buffer;
310
359
  bufs_[2].buffer_.Clear();
311
360
  bufs_[2].buffer_.Alignment(alignment);
@@ -313,25 +362,92 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(
313
362
  bufs_[2].offset_ = offset;
314
363
  copy_to_third_buffer = true;
315
364
 
316
- // Move data from curr_ buffer to third.
317
- CopyDataToBuffer(curr_, offset, length);
318
- if (length == 0) {
319
- // Requested data has been copied and curr_ still has unconsumed data.
365
+ CopyDataToBuffer(curr_, tmp_offset, tmp_length);
366
+
367
+ // Call async prefetching on curr_ since data has been consumed in curr_
368
+ // only if data lies within second buffer.
369
+ size_t second_size = bufs_[second].async_read_in_progress_
370
+ ? bufs_[second].async_req_len_
371
+ : bufs_[second].buffer_.CurrentSize();
372
+ if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size) {
373
+ uint64_t rounddown_start = bufs_[second].offset_ + second_size;
374
+ uint64_t roundup_end =
375
+ Roundup(rounddown_start + readahead_size, alignment);
376
+ uint64_t roundup_len = roundup_end - rounddown_start;
377
+ uint64_t chunk_len = 0;
378
+ CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, curr_,
379
+ false, chunk_len);
380
+ assert(chunk_len == 0);
381
+ assert(roundup_len >= chunk_len);
382
+
383
+ bufs_[curr_].offset_ = rounddown_start;
384
+ uint64_t read_len = static_cast<size_t>(roundup_len - chunk_len);
385
+ s = ReadAsync(opts, reader, read_len, rounddown_start, curr_);
386
+ if (!s.ok()) {
387
+ DestroyAndClearIOHandle(curr_);
388
+ bufs_[curr_].buffer_.Clear();
389
+ return s;
390
+ }
391
+ }
392
+ curr_ = curr_ ^ 1;
393
+ }
394
+
395
+ // 3. Call Poll only if data is needed for the second buffer.
396
+ // - Return if whole data is in curr_ and second buffer in progress.
397
+ // - If second buffer is empty, it will go for ReadAsync for second buffer.
398
+ if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
399
+ IsDataBlockInBuffer(offset, length, curr_)) {
400
+ // Whole data is in curr_.
401
+ UpdateBuffersIfNeeded(offset);
402
+ second = curr_ ^ 1;
403
+ if (bufs_[second].async_read_in_progress_) {
320
404
  return s;
321
405
  }
322
- CopyDataToBuffer(second, offset, length);
323
- // Length == 0: All the requested data has been copied to third buffer. It
324
- // should go for only async prefetching.
406
+ } else {
407
+ PollAndUpdateBuffersIfNeeded(offset);
408
+ second = curr_ ^ 1;
409
+ }
410
+
411
+ if (copy_to_third_buffer) {
412
+ offset = tmp_offset;
413
+ length = tmp_length;
414
+ }
415
+
416
+ // 4. After polling and swapping buffers, if all the requested bytes are in
417
+ // curr_, it will only go for async prefetching.
418
+ // copy_to_third_buffer is a special case so it will be handled separately.
419
+ if (!copy_to_third_buffer && DoesBufferContainData(curr_) &&
420
+ IsDataBlockInBuffer(offset, length, curr_)) {
421
+ offset += length;
422
+ length = 0;
423
+
424
+ // Since async request was submitted directly by calling PrefetchAsync in
425
+ // last call, we don't need to prefetch further as this call is to poll
426
+ // the data submitted in previous call.
427
+ if (explicit_prefetch_submitted_) {
428
+ return s;
429
+ }
430
+ }
431
+
432
+ // 5. Data is overlapping i.e. some of the data has been copied to third
433
+ // buffer
434
+ // and remaining will be updated below.
435
+ if (copy_to_third_buffer) {
436
+ CopyDataToBuffer(curr_, offset, length);
437
+
438
+ // Length == 0: All the requested data has been copied to third buffer and
439
+ // it has already gone for async prefetching. It can return without doing
440
+ // anything further.
325
441
  // Length > 0: More data needs to be consumed so it will continue async and
326
442
  // sync prefetching and copy the remaining data to third buffer in the end.
327
- // swap the buffers.
328
- curr_ = curr_ ^ 1;
329
- // Update prefetch_size as length has been updated in CopyDataToBuffer.
330
- prefetch_size = length + readahead_size;
443
+ if (length == 0) {
444
+ return s;
445
+ }
331
446
  }
332
447
 
448
+ // 6. Go for ReadAsync and Read (if needed).
449
+ size_t prefetch_size = length + readahead_size;
333
450
  size_t _offset = static_cast<size_t>(offset);
334
- second = curr_ ^ 1;
335
451
 
336
452
  // offset and size alignment for curr_ buffer with synchronous prefetching
337
453
  uint64_t rounddown_start1 = Rounddown(_offset, alignment);
@@ -368,19 +484,34 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(
368
484
  uint64_t chunk_len2 = 0;
369
485
  CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second,
370
486
  false /*refit_tail*/, chunk_len2);
371
-
487
+ assert(chunk_len2 == 0);
372
488
  // Update the buffer offset.
373
489
  bufs_[second].offset_ = rounddown_start2;
374
490
  assert(roundup_len2 >= chunk_len2);
375
491
  uint64_t read_len2 = static_cast<size_t>(roundup_len2 - chunk_len2);
376
- ReadAsync(opts, reader, read_len2, chunk_len2, rounddown_start2, second)
377
- .PermitUncheckedError();
492
+ Status tmp_s = ReadAsync(opts, reader, read_len2, rounddown_start2, second);
493
+ if (!tmp_s.ok()) {
494
+ DestroyAndClearIOHandle(second);
495
+ bufs_[second].buffer_.Clear();
496
+ }
378
497
  }
379
498
 
380
499
  if (read_len1 > 0) {
381
500
  s = Read(opts, reader, rate_limiter_priority, read_len1, chunk_len1,
382
501
  rounddown_start1, curr_);
383
502
  if (!s.ok()) {
503
+ if (bufs_[second].io_handle_ != nullptr) {
504
+ std::vector<void*> handles;
505
+ handles.emplace_back(bufs_[second].io_handle_);
506
+ {
507
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
508
+ Status status = fs_->AbortIO(handles);
509
+ assert(status.ok());
510
+ }
511
+ }
512
+ DestroyAndClearIOHandle(second);
513
+ bufs_[second].buffer_.Clear();
514
+ bufs_[curr_].buffer_.Clear();
384
515
  return s;
385
516
  }
386
517
  }
@@ -462,12 +593,18 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
462
593
  return false;
463
594
  }
464
595
 
465
- // In case of async_io_, offset can be less than bufs_[curr_].offset_ because
466
- // of reads not sequential and PrefetchAsync can be called for any block and
467
- // RocksDB will call TryReadFromCacheAsync after PrefetchAsync to Poll for
468
- // requested bytes.
469
- if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_ &&
470
- prev_len_ != 0) {
596
+ if (explicit_prefetch_submitted_) {
597
+ if (prev_offset_ != offset) {
598
+ // Random offset called. So abort the IOs.
599
+ AbortAllIOs();
600
+ bufs_[curr_].buffer_.Clear();
601
+ bufs_[curr_ ^ 1].buffer_.Clear();
602
+ explicit_prefetch_submitted_ = false;
603
+ return false;
604
+ }
605
+ }
606
+
607
+ if (!explicit_prefetch_submitted_ && offset < bufs_[curr_].offset_) {
471
608
  return false;
472
609
  }
473
610
 
@@ -479,8 +616,11 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
479
616
  // If readahead is not enabled: return false.
480
617
  TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
481
618
  &readahead_size_);
482
- if (offset < bufs_[curr_].offset_ ||
483
- offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
619
+
620
+ if (explicit_prefetch_submitted_ ||
621
+ (bufs_[curr_].async_read_in_progress_ ||
622
+ offset + n >
623
+ bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize())) {
484
624
  if (readahead_size_ > 0) {
485
625
  Status s;
486
626
  assert(reader != nullptr);
@@ -493,11 +633,11 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
493
633
  return false;
494
634
  }
495
635
  }
496
-
497
636
  // Prefetch n + readahead_size_/2 synchronously as remaining
498
637
  // readahead_size_/2 will be prefetched asynchronously.
499
638
  s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2,
500
639
  rate_limiter_priority, copy_to_third_buffer);
640
+ explicit_prefetch_submitted_ = false;
501
641
  if (!s.ok()) {
502
642
  if (status) {
503
643
  *status = s;
@@ -507,11 +647,12 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
507
647
  #endif
508
648
  return false;
509
649
  }
510
- prefetched = async_request_submitted_ ? false : true;
650
+ prefetched = explicit_prefetch_submitted_ ? false : true;
511
651
  } else {
512
652
  return false;
513
653
  }
514
654
  }
655
+
515
656
  UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
516
657
 
517
658
  uint32_t index = curr_;
@@ -523,14 +664,12 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
523
664
  if (prefetched) {
524
665
  readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
525
666
  }
526
- async_request_submitted_ = false;
527
667
  return true;
528
668
  }
529
669
 
530
670
  void FilePrefetchBuffer::PrefetchAsyncCallback(const FSReadRequest& req,
531
- void* /*cb_arg*/) {
532
- uint32_t index = curr_ ^ 1;
533
-
671
+ void* cb_arg) {
672
+ uint32_t index = *(static_cast<uint32_t*>(cb_arg));
534
673
  #ifndef NDEBUG
535
674
  if (req.result.size() < req.len) {
536
675
  // Fake an IO error to force db_stress fault injection to ignore
@@ -565,82 +704,133 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
565
704
  if (!enable_) {
566
705
  return Status::NotSupported();
567
706
  }
707
+
568
708
  TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");
569
709
 
570
- PollAndUpdateBuffersIfNeeded(offset);
710
+ num_file_reads_ = 0;
711
+ explicit_prefetch_submitted_ = false;
712
+ bool is_eligible_for_prefetching = false;
713
+ if (readahead_size_ > 0 &&
714
+ (!implicit_auto_readahead_ ||
715
+ num_file_reads_ + 1 >= num_file_reads_for_auto_readahead_)) {
716
+ is_eligible_for_prefetching = true;
717
+ }
571
718
 
572
- // Index of second buffer.
573
- uint32_t second = curr_ ^ 1;
719
+ // 1. Cancel any pending async read to make code simpler as buffers can be out
720
+ // of sync.
721
+ AbortAllIOs();
574
722
 
723
+ // 2. Clear outdated data.
724
+ UpdateBuffersIfNeeded(offset);
725
+ uint32_t second = curr_ ^ 1;
575
726
  // Since PrefetchAsync can be called on non sequential reads. So offset can
576
- // be less than buffers' offset. In that case it clears the buffer and
577
- // prefetch that block.
578
- if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_) {
727
+ // be less than curr_ buffers' offset. In that case also it clears both
728
+ // buffers.
729
+ if (DoesBufferContainData(curr_) && !IsOffsetInBuffer(offset, curr_)) {
579
730
  bufs_[curr_].buffer_.Clear();
731
+ bufs_[second].buffer_.Clear();
580
732
  }
581
733
 
582
- // All requested bytes are already in the curr_ buffer. So no need to Read
583
- // again.
584
- if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
585
- offset + n <= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
734
+ UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
735
+
736
+ bool data_found = false;
737
+
738
+ // 3. If curr_ has full data.
739
+ if (DoesBufferContainData(curr_) && IsDataBlockInBuffer(offset, n, curr_)) {
586
740
  uint64_t offset_in_buffer = offset - bufs_[curr_].offset_;
587
741
  *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n);
588
- return Status::OK();
742
+ data_found = true;
743
+ // Update num_file_reads_ as TryReadFromCacheAsync won't be called for
744
+ // poll and update num_file_reads_ if data is found.
745
+ num_file_reads_++;
746
+
747
+ // 3.1 If second also has some data or is not eligible for prefetching,
748
+ // return.
749
+ if (!is_eligible_for_prefetching || DoesBufferContainData(second)) {
750
+ return Status::OK();
751
+ }
752
+ } else {
753
+ // Partial data in curr_.
754
+ bufs_[curr_].buffer_.Clear();
589
755
  }
756
+ bufs_[second].buffer_.Clear();
590
757
 
591
758
  Status s;
592
759
  size_t alignment = reader->file()->GetRequiredBufferAlignment();
593
-
594
- // TODO akanksha: Handle the scenario if data is overlapping in 2 buffers.
595
- // Currently, tt covers 2 scenarios. Either one buffer (curr_) has no data or
596
- // it has partial data. It ignores the contents in second buffer (overlapping
597
- // data in 2 buffers) and send the request to re-read that data again.
598
-
599
- // Clear the second buffer in order to do asynchronous prefetching.
600
- bufs_[second].buffer_.Clear();
601
-
760
+ size_t prefetch_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0;
602
761
  size_t offset_to_read = static_cast<size_t>(offset);
603
- uint64_t rounddown_start = 0;
604
- uint64_t roundup_end = 0;
605
-
606
- if (bufs_[curr_].buffer_.CurrentSize() == 0) {
607
- // Prefetch full data.
608
- rounddown_start = Rounddown(offset_to_read, alignment);
609
- roundup_end = Roundup(offset_to_read + n, alignment);
610
- } else {
611
- // Prefetch remaining data.
612
- size_t rem_length = n - (bufs_[curr_].buffer_.CurrentSize() -
613
- (offset - bufs_[curr_].offset_));
614
- rounddown_start = bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
615
- roundup_end = Roundup(rounddown_start + rem_length, alignment);
762
+ uint64_t rounddown_start1 = 0;
763
+ uint64_t roundup_end1 = 0;
764
+ uint64_t rounddown_start2 = 0;
765
+ uint64_t roundup_end2 = 0;
766
+ uint64_t chunk_len1 = 0;
767
+ uint64_t chunk_len2 = 0;
768
+ size_t read_len1 = 0;
769
+ size_t read_len2 = 0;
770
+
771
+ // - If curr_ is empty.
772
+ // - Call async read for full data + prefetch_size on curr_.
773
+ // - Call async read for prefetch_size on second if eligible.
774
+ // - If curr_ is filled.
775
+ // - prefetch_size on second.
776
+ // Calculate length and offsets for reading.
777
+ if (!DoesBufferContainData(curr_)) {
778
+ // Prefetch full data + prefetch_size in curr_.
779
+ rounddown_start1 = Rounddown(offset_to_read, alignment);
780
+ roundup_end1 = Roundup(offset_to_read + n + prefetch_size, alignment);
781
+ uint64_t roundup_len1 = roundup_end1 - rounddown_start1;
782
+ assert(roundup_len1 >= alignment);
783
+ assert(roundup_len1 % alignment == 0);
784
+
785
+ CalculateOffsetAndLen(alignment, rounddown_start1, roundup_len1, curr_,
786
+ false, chunk_len1);
787
+ assert(chunk_len1 == 0);
788
+ assert(roundup_len1 >= chunk_len1);
789
+ read_len1 = static_cast<size_t>(roundup_len1 - chunk_len1);
790
+ bufs_[curr_].offset_ = rounddown_start1;
616
791
  }
617
792
 
618
- uint64_t roundup_len = roundup_end - rounddown_start;
619
- assert(roundup_len >= alignment);
620
- assert(roundup_len % alignment == 0);
621
-
622
- uint64_t chunk_len = 0;
623
- CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, second, false,
624
- chunk_len);
625
-
626
- // Update the buffer offset.
627
- bufs_[second].offset_ = rounddown_start;
628
- assert(roundup_len >= chunk_len);
629
-
630
- size_t read_len = static_cast<size_t>(roundup_len - chunk_len);
793
+ if (is_eligible_for_prefetching) {
794
+ if (DoesBufferContainData(curr_)) {
795
+ rounddown_start2 =
796
+ bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
797
+ } else {
798
+ rounddown_start2 = roundup_end1;
799
+ }
631
800
 
632
- s = ReadAsync(opts, reader, read_len, chunk_len, rounddown_start, second);
801
+ roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment);
802
+ uint64_t roundup_len2 = roundup_end2 - rounddown_start2;
633
803
 
634
- if (!s.ok()) {
635
- return s;
804
+ assert(roundup_len2 >= alignment);
805
+ CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second,
806
+ false, chunk_len2);
807
+ assert(chunk_len2 == 0);
808
+ assert(roundup_len2 >= chunk_len2);
809
+ read_len2 = static_cast<size_t>(roundup_len2 - chunk_len2);
810
+ // Update the buffer offset.
811
+ bufs_[second].offset_ = rounddown_start2;
636
812
  }
637
813
 
638
- // Update read pattern so that TryReadFromCacheAsync call be called to Poll
639
- // the data. It will return without polling if blocks are not sequential.
640
- UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
641
- prev_len_ = 0;
642
- async_request_submitted_ = true;
643
-
644
- return Status::TryAgain();
814
+ if (read_len1) {
815
+ s = ReadAsync(opts, reader, read_len1, rounddown_start1, curr_);
816
+ if (!s.ok()) {
817
+ DestroyAndClearIOHandle(curr_);
818
+ bufs_[curr_].buffer_.Clear();
819
+ return s;
820
+ }
821
+ explicit_prefetch_submitted_ = true;
822
+ prev_len_ = 0;
823
+ }
824
+ if (read_len2) {
825
+ s = ReadAsync(opts, reader, read_len2, rounddown_start2, second);
826
+ if (!s.ok()) {
827
+ DestroyAndClearIOHandle(second);
828
+ bufs_[second].buffer_.Clear();
829
+ return s;
830
+ }
831
+ readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
832
+ }
833
+ return (data_found ? Status::OK() : Status::TryAgain());
645
834
  }
835
+
646
836
  } // namespace ROCKSDB_NAMESPACE