@nxtedition/rocksdb 7.1.14 → 7.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223) hide show
  1. package/binding.cc +1 -0
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +72 -18
  3. package/deps/rocksdb/rocksdb/Makefile +91 -11
  4. package/deps/rocksdb/rocksdb/TARGETS +8 -4
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +5 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +13 -8
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
  8. package/deps/rocksdb/rocksdb/cache/cache_test.cc +116 -57
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +958 -459
  10. package/deps/rocksdb/rocksdb/cache/clock_cache.h +407 -622
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +104 -40
  12. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +23 -8
  13. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +350 -184
  14. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +12 -2
  15. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +2 -0
  16. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +130 -43
  17. package/deps/rocksdb/rocksdb/cache/lru_cache.h +24 -2
  18. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +423 -98
  19. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +19 -2
  20. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +10 -7
  21. package/deps/rocksdb/rocksdb/crash_test.mk +2 -2
  22. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +46 -26
  23. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +9 -3
  24. package/deps/rocksdb/rocksdb/db/blob/blob_contents.cc +90 -0
  25. package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +56 -0
  26. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -10
  27. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +64 -59
  28. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +11 -8
  29. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +92 -62
  30. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +159 -136
  31. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +13 -13
  32. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +129 -57
  33. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +81 -3
  34. package/deps/rocksdb/rocksdb/db/c.cc +29 -0
  35. package/deps/rocksdb/rocksdb/db/column_family.cc +10 -1
  36. package/deps/rocksdb/rocksdb/db/column_family_test.cc +21 -0
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +42 -36
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +344 -102
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +163 -28
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +52 -17
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +35 -30
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -3
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +167 -11
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +8 -8
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +10 -13
  46. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +0 -117
  47. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +6 -49
  48. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +29 -4
  49. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +18 -11
  50. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +4 -10
  51. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +1 -1
  52. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +12 -0
  53. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +144 -93
  54. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +28 -32
  55. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +1 -1
  56. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -9
  57. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +2 -33
  58. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -5
  59. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +11 -0
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -2
  61. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +8 -0
  62. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +2 -1
  63. package/deps/rocksdb/rocksdb/db/db_iter.cc +76 -138
  64. package/deps/rocksdb/rocksdb/db/db_iter.h +26 -23
  65. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +1 -1
  66. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +931 -0
  67. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +2 -2
  68. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -0
  69. package/deps/rocksdb/rocksdb/db/db_test2.cc +44 -22
  70. package/deps/rocksdb/rocksdb/db/db_test_util.cc +6 -14
  71. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +155 -0
  72. package/deps/rocksdb/rocksdb/db/db_write_test.cc +45 -0
  73. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -1
  74. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +8 -0
  75. package/deps/rocksdb/rocksdb/db/experimental.cc +5 -1
  76. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +24 -12
  77. package/deps/rocksdb/rocksdb/db/internal_stats.cc +7 -1
  78. package/deps/rocksdb/rocksdb/db/internal_stats.h +3 -0
  79. package/deps/rocksdb/rocksdb/db/memtable.cc +79 -18
  80. package/deps/rocksdb/rocksdb/db/memtable.h +5 -0
  81. package/deps/rocksdb/rocksdb/db/memtable_list.cc +26 -4
  82. package/deps/rocksdb/rocksdb/db/memtable_list.h +2 -1
  83. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +113 -0
  84. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +110 -0
  85. package/deps/rocksdb/rocksdb/db/{periodic_work_scheduler_test.cc → periodic_task_scheduler_test.cc} +33 -39
  86. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +12 -20
  87. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +6 -5
  88. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +12 -8
  89. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc +20 -5
  90. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +14 -0
  91. package/deps/rocksdb/rocksdb/db/repair.cc +17 -8
  92. package/deps/rocksdb/rocksdb/db/repair_test.cc +2 -1
  93. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +49 -66
  94. package/deps/rocksdb/rocksdb/db/table_cache.cc +92 -63
  95. package/deps/rocksdb/rocksdb/db/table_cache.h +16 -9
  96. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
  97. package/deps/rocksdb/rocksdb/db/table_properties_collector.cc +2 -2
  98. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -3
  99. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
  100. package/deps/rocksdb/rocksdb/db/version_builder.cc +1 -1
  101. package/deps/rocksdb/rocksdb/db/version_edit.h +1 -2
  102. package/deps/rocksdb/rocksdb/db/version_set.cc +379 -145
  103. package/deps/rocksdb/rocksdb/db/version_set.h +26 -24
  104. package/deps/rocksdb/rocksdb/db/version_set_test.cc +9 -9
  105. package/deps/rocksdb/rocksdb/db/version_util.h +3 -2
  106. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +10 -2
  107. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +2 -0
  108. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +5 -8
  109. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +5 -8
  110. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress.cc +2 -0
  111. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +71 -0
  112. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +14 -0
  113. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +23 -0
  114. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +26 -1
  115. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +105 -34
  116. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +16 -8
  117. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +6 -0
  118. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +4 -8
  119. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -8
  120. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +282 -25
  121. package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
  122. package/deps/rocksdb/rocksdb/env/io_posix.cc +3 -1
  123. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +367 -177
  124. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +144 -56
  125. package/deps/rocksdb/rocksdb/file/filename.cc +3 -3
  126. package/deps/rocksdb/rocksdb/file/filename.h +4 -2
  127. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +415 -0
  128. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +2 -0
  129. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +36 -45
  130. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +21 -3
  131. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +11 -11
  132. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +15 -1
  133. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +163 -68
  134. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +26 -12
  135. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +23 -5
  136. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +21 -17
  137. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +17 -0
  138. package/deps/rocksdb/rocksdb/include/rocksdb/persistent_cache.h +3 -3
  139. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +17 -6
  140. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +3 -0
  141. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -0
  142. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +3 -3
  143. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/option_change_migration.h +4 -0
  144. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  145. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +3 -0
  146. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -1
  147. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch_base.h +2 -1
  148. package/deps/rocksdb/rocksdb/logging/env_logger.h +2 -2
  149. package/deps/rocksdb/rocksdb/monitoring/histogram.cc +4 -2
  150. package/deps/rocksdb/rocksdb/monitoring/histogram.h +2 -0
  151. package/deps/rocksdb/rocksdb/monitoring/histogram_test.cc +15 -1
  152. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.cc +17 -0
  153. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +14 -3
  154. package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +3 -0
  155. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +50 -0
  156. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +1 -0
  157. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +31 -32
  158. package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -1
  159. package/deps/rocksdb/rocksdb/options/options.cc +2 -2
  160. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -1
  161. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +1 -0
  162. package/deps/rocksdb/rocksdb/src.mk +4 -2
  163. package/deps/rocksdb/rocksdb/table/block_based/block.h +9 -8
  164. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +110 -99
  165. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +12 -10
  166. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +11 -2
  167. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +138 -83
  168. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +25 -24
  169. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +31 -30
  170. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +16 -13
  171. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +4 -4
  172. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -3
  173. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +3 -3
  174. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +17 -19
  175. package/deps/rocksdb/rocksdb/table/block_fetcher.h +1 -1
  176. package/deps/rocksdb/rocksdb/table/format.cc +26 -29
  177. package/deps/rocksdb/rocksdb/table/format.h +44 -26
  178. package/deps/rocksdb/rocksdb/table/get_context.cc +17 -12
  179. package/deps/rocksdb/rocksdb/table/internal_iterator.h +7 -0
  180. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +4 -0
  181. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +950 -104
  182. package/deps/rocksdb/rocksdb/table/merging_iterator.h +28 -1
  183. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +3 -2
  184. package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -1
  185. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.cc +10 -9
  186. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.h +22 -20
  187. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +1 -1
  188. package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +1 -1
  189. package/deps/rocksdb/rocksdb/table/table_builder.h +9 -21
  190. package/deps/rocksdb/rocksdb/table/table_test.cc +12 -12
  191. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +4 -4
  192. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +1 -0
  193. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +116 -34
  194. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +6 -1
  195. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +1 -1
  196. package/deps/rocksdb/rocksdb/util/autovector.h +12 -0
  197. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +3 -2
  198. package/deps/rocksdb/rocksdb/util/stderr_logger.cc +30 -0
  199. package/deps/rocksdb/rocksdb/util/stderr_logger.h +5 -18
  200. package/deps/rocksdb/rocksdb/util/timer.h +2 -3
  201. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +9 -2
  202. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +1 -1
  203. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +1 -1
  204. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +34 -53
  205. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +9 -14
  206. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -4
  207. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +4 -0
  208. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +1 -1
  209. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +4 -3
  210. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +3 -1
  211. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +26 -8
  212. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +114 -16
  213. package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc +1 -1
  214. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +59 -0
  215. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +3 -0
  216. package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +39 -0
  217. package/deps/rocksdb/rocksdb.gyp +0 -1
  218. package/index.js +6 -10
  219. package/package.json +1 -1
  220. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  221. package/prebuilds/linux-x64/node.napi.node +0 -0
  222. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +0 -168
  223. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +0 -90
@@ -20,24 +20,42 @@
20
20
  #include "rocksdb/file_system.h"
21
21
  #include "rocksdb/options.h"
22
22
  #include "util/aligned_buffer.h"
23
+ #include "util/autovector.h"
24
+ #include "util/stop_watch.h"
23
25
 
24
26
  namespace ROCKSDB_NAMESPACE {
25
27
 
26
- #define DEAFULT_DECREMENT 8 * 1024
28
+ #define DEFAULT_DECREMENT 8 * 1024
27
29
 
28
30
  struct IOOptions;
29
31
  class RandomAccessFileReader;
30
32
 
31
33
  struct BufferInfo {
32
34
  AlignedBuffer buffer_;
35
+
33
36
  uint64_t offset_ = 0;
37
+
38
+ // Below parameters are used in case of async read flow.
39
+ // Length requested for in ReadAsync.
40
+ size_t async_req_len_ = 0;
41
+
42
+ // async_read_in_progress can be used as mutex. Callback can update the buffer
43
+ // and its size but async_read_in_progress is only set by main thread.
44
+ bool async_read_in_progress_ = false;
45
+
46
+ // io_handle is allocated and used by underlying file system in case of
47
+ // asynchronous reads.
48
+ void* io_handle_ = nullptr;
49
+
50
+ IOHandleDeleter del_fn_ = nullptr;
51
+
52
+ // pos represents the index of this buffer in vector of BufferInfo.
53
+ uint32_t pos_ = 0;
34
54
  };
35
55
 
36
56
  // FilePrefetchBuffer is a smart buffer to store and read data from a file.
37
57
  class FilePrefetchBuffer {
38
58
  public:
39
- static const int kMinNumFileReadsToStartAutoReadahead = 2;
40
-
41
59
  // Constructor.
42
60
  //
43
61
  // All arguments are optional.
@@ -54,9 +72,6 @@ class FilePrefetchBuffer {
54
72
  // it. Used for adaptable readahead of the file footer/metadata.
55
73
  // implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after
56
74
  // doing sequential scans for two times.
57
- // async_io : When async_io is enabled, if it's implicit_auto_readahead, it
58
- // prefetches data asynchronously in second buffer while curr_ is being
59
- // consumed.
60
75
  //
61
76
  // Automatic readahead is enabled for a file if readahead_size
62
77
  // and max_readahead_size are passed in.
@@ -65,8 +80,10 @@ class FilePrefetchBuffer {
65
80
  FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0,
66
81
  bool enable = true, bool track_min_offset = false,
67
82
  bool implicit_auto_readahead = false,
68
- uint64_t num_file_reads = 0, FileSystem* fs = nullptr,
69
- SystemClock* clock = nullptr, Statistics* stats = nullptr)
83
+ uint64_t num_file_reads = 0,
84
+ uint64_t num_file_reads_for_auto_readahead = 0,
85
+ FileSystem* fs = nullptr, SystemClock* clock = nullptr,
86
+ Statistics* stats = nullptr)
70
87
  : curr_(0),
71
88
  readahead_size_(readahead_size),
72
89
  initial_auto_readahead_size_(readahead_size),
@@ -77,47 +94,83 @@ class FilePrefetchBuffer {
77
94
  implicit_auto_readahead_(implicit_auto_readahead),
78
95
  prev_offset_(0),
79
96
  prev_len_(0),
97
+ num_file_reads_for_auto_readahead_(num_file_reads_for_auto_readahead),
80
98
  num_file_reads_(num_file_reads),
81
- io_handle_(nullptr),
82
- del_fn_(nullptr),
83
- async_read_in_progress_(false),
84
- async_request_submitted_(false),
99
+ explicit_prefetch_submitted_(false),
85
100
  fs_(fs),
86
101
  clock_(clock),
87
102
  stats_(stats) {
88
- assert((num_file_reads_ >= kMinNumFileReadsToStartAutoReadahead + 1) ||
103
+ assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) ||
89
104
  (num_file_reads_ == 0));
90
- // If async_io_ is enabled, data is asynchronously filled in second buffer
91
- // while curr_ is being consumed. If data is overlapping in two buffers,
92
- // data is copied to third buffer to return continuous buffer.
105
+ // If ReadOptions.async_io is enabled, data is asynchronously filled in
106
+ // second buffer while curr_ is being consumed. If data is overlapping in
107
+ // two buffers, data is copied to third buffer to return continuous buffer.
93
108
  bufs_.resize(3);
109
+ for (uint32_t i = 0; i < 2; i++) {
110
+ bufs_[i].pos_ = i;
111
+ }
94
112
  }
95
113
 
96
114
  ~FilePrefetchBuffer() {
97
115
  // Abort any pending async read request before destroying the class object.
98
- if (async_read_in_progress_ && fs_ != nullptr) {
116
+ if (fs_ != nullptr) {
99
117
  std::vector<void*> handles;
100
- handles.emplace_back(io_handle_);
101
- Status s = fs_->AbortIO(handles);
102
- assert(s.ok());
118
+ for (uint32_t i = 0; i < 2; i++) {
119
+ if (bufs_[i].async_read_in_progress_ &&
120
+ bufs_[i].io_handle_ != nullptr) {
121
+ handles.emplace_back(bufs_[i].io_handle_);
122
+ }
123
+ }
124
+ if (!handles.empty()) {
125
+ StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
126
+ Status s = fs_->AbortIO(handles);
127
+ assert(s.ok());
128
+ }
103
129
  }
104
130
 
105
131
  // Prefetch buffer bytes discarded.
106
132
  uint64_t bytes_discarded = 0;
107
- if (bufs_[curr_].buffer_.CurrentSize() != 0) {
108
- bytes_discarded = bufs_[curr_].buffer_.CurrentSize();
109
- }
110
- if (bufs_[curr_ ^ 1].buffer_.CurrentSize() != 0) {
111
- bytes_discarded += bufs_[curr_ ^ 1].buffer_.CurrentSize();
133
+ // Iterate over the 2 buffers.
134
+ for (int i = 0; i < 2; i++) {
135
+ int first = i;
136
+ int second = i ^ 1;
137
+
138
+ if (DoesBufferContainData(first)) {
139
+ // If last block was read completely from first and some bytes in
140
+ // first buffer are still unconsumed.
141
+ if (prev_offset_ >= bufs_[first].offset_ &&
142
+ prev_offset_ + prev_len_ <
143
+ bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize()) {
144
+ bytes_discarded += bufs_[first].buffer_.CurrentSize() -
145
+ (prev_offset_ + prev_len_ - bufs_[first].offset_);
146
+ }
147
+ // If data was in second buffer and some/whole block bytes were read
148
+ // from second buffer.
149
+ else if (prev_offset_ < bufs_[first].offset_ &&
150
+ !DoesBufferContainData(second)) {
151
+ // If last block read was completely from different buffer, this
152
+ // buffer is unconsumed.
153
+ if (prev_offset_ + prev_len_ <= bufs_[first].offset_) {
154
+ bytes_discarded += bufs_[first].buffer_.CurrentSize();
155
+ }
156
+ // If last block read overlaps with this buffer and some data is
157
+ // still unconsumed and previous buffer (second) is not cleared.
158
+ else if (prev_offset_ + prev_len_ > bufs_[first].offset_ &&
159
+ bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize() ==
160
+ bufs_[second].offset_) {
161
+ bytes_discarded += bufs_[first].buffer_.CurrentSize() -
162
+ (/*bytes read from this buffer=*/prev_len_ -
163
+ (bufs_[first].offset_ - prev_offset_));
164
+ }
165
+ }
166
+ }
112
167
  }
113
- RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded);
114
168
 
115
- // Release io_handle_.
116
- if (io_handle_ != nullptr && del_fn_ != nullptr) {
117
- del_fn_(io_handle_);
118
- io_handle_ = nullptr;
119
- del_fn_ = nullptr;
169
+ for (uint32_t i = 0; i < 2; i++) {
170
+ // Release io_handle.
171
+ DestroyAndClearIOHandle(i);
120
172
  }
173
+ RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded);
121
174
  }
122
175
 
123
176
  // Load data into the buffer from a file.
@@ -126,9 +179,6 @@ class FilePrefetchBuffer {
126
179
  // n : the number of bytes to read.
127
180
  // rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
128
181
  // bypass.
129
- // is_async_read : if the data should be prefetched by calling read
130
- // asynchronously. It should be set true when called
131
- // from TryReadFromCache.
132
182
  Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader,
133
183
  uint64_t offset, size_t n,
134
184
  Env::IOPriority rate_limiter_priority);
@@ -194,7 +244,7 @@ class FilePrefetchBuffer {
194
244
  }
195
245
 
196
246
  void DecreaseReadAheadIfEligible(uint64_t offset, size_t size,
197
- size_t value = DEAFULT_DECREMENT) {
247
+ size_t value = DEFAULT_DECREMENT) {
198
248
  // Decrease the readahead_size if
199
249
  // - its enabled internally by RocksDB (implicit_auto_readahead_) and,
200
250
  // - readahead_size is greater than 0 and,
@@ -203,12 +253,14 @@ class FilePrefetchBuffer {
203
253
  // - few/no bytes are in buffer and,
204
254
  // - block is sequential with the previous read and,
205
255
  // - num_file_reads_ + 1 (including this read) >
206
- // kMinNumFileReadsToStartAutoReadahead
256
+ // num_file_reads_for_auto_readahead_
257
+ size_t curr_size = bufs_[curr_].async_read_in_progress_
258
+ ? bufs_[curr_].async_req_len_
259
+ : bufs_[curr_].buffer_.CurrentSize();
207
260
  if (implicit_auto_readahead_ && readahead_size_ > 0) {
208
- if ((offset + size >
209
- bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) &&
261
+ if ((offset + size > bufs_[curr_].offset_ + curr_size) &&
210
262
  IsBlockSequential(offset) &&
211
- (num_file_reads_ + 1 > kMinNumFileReadsToStartAutoReadahead)) {
263
+ (num_file_reads_ + 1 > num_file_reads_for_auto_readahead_)) {
212
264
  readahead_size_ =
213
265
  std::max(initial_auto_readahead_size_,
214
266
  (readahead_size_ >= value ? readahead_size_ - value : 0));
@@ -224,8 +276,14 @@ class FilePrefetchBuffer {
224
276
  // and data present in buffer_. It also allocates new buffer or refit tail if
225
277
  // required.
226
278
  void CalculateOffsetAndLen(size_t alignment, uint64_t offset,
227
- size_t roundup_len, size_t index, bool refit_tail,
228
- uint64_t& chunk_len);
279
+ size_t roundup_len, uint32_t index,
280
+ bool refit_tail, uint64_t& chunk_len);
281
+
282
+ void AbortIOIfNeeded(uint64_t offset);
283
+
284
+ void AbortAllIOs();
285
+
286
+ void UpdateBuffersIfNeeded(uint64_t offset);
229
287
 
230
288
  // It calls Poll API if there is any pending asynchronous request. It then
231
289
  // checks if data is in any buffer. It clears the outdated data and swaps the
@@ -243,8 +301,7 @@ class FilePrefetchBuffer {
243
301
  uint64_t chunk_len, uint64_t rounddown_start, uint32_t index);
244
302
 
245
303
  Status ReadAsync(const IOOptions& opts, RandomAccessFileReader* reader,
246
- uint64_t read_len, uint64_t chunk_len,
247
- uint64_t rounddown_start, uint32_t index);
304
+ uint64_t read_len, uint64_t rounddown_start, uint32_t index);
248
305
 
249
306
  // Copy the data from src to third buffer.
250
307
  void CopyDataToBuffer(uint32_t src, uint64_t& offset, size_t& length);
@@ -273,25 +330,60 @@ class FilePrefetchBuffer {
273
330
  // Since async request was submitted in last call directly by calling
274
331
  // PrefetchAsync, it skips num_file_reads_ check as this call is to poll the
275
332
  // data submitted in previous call.
276
- if (async_request_submitted_) {
333
+ if (explicit_prefetch_submitted_) {
277
334
  return true;
278
335
  }
279
- if (num_file_reads_ <= kMinNumFileReadsToStartAutoReadahead) {
336
+ if (num_file_reads_ <= num_file_reads_for_auto_readahead_) {
280
337
  UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
281
338
  return false;
282
339
  }
283
340
  return true;
284
341
  }
285
342
 
343
+ // Helper functions.
344
+ bool IsDataBlockInBuffer(uint64_t offset, size_t length, uint32_t index) {
345
+ return (offset >= bufs_[index].offset_ &&
346
+ offset + length <=
347
+ bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
348
+ }
349
+ bool IsOffsetInBuffer(uint64_t offset, uint32_t index) {
350
+ return (offset >= bufs_[index].offset_ &&
351
+ offset < bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
352
+ }
353
+ bool DoesBufferContainData(uint32_t index) {
354
+ return bufs_[index].buffer_.CurrentSize() > 0;
355
+ }
356
+ bool IsBufferOutdated(uint64_t offset, uint32_t index) {
357
+ return (
358
+ !bufs_[index].async_read_in_progress_ && DoesBufferContainData(index) &&
359
+ offset >= bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
360
+ }
361
+ bool IsBufferOutdatedWithAsyncProgress(uint64_t offset, uint32_t index) {
362
+ return (bufs_[index].async_read_in_progress_ &&
363
+ bufs_[index].io_handle_ != nullptr &&
364
+ offset >= bufs_[index].offset_ + bufs_[index].async_req_len_);
365
+ }
366
+
367
+ void DestroyAndClearIOHandle(uint32_t index) {
368
+ if (bufs_[index].io_handle_ != nullptr && bufs_[index].del_fn_ != nullptr) {
369
+ bufs_[index].del_fn_(bufs_[index].io_handle_);
370
+ bufs_[index].io_handle_ = nullptr;
371
+ bufs_[index].del_fn_ = nullptr;
372
+ }
373
+ bufs_[index].async_read_in_progress_ = false;
374
+ }
375
+
286
376
  std::vector<BufferInfo> bufs_;
287
377
  // curr_ represents the index for bufs_ indicating which buffer is being
288
378
  // consumed currently.
289
379
  uint32_t curr_;
380
+
290
381
  size_t readahead_size_;
291
382
  size_t initial_auto_readahead_size_;
292
383
  // FilePrefetchBuffer object won't be created from Iterator flow if
293
384
  // max_readahead_size_ = 0.
294
385
  size_t max_readahead_size_;
386
+
295
387
  // The minimum `offset` ever passed to TryReadFromCache().
296
388
  size_t min_offset_read_;
297
389
  // if false, TryReadFromCache() always return false, and we only take stats
@@ -306,20 +398,16 @@ class FilePrefetchBuffer {
306
398
  bool implicit_auto_readahead_;
307
399
  uint64_t prev_offset_;
308
400
  size_t prev_len_;
309
- // num_file_reads_ is only used when implicit_auto_readahead_ is set.
401
+ // num_file_reads_ and num_file_reads_for_auto_readahead_ are only used when
402
+ // implicit_auto_readahead_ is set.
403
+ uint64_t num_file_reads_for_auto_readahead_;
310
404
  uint64_t num_file_reads_;
311
405
 
312
- // io_handle_ is allocated and used by underlying file system in case of
313
- // asynchronous reads.
314
- void* io_handle_;
315
- IOHandleDeleter del_fn_;
316
- bool async_read_in_progress_;
317
-
318
- // If async_request_submitted_ is set then it indicates RocksDB called
319
- // PrefetchAsync to submit request. It needs to TryReadFromCacheAsync to poll
320
- // the submitted request without checking if data is sequential and
406
+ // If explicit_prefetch_submitted_ is set then it indicates RocksDB called
407
+ // PrefetchAsync to submit request. It needs to call TryReadFromCacheAsync to
408
+ // poll the submitted request without checking if data is sequential and
321
409
  // num_file_reads_.
322
- bool async_request_submitted_;
410
+ bool explicit_prefetch_submitted_;
323
411
 
324
412
  FileSystem* fs_;
325
413
  SystemClock* clock_;
@@ -388,7 +388,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
388
388
 
389
389
  IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
390
390
  uint64_t descriptor_number,
391
- FSDirectory* directory_to_fsync) {
391
+ FSDirectory* dir_contains_current_file) {
392
392
  // Remove leading "dbname/" and add newline to manifest file name
393
393
  std::string manifest = DescriptorFileName(dbname, descriptor_number);
394
394
  Slice contents = manifest;
@@ -404,8 +404,8 @@ IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
404
404
  TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s);
405
405
  }
406
406
  if (s.ok()) {
407
- if (directory_to_fsync != nullptr) {
408
- s = directory_to_fsync->FsyncWithDirOptions(
407
+ if (dir_contains_current_file != nullptr) {
408
+ s = dir_contains_current_file->FsyncWithDirOptions(
409
409
  IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname)));
410
410
  }
411
411
  } else {
@@ -160,10 +160,12 @@ extern bool ParseFileName(const std::string& filename, uint64_t* number,
160
160
  FileType* type, WalFileType* log_type = nullptr);
161
161
 
162
162
  // Make the CURRENT file point to the descriptor file with the
163
- // specified number.
163
+ // specified number. On its success and when dir_contains_current_file is not
164
+ // nullptr, the function will fsync the directory containing the CURRENT file
165
+ // when the CURRENT file has been successfully updated.
164
166
  extern IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
165
167
  uint64_t descriptor_number,
166
- FSDirectory* directory_to_fsync);
168
+ FSDirectory* dir_contains_current_file);
167
169
 
168
170
  // Make the IDENTITY file for the db
169
171
  extern Status SetIdentityFile(Env* env, const std::string& dbname,