@nxtedition/rocksdb 8.1.17 → 8.2.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/binding.cc +32 -2
  2. package/binding.gyp +8 -0
  3. package/deps/liburing/liburing.gyp +20 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
  5. package/deps/rocksdb/rocksdb/TARGETS +7 -0
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
  8. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
  9. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
  10. package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
  11. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
  12. package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
  18. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
  19. package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
  20. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
  21. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
  22. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
  23. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
  24. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
  25. package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
  26. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
  29. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
  30. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
  31. package/deps/rocksdb/rocksdb/db/c.cc +90 -1
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
  33. package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
  34. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
  42. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
  43. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
  44. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
  45. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
  52. package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
  53. package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
  54. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
  55. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
  56. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
  57. package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
  58. package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
  59. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
  60. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
  61. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
  62. package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
  63. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  64. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
  65. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
  66. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
  67. package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
  68. package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
  69. package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
  70. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  71. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
  72. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
  73. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
  74. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
  75. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
  76. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
  78. package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
  79. package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
  80. package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
  81. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
  82. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
  89. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
  91. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
  92. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
  93. package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
  94. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
  95. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
  96. package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
  97. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
  98. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
  99. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
  100. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
  101. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
  102. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
  103. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
  104. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
  105. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
  106. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
  108. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
  109. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
  110. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  113. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
  114. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
  115. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
  116. package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
  117. package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
  118. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
  119. package/deps/rocksdb/rocksdb/src.mk +4 -0
  120. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
  121. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
  122. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
  123. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
  124. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
  125. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
  126. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
  127. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
  128. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
  129. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
  131. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
  132. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
  133. package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
  134. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
  135. package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
  136. package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
  137. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
  138. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
  139. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
  140. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
  141. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
  142. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
  143. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
  144. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
  145. package/deps/rocksdb/rocksdb.gyp +7 -1
  146. package/package.json +1 -1
  147. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -101,12 +101,9 @@ class NonBatchedOpsStressTest : public StressTest {
101
101
  if (diff > 0) {
102
102
  s = Status::NotFound();
103
103
  } else if (diff == 0) {
104
- const WideColumns expected_columns = GenerateExpectedWideColumns(
105
- GetValueBase(iter->value()), iter->value());
106
- if (iter->columns() != expected_columns) {
104
+ if (!VerifyWideColumns(iter->value(), iter->columns())) {
107
105
  VerificationAbort(shared, static_cast<int>(cf), i,
108
- iter->value(), iter->columns(),
109
- expected_columns);
106
+ iter->value(), iter->columns());
110
107
  }
111
108
 
112
109
  from_db = iter->value().ToString();
@@ -159,26 +156,24 @@ class NonBatchedOpsStressTest : public StressTest {
159
156
  }
160
157
 
161
158
  const std::string key = Key(i);
162
- PinnableWideColumns columns;
159
+ PinnableWideColumns result;
163
160
 
164
161
  Status s =
165
- db_->GetEntity(options, column_families_[cf], key, &columns);
162
+ db_->GetEntity(options, column_families_[cf], key, &result);
166
163
 
167
164
  std::string from_db;
168
165
 
169
166
  if (s.ok()) {
170
- const WideColumns& columns_from_db = columns.columns();
167
+ const WideColumns& columns = result.columns();
171
168
 
172
- if (!columns_from_db.empty() &&
173
- columns_from_db[0].name() == kDefaultWideColumnName) {
174
- from_db = columns_from_db[0].value().ToString();
169
+ if (!columns.empty() &&
170
+ columns.front().name() == kDefaultWideColumnName) {
171
+ from_db = columns.front().value().ToString();
175
172
  }
176
173
 
177
- const WideColumns expected_columns =
178
- GenerateExpectedWideColumns(GetValueBase(from_db), from_db);
179
- if (columns_from_db != expected_columns) {
174
+ if (!VerifyWideColumns(columns)) {
180
175
  VerificationAbort(shared, static_cast<int>(cf), i, from_db,
181
- columns_from_db, expected_columns);
176
+ columns);
182
177
  }
183
178
  }
184
179
 
@@ -256,18 +251,16 @@ class NonBatchedOpsStressTest : public StressTest {
256
251
  std::string from_db;
257
252
 
258
253
  if (statuses[j].ok()) {
259
- const WideColumns& columns_from_db = results[j].columns();
254
+ const WideColumns& columns = results[j].columns();
260
255
 
261
- if (!columns_from_db.empty() &&
262
- columns_from_db[0].name() == kDefaultWideColumnName) {
263
- from_db = columns_from_db[0].value().ToString();
256
+ if (!columns.empty() &&
257
+ columns.front().name() == kDefaultWideColumnName) {
258
+ from_db = columns.front().value().ToString();
264
259
  }
265
260
 
266
- const WideColumns expected_columns =
267
- GenerateExpectedWideColumns(GetValueBase(from_db), from_db);
268
- if (columns_from_db != expected_columns) {
261
+ if (!VerifyWideColumns(columns)) {
269
262
  VerificationAbort(shared, static_cast<int>(cf), i, from_db,
270
- columns_from_db, expected_columns);
263
+ columns);
271
264
  }
272
265
  }
273
266
 
@@ -492,6 +485,11 @@ class NonBatchedOpsStressTest : public StressTest {
492
485
  ReadOptions read_opts_copy = read_opts;
493
486
  std::string read_ts_str;
494
487
  Slice read_ts_slice;
488
+ if (FLAGS_user_timestamp_size > 0) {
489
+ read_ts_str = GetNowNanos();
490
+ read_ts_slice = read_ts_str;
491
+ read_opts_copy.timestamp = &read_ts_slice;
492
+ }
495
493
  bool read_older_ts = MaybeUseOlderTimestampForPointLookup(
496
494
  thread, read_ts_str, read_ts_slice, read_opts_copy);
497
495
 
@@ -514,7 +512,7 @@ class NonBatchedOpsStressTest : public StressTest {
514
512
  // found case
515
513
  thread->stats.AddGets(1, 1);
516
514
  // we only have the latest expected state
517
- if (!FLAGS_skip_verifydb && !read_opts_copy.timestamp &&
515
+ if (!FLAGS_skip_verifydb && !read_older_ts &&
518
516
  thread->shared->Get(rand_column_families[0], rand_keys[0]) ==
519
517
  SharedState::DELETION_SENTINEL) {
520
518
  thread->shared->SetVerificationFailure();
@@ -751,6 +749,104 @@ class NonBatchedOpsStressTest : public StressTest {
751
749
  return statuses;
752
750
  }
753
751
 
752
+ void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts,
753
+ const std::vector<int>& rand_column_families,
754
+ const std::vector<int64_t>& rand_keys) override {
755
+ if (fault_fs_guard) {
756
+ fault_fs_guard->EnableErrorInjection();
757
+ SharedState::ignore_read_error = false;
758
+ }
759
+
760
+ assert(thread);
761
+
762
+ SharedState* const shared = thread->shared;
763
+ assert(shared);
764
+
765
+ assert(!rand_column_families.empty());
766
+ assert(!rand_keys.empty());
767
+
768
+ std::unique_ptr<MutexLock> lock(new MutexLock(
769
+ shared->GetMutexForKey(rand_column_families[0], rand_keys[0])));
770
+
771
+ assert(rand_column_families[0] >= 0);
772
+ assert(rand_column_families[0] < static_cast<int>(column_families_.size()));
773
+
774
+ ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]];
775
+ assert(cfh);
776
+
777
+ const std::string key = Key(rand_keys[0]);
778
+
779
+ PinnableWideColumns from_db;
780
+
781
+ const Status s = db_->GetEntity(read_opts, cfh, key, &from_db);
782
+
783
+ int error_count = 0;
784
+
785
+ if (fault_fs_guard) {
786
+ error_count = fault_fs_guard->GetAndResetErrorCount();
787
+ }
788
+
789
+ if (s.ok()) {
790
+ if (fault_fs_guard) {
791
+ if (error_count && !SharedState::ignore_read_error) {
792
+ // Grab mutex so multiple threads don't try to print the
793
+ // stack trace at the same time
794
+ MutexLock l(shared->GetMutex());
795
+ fprintf(stderr, "Didn't get expected error from GetEntity\n");
796
+ fprintf(stderr, "Call stack that injected the fault\n");
797
+ fault_fs_guard->PrintFaultBacktrace();
798
+ std::terminate();
799
+ }
800
+ }
801
+
802
+ thread->stats.AddGets(1, 1);
803
+
804
+ if (!FLAGS_skip_verifydb) {
805
+ const WideColumns& columns = from_db.columns();
806
+
807
+ if (!VerifyWideColumns(columns)) {
808
+ shared->SetVerificationFailure();
809
+ fprintf(stderr,
810
+ "error : inconsistent columns returned by GetEntity for key "
811
+ "%s: %s\n",
812
+ StringToHex(key).c_str(), WideColumnsToHex(columns).c_str());
813
+ } else if (shared->Get(rand_column_families[0], rand_keys[0]) ==
814
+ SharedState::DELETION_SENTINEL) {
815
+ shared->SetVerificationFailure();
816
+ fprintf(
817
+ stderr,
818
+ "error : inconsistent values for key %s: GetEntity returns %s, "
819
+ "expected state does not have the key.\n",
820
+ StringToHex(key).c_str(), WideColumnsToHex(columns).c_str());
821
+ }
822
+ }
823
+ } else if (s.IsNotFound()) {
824
+ thread->stats.AddGets(1, 0);
825
+
826
+ if (!FLAGS_skip_verifydb) {
827
+ auto expected = shared->Get(rand_column_families[0], rand_keys[0]);
828
+ if (expected != SharedState::DELETION_SENTINEL &&
829
+ expected != SharedState::UNKNOWN_SENTINEL) {
830
+ shared->SetVerificationFailure();
831
+ fprintf(stderr,
832
+ "error : inconsistent values for key %s: expected state has "
833
+ "the key, GetEntity returns NotFound.\n",
834
+ StringToHex(key).c_str());
835
+ }
836
+ }
837
+ } else {
838
+ if (error_count == 0) {
839
+ thread->stats.AddErrors(1);
840
+ } else {
841
+ thread->stats.AddVerifiedErrors(1);
842
+ }
843
+ }
844
+
845
+ if (fault_fs_guard) {
846
+ fault_fs_guard->DisableErrorInjection();
847
+ }
848
+ }
849
+
754
850
  Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts,
755
851
  const std::vector<int>& rand_column_families,
756
852
  const std::vector<int64_t>& rand_keys) override {
@@ -805,12 +901,9 @@ class NonBatchedOpsStressTest : public StressTest {
805
901
  }
806
902
  }
807
903
 
808
- const WideColumns expected_columns = GenerateExpectedWideColumns(
809
- GetValueBase(iter->value()), iter->value());
810
- if (iter->columns() != expected_columns) {
811
- s = Status::Corruption(
812
- "Value and columns inconsistent",
813
- DebugString(iter->value(), iter->columns(), expected_columns));
904
+ if (!VerifyWideColumns(iter->value(), iter->columns())) {
905
+ s = Status::Corruption("Value and columns inconsistent",
906
+ DebugString(iter->value(), iter->columns()));
814
907
  break;
815
908
  }
816
909
  }
@@ -1263,17 +1356,15 @@ class NonBatchedOpsStressTest : public StressTest {
1263
1356
  assert(iter);
1264
1357
  assert(iter->Valid());
1265
1358
 
1266
- const WideColumns expected_columns = GenerateExpectedWideColumns(
1267
- GetValueBase(iter->value()), iter->value());
1268
- if (iter->columns() != expected_columns) {
1359
+ if (!VerifyWideColumns(iter->value(), iter->columns())) {
1269
1360
  shared->SetVerificationFailure();
1270
1361
 
1271
1362
  fprintf(stderr,
1272
1363
  "Verification failed for key %s: "
1273
- "Value and columns inconsistent: %s\n",
1364
+ "Value and columns inconsistent: value: %s, columns: %s\n",
1274
1365
  Slice(iter->key()).ToString(/* hex */ true).c_str(),
1275
- DebugString(iter->value(), iter->columns(), expected_columns)
1276
- .c_str());
1366
+ iter->value().ToString(/* hex */ true).c_str(),
1367
+ WideColumnsToHex(iter->columns()).c_str());
1277
1368
  fprintf(stderr, "Column family: %s, op_logs: %s\n",
1278
1369
  cfh->GetName().c_str(), op_logs.c_str());
1279
1370
 
@@ -1183,6 +1183,14 @@ class PosixFileSystem : public FileSystem {
1183
1183
  #endif
1184
1184
  }
1185
1185
 
1186
+ bool use_async_io() override {
1187
+ #if defined(ROCKSDB_IOURING_PRESENT)
1188
+ return IsIOUringEnabled();
1189
+ #else
1190
+ return false;
1191
+ #endif
1192
+ }
1193
+
1186
1194
  #if defined(ROCKSDB_IOURING_PRESENT)
1187
1195
  // io_uring instance
1188
1196
  std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
@@ -162,6 +162,9 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts,
162
162
 
163
163
  Status s = Read(opts, reader, rate_limiter_priority, read_len, chunk_len,
164
164
  rounddown_offset, curr_);
165
+ if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) {
166
+ RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len);
167
+ }
165
168
  return s;
166
169
  }
167
170
 
@@ -609,6 +612,22 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts,
609
612
  Slice* result, Status* status,
610
613
  Env::IOPriority rate_limiter_priority,
611
614
  bool for_compaction /* = false */) {
615
+ bool ret = TryReadFromCacheUntracked(opts, reader, offset, n, result, status,
616
+ rate_limiter_priority, for_compaction);
617
+ if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) {
618
+ if (ret) {
619
+ RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT);
620
+ } else {
621
+ RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS);
622
+ }
623
+ }
624
+ return ret;
625
+ }
626
+
627
+ bool FilePrefetchBuffer::TryReadFromCacheUntracked(
628
+ const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
629
+ size_t n, Slice* result, Status* status,
630
+ Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */) {
612
631
  if (track_min_offset_ && offset < min_offset_read_) {
613
632
  min_offset_read_ = static_cast<size_t>(offset);
614
633
  }
@@ -666,6 +685,22 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
666
685
  const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
667
686
  size_t n, Slice* result, Status* status,
668
687
  Env::IOPriority rate_limiter_priority) {
688
+ bool ret = TryReadFromCacheAsyncUntracked(opts, reader, offset, n, result,
689
+ status, rate_limiter_priority);
690
+ if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) {
691
+ if (ret) {
692
+ RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT);
693
+ } else {
694
+ RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS);
695
+ }
696
+ }
697
+ return ret;
698
+ }
699
+
700
+ bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked(
701
+ const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
702
+ size_t n, Slice* result, Status* status,
703
+ Env::IOPriority rate_limiter_priority) {
669
704
  if (track_min_offset_ && offset < min_offset_read_) {
670
705
  min_offset_read_ = static_cast<size_t>(offset);
671
706
  }
@@ -54,6 +54,11 @@ struct BufferInfo {
54
54
  uint32_t pos_ = 0;
55
55
  };
56
56
 
57
+ enum class FilePrefetchBufferUsage {
58
+ kTableOpenPrefetchTail,
59
+ kUnknown,
60
+ };
61
+
57
62
  // FilePrefetchBuffer is a smart buffer to store and read data from a file.
58
63
  class FilePrefetchBuffer {
59
64
  public:
@@ -78,13 +83,13 @@ class FilePrefetchBuffer {
78
83
  // and max_readahead_size are passed in.
79
84
  // A user can construct a FilePrefetchBuffer without any arguments, but use
80
85
  // `Prefetch` to load data into the buffer.
81
- FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0,
82
- bool enable = true, bool track_min_offset = false,
83
- bool implicit_auto_readahead = false,
84
- uint64_t num_file_reads = 0,
85
- uint64_t num_file_reads_for_auto_readahead = 0,
86
- FileSystem* fs = nullptr, SystemClock* clock = nullptr,
87
- Statistics* stats = nullptr)
86
+ FilePrefetchBuffer(
87
+ size_t readahead_size = 0, size_t max_readahead_size = 0,
88
+ bool enable = true, bool track_min_offset = false,
89
+ bool implicit_auto_readahead = false, uint64_t num_file_reads = 0,
90
+ uint64_t num_file_reads_for_auto_readahead = 0, FileSystem* fs = nullptr,
91
+ SystemClock* clock = nullptr, Statistics* stats = nullptr,
92
+ FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown)
88
93
  : curr_(0),
89
94
  readahead_size_(readahead_size),
90
95
  initial_auto_readahead_size_(readahead_size),
@@ -100,7 +105,8 @@ class FilePrefetchBuffer {
100
105
  explicit_prefetch_submitted_(false),
101
106
  fs_(fs),
102
107
  clock_(clock),
103
- stats_(stats) {
108
+ stats_(stats),
109
+ usage_(usage) {
104
110
  assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) ||
105
111
  (num_file_reads_ == 0));
106
112
  // If ReadOptions.async_io is enabled, data is asynchronously filled in
@@ -403,6 +409,19 @@ class FilePrefetchBuffer {
403
409
  bool& copy_to_third_buffer, uint64_t& tmp_offset,
404
410
  size_t& tmp_length);
405
411
 
412
+ bool TryReadFromCacheUntracked(const IOOptions& opts,
413
+ RandomAccessFileReader* reader,
414
+ uint64_t offset, size_t n, Slice* result,
415
+ Status* s,
416
+ Env::IOPriority rate_limiter_priority,
417
+ bool for_compaction = false);
418
+
419
+ bool TryReadFromCacheAsyncUntracked(const IOOptions& opts,
420
+ RandomAccessFileReader* reader,
421
+ uint64_t offset, size_t n, Slice* result,
422
+ Status* status,
423
+ Env::IOPriority rate_limiter_priority);
424
+
406
425
  std::vector<BufferInfo> bufs_;
407
426
  // curr_ represents the index for bufs_ indicating which buffer is being
408
427
  // consumed currently.
@@ -442,5 +461,7 @@ class FilePrefetchBuffer {
442
461
  FileSystem* fs_;
443
462
  SystemClock* clock_;
444
463
  Statistics* stats_;
464
+
465
+ FilePrefetchBufferUsage usage_;
445
466
  };
446
467
  } // namespace ROCKSDB_NAMESPACE
@@ -135,7 +135,7 @@ IOStatus GenerateOneFileChecksum(
135
135
  FileChecksumGenFactory* checksum_factory,
136
136
  const std::string& requested_checksum_func_name, std::string* file_checksum,
137
137
  std::string* file_checksum_func_name,
138
- size_t verify_checksums_readahead_size, bool allow_mmap_reads,
138
+ size_t verify_checksums_readahead_size, bool /*allow_mmap_reads*/,
139
139
  std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
140
140
  Env::IOPriority rate_limiter_priority) {
141
141
  if (checksum_factory == nullptr) {
@@ -196,10 +196,12 @@ IOStatus GenerateOneFileChecksum(
196
196
  size_t readahead_size = (verify_checksums_readahead_size != 0)
197
197
  ? verify_checksums_readahead_size
198
198
  : default_max_read_ahead_size;
199
-
200
- FilePrefetchBuffer prefetch_buffer(readahead_size /* readahead_size */,
201
- readahead_size /* max_readahead_size */,
202
- !allow_mmap_reads /* enable */);
199
+ std::unique_ptr<char[]> buf;
200
+ if (reader->use_direct_io()) {
201
+ size_t alignment = reader->file()->GetRequiredBufferAlignment();
202
+ readahead_size = (readahead_size + alignment - 1) & ~(alignment - 1);
203
+ }
204
+ buf.reset(new char[readahead_size]);
203
205
 
204
206
  Slice slice;
205
207
  uint64_t offset = 0;
@@ -207,11 +209,11 @@ IOStatus GenerateOneFileChecksum(
207
209
  while (size > 0) {
208
210
  size_t bytes_to_read =
209
211
  static_cast<size_t>(std::min(uint64_t{readahead_size}, size));
210
- if (!prefetch_buffer.TryReadFromCache(
211
- opts, reader.get(), offset, bytes_to_read, &slice,
212
- nullptr /* status */, rate_limiter_priority,
213
- false /* for_compaction */)) {
214
- return IOStatus::Corruption("file read failed");
212
+ io_s = reader->Read(opts, offset, bytes_to_read, &slice, buf.get(), nullptr,
213
+ rate_limiter_priority);
214
+ if (!io_s.ok()) {
215
+ return IOStatus::Corruption("file read failed with error: " +
216
+ io_s.ToString());
215
217
  }
216
218
  if (slice.size() == 0) {
217
219
  return IOStatus::Corruption("file too small");
@@ -219,6 +221,8 @@ IOStatus GenerateOneFileChecksum(
219
221
  checksum_generator->Update(slice.data(), slice.size());
220
222
  size -= slice.size();
221
223
  offset += slice.size();
224
+
225
+ TEST_SYNC_POINT("GenerateOneFileChecksum::Chunk:0");
222
226
  }
223
227
  checksum_generator->Finalize();
224
228
  *file_checksum = checksum_generator->GetChecksum();