@nxtedition/rocksdb 8.2.0-alpha.1 → 8.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. package/binding.cc +11 -74
  2. package/binding.gyp +7 -5
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
  4. package/deps/rocksdb/rocksdb/TARGETS +7 -0
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
  8. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
  9. package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
  10. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
  11. package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
  12. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
  14. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
  17. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
  18. package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
  19. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
  20. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
  21. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
  22. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
  23. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
  24. package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
  25. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
  26. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
  29. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
  30. package/deps/rocksdb/rocksdb/db/c.cc +90 -1
  31. package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
  32. package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
  33. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
  40. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
  41. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
  42. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
  43. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
  44. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
  51. package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
  52. package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
  53. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
  54. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
  55. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
  56. package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
  57. package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
  58. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
  59. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
  60. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
  61. package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
  62. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  63. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
  64. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
  65. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
  66. package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
  67. package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
  68. package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
  69. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  70. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
  71. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
  72. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
  73. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
  74. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
  75. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
  76. package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
  77. package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
  78. package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
  79. package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
  80. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
  81. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
  88. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
  89. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
  91. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
  92. package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
  93. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
  94. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
  95. package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
  96. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
  97. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
  98. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
  99. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
  100. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
  101. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
  102. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
  103. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
  104. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
  105. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
  106. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
  107. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
  108. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
  109. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
  110. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  112. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
  113. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
  114. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
  115. package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
  116. package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
  117. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
  118. package/deps/rocksdb/rocksdb/src.mk +4 -0
  119. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
  120. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
  121. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
  122. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
  123. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
  124. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
  125. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
  126. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
  127. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
  128. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  129. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
  131. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
  132. package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
  133. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
  134. package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
  135. package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
  136. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
  137. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
  138. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
  139. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
  140. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
  141. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
  142. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
  143. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
  144. package/deps/rocksdb/rocksdb.gyp +6 -7
  145. package/index.js +0 -6
  146. package/package.json +1 -1
  147. package/prebuilds/linux-x64/node.napi.node +0 -0
  148. package/deps/liburing/liburing.gyp +0 -20
  149. package/tmp/test.js +0 -7
@@ -13,6 +13,11 @@
13
13
  #endif
14
14
  #include "util/random.h"
15
15
 
16
+ namespace {
17
+ static bool enable_io_uring = true;
18
+ extern "C" bool RocksDbIOUringEnable() { return enable_io_uring; }
19
+ } // namespace
20
+
16
21
  namespace ROCKSDB_NAMESPACE {
17
22
 
18
23
  class MockFS;
@@ -125,6 +130,7 @@ TEST_P(PrefetchTest, Basic) {
125
130
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
126
131
  Options options;
127
132
  SetGenericOptions(env.get(), use_direct_io, options);
133
+ options.statistics = CreateDBStatistics();
128
134
 
129
135
  const int kNumKeys = 1100;
130
136
  int buff_prefetch_count = 0;
@@ -167,9 +173,25 @@ TEST_P(PrefetchTest, Basic) {
167
173
  Slice least(start_key.data(), start_key.size());
168
174
  Slice greatest(end_key.data(), end_key.size());
169
175
 
176
+ HistogramData prev_table_open_prefetch_tail_read;
177
+ options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
178
+ &prev_table_open_prefetch_tail_read);
179
+ const uint64_t prev_table_open_prefetch_tail_miss =
180
+ options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS);
181
+ const uint64_t prev_table_open_prefetch_tail_hit =
182
+ options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT);
183
+
170
184
  // commenting out the line below causes the example to work correctly
171
185
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
172
186
 
187
+ HistogramData cur_table_open_prefetch_tail_read;
188
+ options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
189
+ &cur_table_open_prefetch_tail_read);
190
+ const uint64_t cur_table_open_prefetch_tail_miss =
191
+ options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS);
192
+ const uint64_t cur_table_open_prefetch_tail_hit =
193
+ options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT);
194
+
173
195
  if (support_prefetch && !use_direct_io) {
174
196
  // If underlying file system supports prefetch, and directIO is not enabled
175
197
  // make sure prefetch() is called and FilePrefetchBuffer is not used.
@@ -182,6 +204,12 @@ TEST_P(PrefetchTest, Basic) {
182
204
  // used.
183
205
  ASSERT_FALSE(fs->IsPrefetchCalled());
184
206
  ASSERT_GT(buff_prefetch_count, 0);
207
+ ASSERT_GT(cur_table_open_prefetch_tail_read.count,
208
+ prev_table_open_prefetch_tail_read.count);
209
+ ASSERT_GT(cur_table_open_prefetch_tail_hit,
210
+ prev_table_open_prefetch_tail_hit);
211
+ ASSERT_GE(cur_table_open_prefetch_tail_miss,
212
+ prev_table_open_prefetch_tail_miss);
185
213
  buff_prefetch_count = 0;
186
214
  }
187
215
 
@@ -1156,6 +1184,104 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) {
1156
1184
  Close();
1157
1185
  }
1158
1186
 
1187
+ TEST_P(PrefetchTest, DBIterAsyncIONoIOUring) {
1188
+ if (mem_env_ || encrypted_env_) {
1189
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
1190
+ return;
1191
+ }
1192
+
1193
+ const int kNumKeys = 1000;
1194
+ // Set options
1195
+ bool use_direct_io = std::get<0>(GetParam());
1196
+ bool is_adaptive_readahead = std::get<1>(GetParam());
1197
+
1198
+ Options options;
1199
+ SetGenericOptions(Env::Default(), use_direct_io, options);
1200
+ options.statistics = CreateDBStatistics();
1201
+ BlockBasedTableOptions table_options;
1202
+ SetBlockBasedTableOptions(table_options);
1203
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
1204
+
1205
+ enable_io_uring = false;
1206
+ Status s = TryReopen(options);
1207
+ if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
1208
+ // If direct IO is not supported, skip the test
1209
+ return;
1210
+ } else {
1211
+ ASSERT_OK(s);
1212
+ }
1213
+
1214
+ WriteBatch batch;
1215
+ Random rnd(309);
1216
+ int total_keys = 0;
1217
+ for (int j = 0; j < 5; j++) {
1218
+ for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
1219
+ ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
1220
+ total_keys++;
1221
+ }
1222
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
1223
+ ASSERT_OK(Flush());
1224
+ }
1225
+ MoveFilesToLevel(2);
1226
+
1227
+ // Test - Iterate over the keys sequentially.
1228
+ {
1229
+ ReadOptions ro;
1230
+ if (is_adaptive_readahead) {
1231
+ ro.adaptive_readahead = true;
1232
+ }
1233
+ ro.async_io = true;
1234
+
1235
+ ASSERT_OK(options.statistics->Reset());
1236
+
1237
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
1238
+ int num_keys = 0;
1239
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
1240
+ ASSERT_OK(iter->status());
1241
+ num_keys++;
1242
+ }
1243
+ ASSERT_EQ(num_keys, total_keys);
1244
+
1245
+ // Check stats to make sure no async prefetch was done.
1246
+ {
1247
+ HistogramData async_read_bytes;
1248
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
1249
+ ASSERT_EQ(async_read_bytes.count, 0);
1250
+ ASSERT_EQ(options.statistics->getTickerCount(READ_ASYNC_MICROS), 0);
1251
+ }
1252
+ }
1253
+
1254
+ {
1255
+ ReadOptions ro;
1256
+ if (is_adaptive_readahead) {
1257
+ ro.adaptive_readahead = true;
1258
+ }
1259
+ ro.async_io = true;
1260
+ ro.tailing = true;
1261
+
1262
+ ASSERT_OK(options.statistics->Reset());
1263
+
1264
+ auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
1265
+ int num_keys = 0;
1266
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
1267
+ ASSERT_OK(iter->status());
1268
+ num_keys++;
1269
+ }
1270
+ ASSERT_EQ(num_keys, total_keys);
1271
+
1272
+ // Check stats to make sure no async prefetch was done.
1273
+ {
1274
+ HistogramData async_read_bytes;
1275
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
1276
+ ASSERT_EQ(async_read_bytes.count, 0);
1277
+ ASSERT_EQ(options.statistics->getTickerCount(READ_ASYNC_MICROS), 0);
1278
+ }
1279
+ }
1280
+ Close();
1281
+
1282
+ enable_io_uring = true;
1283
+ }
1284
+
1159
1285
  class PrefetchTest1 : public DBTestBase,
1160
1286
  public ::testing::WithParamInterface<bool> {
1161
1287
  public:
@@ -1504,8 +1630,6 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) {
1504
1630
  Close();
1505
1631
  }
1506
1632
 
1507
- extern "C" bool RocksDbIOUringEnable() { return true; }
1508
-
1509
1633
  namespace {
1510
1634
  #ifdef GFLAGS
1511
1635
  const int kMaxArgCount = 100;
@@ -1624,7 +1748,8 @@ TEST_P(PrefetchTest, ReadAsyncWithPosixFS) {
1624
1748
  } else {
1625
1749
  // Not all platforms support iouring. In that case, ReadAsync in posix
1626
1750
  // won't submit async requests.
1627
- ASSERT_EQ(iter->status(), Status::NotSupported());
1751
+ ASSERT_EQ(num_keys, total_keys);
1752
+ ASSERT_EQ(buff_prefetch_count, 0);
1628
1753
  }
1629
1754
  }
1630
1755
 
@@ -1737,18 +1862,19 @@ TEST_P(PrefetchTest, MultipleSeekWithPosixFS) {
1737
1862
  iter->Next();
1738
1863
  }
1739
1864
 
1865
+ ASSERT_OK(iter->status());
1866
+ ASSERT_EQ(num_keys, num_keys_first_batch);
1867
+ // Check stats to make sure async prefetch is done.
1868
+ HistogramData async_read_bytes;
1869
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
1740
1870
  if (read_async_called) {
1741
- ASSERT_OK(iter->status());
1742
- ASSERT_EQ(num_keys, num_keys_first_batch);
1743
- // Check stats to make sure async prefetch is done.
1744
- HistogramData async_read_bytes;
1745
- options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
1746
1871
  ASSERT_GT(async_read_bytes.count, 0);
1747
1872
  ASSERT_GT(get_perf_context()->number_async_seek, 0);
1748
1873
  } else {
1749
1874
  // Not all platforms support iouring. In that case, ReadAsync in posix
1750
1875
  // won't submit async requests.
1751
- ASSERT_EQ(iter->status(), Status::NotSupported());
1876
+ ASSERT_EQ(async_read_bytes.count, 0);
1877
+ ASSERT_EQ(get_perf_context()->number_async_seek, 0);
1752
1878
  }
1753
1879
  }
1754
1880
 
@@ -1765,25 +1891,26 @@ TEST_P(PrefetchTest, MultipleSeekWithPosixFS) {
1765
1891
  iter->Next();
1766
1892
  }
1767
1893
 
1768
- if (read_async_called) {
1769
- ASSERT_OK(iter->status());
1770
- ASSERT_EQ(num_keys, num_keys_second_batch);
1894
+ ASSERT_OK(iter->status());
1895
+ ASSERT_EQ(num_keys, num_keys_second_batch);
1896
+ HistogramData async_read_bytes;
1897
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
1898
+ HistogramData prefetched_bytes_discarded;
1899
+ options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
1900
+ &prefetched_bytes_discarded);
1901
+ ASSERT_GT(prefetched_bytes_discarded.count, 0);
1771
1902
 
1903
+ if (read_async_called) {
1772
1904
  ASSERT_GT(buff_prefetch_count, 0);
1773
1905
 
1774
1906
  // Check stats to make sure async prefetch is done.
1775
- HistogramData async_read_bytes;
1776
- options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
1777
- HistogramData prefetched_bytes_discarded;
1778
- options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
1779
- &prefetched_bytes_discarded);
1780
1907
  ASSERT_GT(async_read_bytes.count, 0);
1781
1908
  ASSERT_GT(get_perf_context()->number_async_seek, 0);
1782
- ASSERT_GT(prefetched_bytes_discarded.count, 0);
1783
1909
  } else {
1784
1910
  // Not all platforms support iouring. In that case, ReadAsync in posix
1785
1911
  // won't submit async requests.
1786
- ASSERT_EQ(iter->status(), Status::NotSupported());
1912
+ ASSERT_EQ(async_read_bytes.count, 0);
1913
+ ASSERT_EQ(get_perf_context()->number_async_seek, 0);
1787
1914
  }
1788
1915
  }
1789
1916
  }
@@ -1862,51 +1989,44 @@ TEST_P(PrefetchTest, SeekParallelizationTestWithPosix) {
1862
1989
  // Each block contains around 4 keys.
1863
1990
  auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
1864
1991
  iter->Seek(BuildKey(0)); // Prefetch data because of seek parallelization.
1865
- if (std::get<1>(GetParam()) && !read_async_called) {
1866
- ASSERT_EQ(iter->status(), Status::NotSupported());
1867
- } else {
1868
- ASSERT_TRUE(iter->Valid());
1869
- iter->Next();
1870
- ASSERT_TRUE(iter->Valid());
1871
- iter->Next();
1872
- ASSERT_TRUE(iter->Valid());
1873
- iter->Next();
1874
- ASSERT_TRUE(iter->Valid());
1992
+ ASSERT_TRUE(iter->Valid());
1993
+ iter->Next();
1994
+ ASSERT_TRUE(iter->Valid());
1995
+ iter->Next();
1996
+ ASSERT_TRUE(iter->Valid());
1997
+ iter->Next();
1998
+ ASSERT_TRUE(iter->Valid());
1875
1999
 
1876
- // New data block. Since num_file_reads in FilePrefetch after this read is
1877
- // 2, it won't go for prefetching.
1878
- iter->Next();
1879
- ASSERT_TRUE(iter->Valid());
1880
- iter->Next();
1881
- ASSERT_TRUE(iter->Valid());
1882
- iter->Next();
1883
- ASSERT_TRUE(iter->Valid());
1884
- iter->Next();
1885
- ASSERT_TRUE(iter->Valid());
2000
+ // New data block. Since num_file_reads in FilePrefetch after this read is
2001
+ // 2, it won't go for prefetching.
2002
+ iter->Next();
2003
+ ASSERT_TRUE(iter->Valid());
2004
+ iter->Next();
2005
+ ASSERT_TRUE(iter->Valid());
2006
+ iter->Next();
2007
+ ASSERT_TRUE(iter->Valid());
2008
+ iter->Next();
2009
+ ASSERT_TRUE(iter->Valid());
1886
2010
 
1887
- // Prefetch data.
1888
- iter->Next();
2011
+ // Prefetch data.
2012
+ iter->Next();
1889
2013
 
1890
- if (read_async_called) {
1891
- ASSERT_TRUE(iter->Valid());
1892
- // Check stats to make sure async prefetch is done.
1893
- {
1894
- HistogramData async_read_bytes;
1895
- options.statistics->histogramData(ASYNC_READ_BYTES,
1896
- &async_read_bytes);
1897
- ASSERT_GT(async_read_bytes.count, 0);
1898
- ASSERT_GT(get_perf_context()->number_async_seek, 0);
1899
- if (std::get<1>(GetParam())) {
1900
- ASSERT_EQ(buff_prefetch_count, 1);
1901
- } else {
1902
- ASSERT_EQ(buff_prefetch_count, 2);
1903
- }
1904
- }
2014
+ ASSERT_TRUE(iter->Valid());
2015
+ HistogramData async_read_bytes;
2016
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
2017
+ if (read_async_called) {
2018
+ ASSERT_GT(async_read_bytes.count, 0);
2019
+ ASSERT_GT(get_perf_context()->number_async_seek, 0);
2020
+ if (std::get<1>(GetParam())) {
2021
+ ASSERT_EQ(buff_prefetch_count, 1);
1905
2022
  } else {
1906
- // Not all platforms support iouring. In that case, ReadAsync in posix
1907
- // won't submit async requests.
1908
- ASSERT_EQ(iter->status(), Status::NotSupported());
2023
+ ASSERT_EQ(buff_prefetch_count, 2);
1909
2024
  }
2025
+ } else {
2026
+ // Not all platforms support iouring. In that case, ReadAsync in posix
2027
+ // won't submit async requests.
2028
+ ASSERT_EQ(async_read_bytes.count, 0);
2029
+ ASSERT_EQ(get_perf_context()->number_async_seek, 0);
1910
2030
  }
1911
2031
  }
1912
2032
  Close();
@@ -2000,17 +2120,17 @@ TEST_P(PrefetchTest, TraceReadAsyncWithCallbackWrapper) {
2000
2120
  ASSERT_OK(db_->EndIOTrace());
2001
2121
  ASSERT_OK(env_->FileExists(trace_file_path));
2002
2122
 
2123
+ ASSERT_EQ(num_keys, total_keys);
2124
+ HistogramData async_read_bytes;
2125
+ options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
2003
2126
  if (read_async_called) {
2004
- ASSERT_EQ(num_keys, total_keys);
2005
2127
  ASSERT_GT(buff_prefetch_count, 0);
2006
2128
  // Check stats to make sure async prefetch is done.
2007
- HistogramData async_read_bytes;
2008
- options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
2009
2129
  ASSERT_GT(async_read_bytes.count, 0);
2010
2130
  } else {
2011
2131
  // Not all platforms support iouring. In that case, ReadAsync in posix
2012
2132
  // won't submit async requests.
2013
- ASSERT_EQ(iter->status(), Status::NotSupported());
2133
+ ASSERT_EQ(async_read_bytes.count, 0);
2014
2134
  }
2015
2135
 
2016
2136
  // Check the file to see if ReadAsync is logged.
@@ -20,6 +20,7 @@
20
20
  namespace ROCKSDB_NAMESPACE {
21
21
 
22
22
  class Logger;
23
+ class SecondaryCacheResultHandle;
23
24
  class Statistics;
24
25
 
25
26
  // A Cache maps keys to objects resident in memory, tracks reference counts
@@ -134,28 +135,38 @@ class Cache {
134
135
  CreateCallback create_cb;
135
136
  // Classification of the entry for monitoring purposes in block cache.
136
137
  CacheEntryRole role;
137
-
138
- constexpr CacheItemHelper()
139
- : del_cb(nullptr),
140
- size_cb(nullptr),
141
- saveto_cb(nullptr),
142
- create_cb(nullptr),
143
- role(CacheEntryRole::kMisc) {}
144
-
145
- explicit constexpr CacheItemHelper(CacheEntryRole _role,
146
- DeleterFn _del_cb = nullptr,
147
- SizeCallback _size_cb = nullptr,
148
- SaveToCallback _saveto_cb = nullptr,
149
- CreateCallback _create_cb = nullptr)
138
+ // Another CacheItemHelper (or this one) without secondary cache support.
139
+ // This is provided so that items promoted from secondary cache into
140
+ // primary cache without removal from the secondary cache can be prevented
141
+ // from attempting re-insertion into secondary cache (for efficiency).
142
+ const CacheItemHelper* without_secondary_compat;
143
+
144
+ CacheItemHelper() : CacheItemHelper(CacheEntryRole::kMisc) {}
145
+
146
+ // For helpers without SecondaryCache support
147
+ explicit CacheItemHelper(CacheEntryRole _role, DeleterFn _del_cb = nullptr)
148
+ : CacheItemHelper(_role, _del_cb, nullptr, nullptr, nullptr, this) {}
149
+
150
+ // For helpers with SecondaryCache support
151
+ explicit CacheItemHelper(CacheEntryRole _role, DeleterFn _del_cb,
152
+ SizeCallback _size_cb, SaveToCallback _saveto_cb,
153
+ CreateCallback _create_cb,
154
+ const CacheItemHelper* _without_secondary_compat)
150
155
  : del_cb(_del_cb),
151
156
  size_cb(_size_cb),
152
157
  saveto_cb(_saveto_cb),
153
158
  create_cb(_create_cb),
154
- role(_role) {
159
+ role(_role),
160
+ without_secondary_compat(_without_secondary_compat) {
155
161
  // Either all three secondary cache callbacks are non-nullptr or
156
162
  // all three are nullptr
157
163
  assert((size_cb != nullptr) == (saveto_cb != nullptr));
158
164
  assert((size_cb != nullptr) == (create_cb != nullptr));
165
+ // without_secondary_compat points to equivalent but without
166
+ // secondary support
167
+ assert(role == without_secondary_compat->role);
168
+ assert(del_cb == without_secondary_compat->del_cb);
169
+ assert(!without_secondary_compat->IsSecondaryCacheCompatible());
159
170
  }
160
171
  inline bool IsSecondaryCacheCompatible() const {
161
172
  return size_cb != nullptr;
@@ -238,6 +249,19 @@ class Cache {
238
249
  Handle** handle = nullptr,
239
250
  Priority priority = Priority::LOW) = 0;
240
251
 
252
+ // Similar to Insert, but used for creating cache entries that cannot
253
+ // be found with Lookup, such as for memory charging purposes. The
254
+ // key is needed for cache sharding purposes.
255
+ // * If allow_uncharged==true or strict_capacity_limit=false, the operation
256
+ // always succeeds and returns a valid Handle.
257
+ // * If strict_capacity_limit=true and the requested charge cannot be freed
258
+ // up in the cache, then
259
+ // * If allow_uncharged==true, it's created anyway (GetCharge() == 0).
260
+ // * If allow_uncharged==false, returns nullptr to indicate failure.
261
+ virtual Handle* CreateStandalone(const Slice& key, ObjectPtr obj,
262
+ const CacheItemHelper* helper, size_t charge,
263
+ bool allow_uncharged) = 0;
264
+
241
265
  // Lookup the key, returning nullptr if not found. If found, returns
242
266
  // a handle to the mapping that must eventually be passed to Release().
243
267
  //
@@ -248,41 +272,15 @@ class Cache {
248
272
  // used to promote the entry to an object in the primary cache.
249
273
  // In that case, the helper may be saved and used later when the object
250
274
  // is evicted, so as usual, the pointed-to helper must outlive the cache.
251
- //
252
- // ======================== Async Lookup (wait=false) ======================
253
- // When wait=false, the handle returned might be in any of three states:
254
- // * Present - If Value() != nullptr, then the result is present and
255
- // the handle can be used just as if wait=true.
256
- // * Pending, not ready (IsReady() == false) - secondary cache is still
257
- // working to retrieve the value. Might become ready any time.
258
- // * Pending, ready (IsReady() == true) - secondary cache has the value
259
- // but it has not been loaded as an object into primary cache. Call to
260
- // Wait()/WaitAll() will not block.
261
- //
262
- // IMPORTANT: Pending handles are not thread-safe, and only these functions
263
- // are allowed on them: Value(), IsReady(), Wait(), WaitAll(). Even Release()
264
- // can only come after Wait() or WaitAll() even though a reference is held.
265
- //
266
- // Only Wait()/WaitAll() gets a Handle out of a Pending state. (Waiting is
267
- // safe and has no effect on other handle states.) After waiting on a Handle,
268
- // it is in one of two states:
269
- // * Present - if Value() != nullptr
270
- // * Failed - if Value() == nullptr, such as if the secondary cache
271
- // initially thought it had the value but actually did not.
272
- //
273
- // Note that given an arbitrary Handle, the only way to distinguish the
274
- // Pending+ready state from the Failed state is to Wait() on it. A cache
275
- // entry not compatible with secondary cache can also have Value()==nullptr
276
- // like the Failed state, but this is not generally a concern.
277
275
  virtual Handle* Lookup(const Slice& key,
278
276
  const CacheItemHelper* helper = nullptr,
279
277
  CreateContext* create_context = nullptr,
280
- Priority priority = Priority::LOW, bool wait = true,
278
+ Priority priority = Priority::LOW,
281
279
  Statistics* stats = nullptr) = 0;
282
280
 
283
281
  // Convenience wrapper when secondary cache not supported
284
282
  inline Handle* BasicLookup(const Slice& key, Statistics* stats) {
285
- return Lookup(key, nullptr, nullptr, Priority::LOW, true, stats);
283
+ return Lookup(key, nullptr, nullptr, Priority::LOW, stats);
286
284
  }
287
285
 
288
286
  // Increments the reference count for the handle if it refers to an entry in
@@ -419,28 +417,109 @@ class Cache {
419
417
  return Release(handle, erase_if_last_ref);
420
418
  }
421
419
 
422
- // Determines if the handle returned by Lookup() can give a value without
423
- // blocking, though Wait()/WaitAll() might be required to publish it to
424
- // Value(). See secondary cache compatible Lookup() above for details.
425
- // This call is not thread safe on "pending" handles.
426
- virtual bool IsReady(Handle* /*handle*/) { return true; }
427
-
428
- // Convert a "pending" handle into a full thread-shareable handle by
429
- // * If necessary, wait until secondary cache finishes loading the value.
430
- // * Construct the object for primary cache and set it in the handle.
431
- // Even after Wait() on a pending handle, the caller must check for
432
- // Value() == nullptr in case of failure. This call is not thread-safe
433
- // on pending handles. This call has no effect on non-pending handles.
434
- // See secondary cache compatible Lookup() above for details.
435
- virtual void Wait(Handle* /*handle*/) {}
436
-
437
- // Wait for a vector of handles to become ready. As with Wait(), the user
438
- // should check the Value() of each handle for nullptr. This call is not
439
- // thread-safe on pending handles.
440
- virtual void WaitAll(std::vector<Handle*>& /*handles*/) {}
441
-
442
- private:
420
+ // A temporary handle structure for managing async lookups, which callers
421
+ // of AsyncLookup() can allocate on the call stack for efficiency.
422
+ // An AsyncLookupHandle should not be used concurrently across threads.
423
+ struct AsyncLookupHandle {
424
+ // Inputs, populated by caller:
425
+ // NOTE: at least in case of stacked secondary caches, the underlying
426
+ // key buffer must last until handle is completely waited on.
427
+ Slice key;
428
+ const CacheItemHelper* helper = nullptr;
429
+ CreateContext* create_context = nullptr;
430
+ Priority priority = Priority::LOW;
431
+ Statistics* stats = nullptr;
432
+
433
+ AsyncLookupHandle() {}
434
+ AsyncLookupHandle(const Slice& _key, const CacheItemHelper* _helper,
435
+ CreateContext* _create_context,
436
+ Priority _priority = Priority::LOW,
437
+ Statistics* _stats = nullptr)
438
+ : key(_key),
439
+ helper(_helper),
440
+ create_context(_create_context),
441
+ priority(_priority),
442
+ stats(_stats) {}
443
+
444
+ // AsyncLookupHandle should only be destroyed when no longer pending
445
+ ~AsyncLookupHandle() { assert(!IsPending()); }
446
+
447
+ // No copies or moves (StartAsyncLookup may save a pointer to this)
448
+ AsyncLookupHandle(const AsyncLookupHandle&) = delete;
449
+ AsyncLookupHandle operator=(const AsyncLookupHandle&) = delete;
450
+ AsyncLookupHandle(AsyncLookupHandle&&) = delete;
451
+ AsyncLookupHandle operator=(AsyncLookupHandle&&) = delete;
452
+
453
+ // Determines if the result of StartAsyncLookup() can be obtained without
454
+ // blocking, though Wait()/WaitAll() might be required to publish it to
455
+ // Result(). See StartAsyncLookup() below for details.
456
+ // This call is not thread safe on "pending" handles.
457
+ // WART/TODO with stacked secondaries: might indicate ready when one
458
+ // result is ready (a miss) but the next lookup will block.
459
+ bool IsReady();
460
+
461
+ // Returns true if Wait/WaitAll is required before calling Result().
462
+ bool IsPending();
463
+
464
+ // Returns a Lookup()-like result if this AsyncLookupHandle is not pending.
465
+ // (Undefined behavior on a pending AsyncLookupHandle.) Like Lookup(), the
466
+ // caller is responsible for eventually Release()ing a non-nullptr
467
+ // Handle* result.
468
+ Handle* Result();
469
+
470
+ // Implementation details, for RocksDB internal use only
471
+ Handle* result_handle = nullptr;
472
+ SecondaryCacheResultHandle* pending_handle = nullptr;
473
+ SecondaryCache* pending_cache = nullptr;
474
+ bool found_dummy_entry = false;
475
+ bool kept_in_sec_cache = false;
476
+ };
477
+
478
+ // Starts a potentially asynchronous Lookup(), based on the populated
479
+ // "input" fields of the async_handle. The caller is responsible for
480
+ // keeping the AsyncLookupHandle and the key it references alive through
481
+ // WaitAll(), and the AsyncLookupHandle alive through
482
+ // AsyncLookupHandle::Result(). WaitAll() can only be skipped if
483
+ // AsyncLookupHandle::IsPending() is already false after StartAsyncLookup.
484
+ // Calling AsyncLookupHandle::Result() is essentially required so that
485
+ // Release() can be called on non-nullptr Handle result. Wait() is a
486
+ // concise version of WaitAll()+Result() on a single handle. After an
487
+ // AsyncLookupHandle has completed this cycle, its input fields can be
488
+ // updated and re-used for another StartAsyncLookup.
489
+ //
490
+ // Handle is thread-safe while AsyncLookupHandle is not thread-safe.
491
+ //
492
+ // Default implementation is appropriate for Caches without
493
+ // true asynchronous support: defers to synchronous Lookup().
494
+ // (AsyncLookupHandles will only get into the "pending" state with
495
+ // SecondaryCache configured.)
496
+ virtual void StartAsyncLookup(AsyncLookupHandle& async_handle);
497
+
498
+ // A convenient wrapper around WaitAll() and AsyncLookupHandle::Result()
499
+ // for a single async handle. See StartAsyncLookup().
500
+ Handle* Wait(AsyncLookupHandle& async_handle);
501
+
502
+ // Wait for an array of async handles to get results, so that none are left
503
+ // in the "pending" state. Not thread safe. See StartAsyncLookup().
504
+ // Default implementation is appropriate for Caches without true
505
+ // asynchronous support: asserts that all handles are not pending (or not
506
+ // expected to be handled by this cache, in case of wrapped/stacked
507
+ // WaitAlls()).
508
+ virtual void WaitAll(AsyncLookupHandle* /*async_handles*/, size_t /*count*/);
509
+
510
+ // For a function called on cache entries about to be evicted. The function
511
+ // returns `true` if it has taken ownership of the Value (object), or
512
+ // `false` if the cache should destroy it as usual. Regardless, Ref() and
513
+ // Release() cannot be called on this Handle that is poised for eviction.
514
+ using EvictionCallback = std::function<bool(const Slice& key, Handle* h)>;
515
+ // Sets an eviction callback for this Cache. Not thread safe and only
516
+ // supports being set once, so should only be used during initialization
517
+ // or destruction, guaranteed before or after any thread-shared operations.
518
+ void SetEvictionCallback(EvictionCallback&& fn);
519
+
520
+ protected:
443
521
  std::shared_ptr<MemoryAllocator> memory_allocator_;
522
+ EvictionCallback eviction_callback_;
444
523
  };
445
524
 
446
525
  // A wrapper around Cache that can easily be extended with instrumentation,
@@ -460,11 +539,17 @@ class CacheWrapper : public Cache {
460
539
  return target_->Insert(key, value, helper, charge, handle, priority);
461
540
  }
462
541
 
542
+ Handle* CreateStandalone(const Slice& key, ObjectPtr obj,
543
+ const CacheItemHelper* helper, size_t charge,
544
+ bool allow_uncharged) override {
545
+ return target_->CreateStandalone(key, obj, helper, charge, allow_uncharged);
546
+ }
547
+
463
548
  Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
464
549
  CreateContext* create_context,
465
- Priority priority = Priority::LOW, bool wait = true,
550
+ Priority priority = Priority::LOW,
466
551
  Statistics* stats = nullptr) override {
467
- return target_->Lookup(key, helper, create_context, priority, wait, stats);
552
+ return target_->Lookup(key, helper, create_context, priority, stats);
468
553
  }
469
554
 
470
555
  bool Ref(Handle* handle) override { return target_->Ref(handle); }
@@ -516,12 +601,20 @@ class CacheWrapper : public Cache {
516
601
 
517
602
  void EraseUnRefEntries() override { target_->EraseUnRefEntries(); }
518
603
 
604
+ void StartAsyncLookup(AsyncLookupHandle& async_handle) override {
605
+ target_->StartAsyncLookup(async_handle);
606
+ }
607
+
608
+ void WaitAll(AsyncLookupHandle* async_handles, size_t count) override {
609
+ target_->WaitAll(async_handles, count);
610
+ }
611
+
519
612
  protected:
520
613
  std::shared_ptr<Cache> target_;
521
614
  };
522
615
 
523
616
  // Useful for cache entries requiring no clean-up, such as for cache
524
617
  // reservations
525
- inline constexpr Cache::CacheItemHelper kNoopCacheItemHelper{};
618
+ extern const Cache::CacheItemHelper kNoopCacheItemHelper;
526
619
 
527
620
  } // namespace ROCKSDB_NAMESPACE
@@ -863,7 +863,9 @@ struct AdvancedColumnFamilyOptions {
863
863
  // age is based on the file's last modified time (given by the underlying
864
864
  // Env).
865
865
  //
866
- // Supported in Level and FIFO compaction.
866
+ // Supported in all compaction styles.
867
+ // In Universal compaction, rocksdb will try to do a full compaction when
868
+ // possible, see more in UniversalCompactionBuilder::PickPeriodicCompaction().
867
869
  // In FIFO compaction, this option has the same meaning as TTL and whichever
868
870
  // stricter will be used.
869
871
  // Pre-req: max_open_files == -1.