@nxtedition/rocksdb 7.0.0 → 7.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84):
  1. package/binding.cc +38 -40
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +1 -1
  3. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +3 -1
  4. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +28 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_test.cc +5 -2
  8. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +48 -60
  9. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +18 -20
  10. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
  11. package/deps/rocksdb/rocksdb/db/c.cc +5 -0
  12. package/deps/rocksdb/rocksdb/db/column_family.cc +20 -0
  13. package/deps/rocksdb/rocksdb/db/column_family.h +9 -0
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +44 -26
  15. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +32 -14
  16. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +73 -44
  17. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +3 -1
  18. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +6 -1
  19. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +10 -5
  20. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +47 -35
  21. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +2 -1
  22. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +54 -32
  23. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +426 -61
  24. package/deps/rocksdb/rocksdb/db/db_options_test.cc +1 -0
  25. package/deps/rocksdb/rocksdb/db/db_test.cc +102 -24
  26. package/deps/rocksdb/rocksdb/db/db_test2.cc +159 -30
  27. package/deps/rocksdb/rocksdb/db/db_test_util.cc +1 -0
  28. package/deps/rocksdb/rocksdb/db/dbformat.h +1 -1
  29. package/deps/rocksdb/rocksdb/db/version_builder.cc +39 -10
  30. package/deps/rocksdb/rocksdb/db/version_builder.h +4 -1
  31. package/deps/rocksdb/rocksdb/db/version_edit.h +20 -0
  32. package/deps/rocksdb/rocksdb/db/version_set.cc +2 -1
  33. package/deps/rocksdb/rocksdb/db/version_set.h +17 -2
  34. package/deps/rocksdb/rocksdb/db/version_set_test.cc +119 -0
  35. package/deps/rocksdb/rocksdb/db/write_batch.cc +96 -0
  36. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -0
  37. package/deps/rocksdb/rocksdb/db/write_thread.cc +1 -0
  38. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -0
  39. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +9 -0
  40. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +18 -2
  41. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -0
  42. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +12 -0
  43. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +1 -1
  44. package/deps/rocksdb/rocksdb/env/fs_posix.cc +96 -6
  45. package/deps/rocksdb/rocksdb/env/io_posix.cc +51 -18
  46. package/deps/rocksdb/rocksdb/env/io_posix.h +2 -0
  47. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +12 -5
  48. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +22 -6
  49. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +99 -8
  50. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +9 -1
  51. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +3 -0
  52. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -0
  53. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +4 -0
  54. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +1 -1
  55. package/deps/rocksdb/rocksdb/include/rocksdb/io_status.h +7 -0
  56. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +11 -1
  57. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +4 -1
  58. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +14 -1
  59. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +6 -0
  60. package/deps/rocksdb/rocksdb/options/cf_options.cc +12 -1
  61. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  62. package/deps/rocksdb/rocksdb/options/options.cc +8 -1
  63. package/deps/rocksdb/rocksdb/options/options_helper.cc +1 -0
  64. package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -1
  65. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +7 -2
  66. package/deps/rocksdb/rocksdb/options/options_test.cc +52 -0
  67. package/deps/rocksdb/rocksdb/port/port_posix.h +10 -1
  68. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +1 -1
  69. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +1 -1
  70. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
  71. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +5 -5
  72. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +16 -10
  73. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +1 -1
  74. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +1 -1
  75. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
  76. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  77. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +39 -12
  78. package/deps/rocksdb/rocksdb/util/comparator.cc +10 -0
  79. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +1 -1
  80. package/deps/rocksdb/rocksdb/util/xxhash.h +2 -1
  81. package/index.js +2 -2
  82. package/package.json +1 -1
  83. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  84. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -321,7 +321,7 @@ TEST_F(DBTest, MixedSlowdownOptions) {
321
321
  // We need the 2nd write to trigger delay. This is because delay is
322
322
  // estimated based on the last write size which is 0 for the first write.
323
323
  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
324
- token.reset();
324
+ token.reset();
325
325
 
326
326
  for (auto& t : threads) {
327
327
  t.join();
@@ -379,7 +379,7 @@ TEST_F(DBTest, MixedSlowdownOptionsInQueue) {
379
379
  // We need the 2nd write to trigger delay. This is because delay is
380
380
  // estimated based on the last write size which is 0 for the first write.
381
381
  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
382
- token.reset();
382
+ token.reset();
383
383
 
384
384
  for (auto& t : threads) {
385
385
  t.join();
@@ -448,7 +448,7 @@ TEST_F(DBTest, MixedSlowdownOptionsStop) {
448
448
  // We need the 2nd write to trigger delay. This is because delay is
449
449
  // estimated based on the last write size which is 0 for the first write.
450
450
  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
451
- token.reset();
451
+ token.reset();
452
452
 
453
453
  for (auto& t : threads) {
454
454
  t.join();
@@ -483,7 +483,6 @@ TEST_F(DBTest, LevelLimitReopen) {
483
483
  }
484
484
  #endif // ROCKSDB_LITE
485
485
 
486
-
487
486
  TEST_F(DBTest, PutSingleDeleteGet) {
488
487
  do {
489
488
  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
@@ -782,7 +781,6 @@ TEST_F(DBTest, GetFromImmutableLayer) {
782
781
  } while (ChangeOptions());
783
782
  }
784
783
 
785
-
786
784
  TEST_F(DBTest, GetLevel0Ordering) {
787
785
  do {
788
786
  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
@@ -3807,7 +3805,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLTest) {
3807
3805
 
3808
3806
  options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
3809
3807
  options.compaction_options_fifo.allow_compaction = false;
3810
- options.ttl = 1 * 60 * 60 ; // 1 hour
3808
+ options.ttl = 1 * 60 * 60; // 1 hour
3811
3809
  options = CurrentOptions(options);
3812
3810
  DestroyAndReopen(options);
3813
3811
 
@@ -3881,7 +3879,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLTest) {
3881
3879
  options.write_buffer_size = 10 << 10; // 10KB
3882
3880
  options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
3883
3881
  options.compaction_options_fifo.allow_compaction = false;
3884
- options.ttl = 1 * 60 * 60; // 1 hour
3882
+ options.ttl = 1 * 60 * 60; // 1 hour
3885
3883
  options = CurrentOptions(options);
3886
3884
  DestroyAndReopen(options);
3887
3885
 
@@ -6070,7 +6068,6 @@ TEST_F(DBTest, DISABLED_SuggestCompactRangeTest) {
6070
6068
  ASSERT_EQ(1, NumTableFilesAtLevel(1));
6071
6069
  }
6072
6070
 
6073
-
6074
6071
  TEST_F(DBTest, PromoteL0) {
6075
6072
  Options options = CurrentOptions();
6076
6073
  options.disable_auto_compactions = true;
@@ -6251,13 +6248,12 @@ TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
6251
6248
  SyncPoint::GetInstance()->EnableProcessing();
6252
6249
 
6253
6250
  port::Thread manual_compaction_thread([&]() {
6254
- auto s = db_->CompactFiles(CompactionOptions(),
6255
- db_->DefaultColumnFamily(), input_files, 0);
6256
- ASSERT_OK(s);
6251
+ auto s = db_->CompactFiles(CompactionOptions(), db_->DefaultColumnFamily(),
6252
+ input_files, 0);
6253
+ ASSERT_OK(s);
6257
6254
  });
6258
6255
 
6259
- TEST_SYNC_POINT(
6260
- "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
6256
+ TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
6261
6257
  // generate enough files to trigger compaction
6262
6258
  for (int i = 0; i < 20; ++i) {
6263
6259
  for (int j = 0; j < 2; ++j) {
@@ -6267,16 +6263,15 @@ TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
6267
6263
  }
6268
6264
  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
6269
6265
  ASSERT_GT(cf_meta_data.levels[0].files.size(),
6270
- options.level0_file_num_compaction_trigger);
6271
- TEST_SYNC_POINT(
6272
- "DBTest::CompactFilesShouldTriggerAutoCompaction:End");
6266
+ options.level0_file_num_compaction_trigger);
6267
+ TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:End");
6273
6268
 
6274
6269
  manual_compaction_thread.join();
6275
6270
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
6276
6271
 
6277
6272
  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
6278
6273
  ASSERT_LE(cf_meta_data.levels[0].files.size(),
6279
- options.level0_file_num_compaction_trigger);
6274
+ options.level0_file_num_compaction_trigger);
6280
6275
  }
6281
6276
  #endif // ROCKSDB_LITE
6282
6277
 
@@ -6501,8 +6496,9 @@ class WriteStallListener : public EventListener {
6501
6496
  MutexLock l(&mutex_);
6502
6497
  return expected == condition_;
6503
6498
  }
6499
+
6504
6500
  private:
6505
- port::Mutex mutex_;
6501
+ port::Mutex mutex_;
6506
6502
  WriteStallCondition condition_;
6507
6503
  };
6508
6504
 
@@ -6730,7 +6726,8 @@ TEST_F(DBTest, LastWriteBufferDelay) {
6730
6726
  sleeping_task.WakeUp();
6731
6727
  sleeping_task.WaitUntilDone();
6732
6728
  }
6733
- #endif // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
6729
+ #endif // !defined(ROCKSDB_LITE) &&
6730
+ // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
6734
6731
 
6735
6732
  TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
6736
6733
  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
@@ -6815,6 +6812,89 @@ TEST_F(DBTest, PinnableSliceAndRowCache) {
6815
6812
  1);
6816
6813
  }
6817
6814
 
6815
+ TEST_F(DBTest, ReusePinnableSlice) {
6816
+ Options options = CurrentOptions();
6817
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
6818
+ options.row_cache = NewLRUCache(8192);
6819
+ DestroyAndReopen(options);
6820
+
6821
+ ASSERT_OK(Put("foo", "bar"));
6822
+ ASSERT_OK(Flush());
6823
+
6824
+ ASSERT_EQ(Get("foo"), "bar");
6825
+ ASSERT_EQ(
6826
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6827
+ 1);
6828
+
6829
+ {
6830
+ PinnableSlice pin_slice;
6831
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
6832
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
6833
+ ASSERT_EQ(pin_slice.ToString(), "bar");
6834
+
6835
+ // Entry is already in cache, lookup will remove the element from lru
6836
+ ASSERT_EQ(
6837
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6838
+ 0);
6839
+ }
6840
+ // After PinnableSlice destruction element is added back in LRU
6841
+ ASSERT_EQ(
6842
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6843
+ 1);
6844
+
6845
+ {
6846
+ std::vector<Slice> multiget_keys;
6847
+ multiget_keys.push_back("foo");
6848
+ std::vector<PinnableSlice> multiget_values(1);
6849
+ std::vector<Status> statuses({Status::NotFound()});
6850
+ ReadOptions ropt;
6851
+ dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
6852
+ multiget_keys.size(), multiget_keys.data(),
6853
+ multiget_values.data(), statuses.data());
6854
+ ASSERT_EQ(Status::OK(), statuses[0]);
6855
+ dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
6856
+ multiget_keys.size(), multiget_keys.data(),
6857
+ multiget_values.data(), statuses.data());
6858
+ ASSERT_EQ(Status::OK(), statuses[0]);
6859
+
6860
+ // Entry is already in cache, lookup will remove the element from lru
6861
+ ASSERT_EQ(
6862
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6863
+ 0);
6864
+ }
6865
+ // After PinnableSlice destruction element is added back in LRU
6866
+ ASSERT_EQ(
6867
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6868
+ 1);
6869
+
6870
+ {
6871
+ std::vector<ColumnFamilyHandle*> multiget_cfs;
6872
+ multiget_cfs.push_back(dbfull()->DefaultColumnFamily());
6873
+ std::vector<Slice> multiget_keys;
6874
+ multiget_keys.push_back("foo");
6875
+ std::vector<PinnableSlice> multiget_values(1);
6876
+ std::vector<Status> statuses({Status::NotFound()});
6877
+ ReadOptions ropt;
6878
+ dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
6879
+ multiget_keys.data(), multiget_values.data(),
6880
+ statuses.data());
6881
+ ASSERT_EQ(Status::OK(), statuses[0]);
6882
+ dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
6883
+ multiget_keys.data(), multiget_values.data(),
6884
+ statuses.data());
6885
+ ASSERT_EQ(Status::OK(), statuses[0]);
6886
+
6887
+ // Entry is already in cache, lookup will remove the element from lru
6888
+ ASSERT_EQ(
6889
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6890
+ 0);
6891
+ }
6892
+ // After PinnableSlice destruction element is added back in LRU
6893
+ ASSERT_EQ(
6894
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6895
+ 1);
6896
+ }
6897
+
6818
6898
  #endif // ROCKSDB_LITE
6819
6899
 
6820
6900
  TEST_F(DBTest, DeletingOldWalAfterDrop) {
@@ -6894,9 +6974,7 @@ TEST_F(DBTest, PauseBackgroundWorkTest) {
6894
6974
  TEST_F(DBTest, ThreadLocalPtrDeadlock) {
6895
6975
  std::atomic<int> flushes_done{0};
6896
6976
  std::atomic<int> threads_destroyed{0};
6897
- auto done = [&] {
6898
- return flushes_done.load() > 10;
6899
- };
6977
+ auto done = [&] { return flushes_done.load() > 10; };
6900
6978
 
6901
6979
  port::Thread flushing_thread([&] {
6902
6980
  for (int i = 0; !done(); ++i) {
@@ -6909,7 +6987,7 @@ TEST_F(DBTest, ThreadLocalPtrDeadlock) {
6909
6987
  });
6910
6988
 
6911
6989
  std::vector<port::Thread> thread_spawning_threads(10);
6912
- for (auto& t: thread_spawning_threads) {
6990
+ for (auto& t : thread_spawning_threads) {
6913
6991
  t = port::Thread([&] {
6914
6992
  while (!done()) {
6915
6993
  {
@@ -6925,7 +7003,7 @@ TEST_F(DBTest, ThreadLocalPtrDeadlock) {
6925
7003
  });
6926
7004
  }
6927
7005
 
6928
- for (auto& t: thread_spawning_threads) {
7006
+ for (auto& t : thread_spawning_threads) {
6929
7007
  t.join();
6930
7008
  }
6931
7009
  flushing_thread.join();
@@ -6376,86 +6376,85 @@ TEST_F(DBTest2, AutoPrefixMode1) {
6376
6376
  ReadOptions ro;
6377
6377
  ro.total_order_seek = false;
6378
6378
  ro.auto_prefix_mode = true;
6379
+
6380
+ const auto stat = BLOOM_FILTER_PREFIX_CHECKED;
6379
6381
  {
6380
6382
  std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6381
6383
  iterator->Seek("b1");
6382
6384
  ASSERT_TRUE(iterator->Valid());
6383
6385
  ASSERT_EQ("x1", iterator->key().ToString());
6384
- ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6386
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6385
6387
  ASSERT_OK(iterator->status());
6386
6388
  }
6387
6389
 
6388
- std::string ub_str = "b9";
6389
- Slice ub(ub_str);
6390
+ Slice ub;
6390
6391
  ro.iterate_upper_bound = &ub;
6391
6392
 
6393
+ ub = "b9";
6392
6394
  {
6393
6395
  std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6394
6396
  iterator->Seek("b1");
6395
6397
  ASSERT_FALSE(iterator->Valid());
6396
- ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6398
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
6397
6399
  ASSERT_OK(iterator->status());
6398
6400
  }
6399
6401
 
6400
- ub_str = "z";
6401
- ub = Slice(ub_str);
6402
+ ub = "z";
6402
6403
  {
6403
6404
  std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6404
6405
  iterator->Seek("b1");
6405
6406
  ASSERT_TRUE(iterator->Valid());
6406
6407
  ASSERT_EQ("x1", iterator->key().ToString());
6407
- ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6408
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6408
6409
  ASSERT_OK(iterator->status());
6409
6410
  }
6410
6411
 
6411
- ub_str = "c";
6412
- ub = Slice(ub_str);
6412
+ ub = "c";
6413
6413
  {
6414
6414
  std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6415
6415
  iterator->Seek("b1");
6416
6416
  ASSERT_FALSE(iterator->Valid());
6417
- ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6417
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
6418
6418
  ASSERT_OK(iterator->status());
6419
6419
  }
6420
6420
 
6421
- // The same queries without recreating iterator
6421
+ ub = "c1";
6422
6422
  {
6423
- ub_str = "b9";
6424
- ub = Slice(ub_str);
6425
- ro.iterate_upper_bound = &ub;
6426
-
6427
6423
  std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6428
6424
  iterator->Seek("b1");
6429
6425
  ASSERT_FALSE(iterator->Valid());
6430
- ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6426
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6431
6427
  ASSERT_OK(iterator->status());
6428
+ }
6432
6429
 
6433
- ub_str = "z";
6434
- ub = Slice(ub_str);
6430
+ // The same queries without recreating iterator
6431
+ {
6432
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6433
+
6434
+ ub = "b9";
6435
+ iterator->Seek("b1");
6436
+ ASSERT_FALSE(iterator->Valid());
6437
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
6438
+ ASSERT_OK(iterator->status());
6435
6439
 
6440
+ ub = "z";
6436
6441
  iterator->Seek("b1");
6437
6442
  ASSERT_TRUE(iterator->Valid());
6438
6443
  ASSERT_EQ("x1", iterator->key().ToString());
6439
- ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6440
-
6441
- ub_str = "c";
6442
- ub = Slice(ub_str);
6444
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6443
6445
 
6446
+ ub = "c";
6444
6447
  iterator->Seek("b1");
6445
6448
  ASSERT_FALSE(iterator->Valid());
6446
- ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6449
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
6447
6450
 
6448
- ub_str = "b9";
6449
- ub = Slice(ub_str);
6450
- ro.iterate_upper_bound = &ub;
6451
+ ub = "b9";
6451
6452
  iterator->SeekForPrev("b1");
6452
6453
  ASSERT_TRUE(iterator->Valid());
6453
6454
  ASSERT_EQ("a1", iterator->key().ToString());
6454
- ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6455
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6455
6456
 
6456
- ub_str = "zz";
6457
- ub = Slice(ub_str);
6458
- ro.iterate_upper_bound = &ub;
6457
+ ub = "zz";
6459
6458
  iterator->SeekToLast();
6460
6459
  ASSERT_TRUE(iterator->Valid());
6461
6460
  ASSERT_EQ("y1", iterator->key().ToString());
@@ -6464,6 +6463,136 @@ TEST_F(DBTest2, AutoPrefixMode1) {
6464
6463
  ASSERT_TRUE(iterator->Valid());
6465
6464
  ASSERT_EQ("a1", iterator->key().ToString());
6466
6465
  }
6466
+
6467
+ // Similar, now with reverse comparator
6468
+ // Technically, we are violating axiom 2 of prefix_extractors, but
6469
+ // it should be revised because of major use-cases using
6470
+ // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME)
6471
+ options.comparator = ReverseBytewiseComparator();
6472
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
6473
+
6474
+ DestroyAndReopen(options);
6475
+
6476
+ ASSERT_OK(Put("a1", large_value));
6477
+ ASSERT_OK(Put("x1", large_value));
6478
+ ASSERT_OK(Put("y1", large_value));
6479
+ ASSERT_OK(Flush());
6480
+
6481
+ {
6482
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6483
+
6484
+ ub = "b1";
6485
+ iterator->Seek("b9");
6486
+ ASSERT_FALSE(iterator->Valid());
6487
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
6488
+ ASSERT_OK(iterator->status());
6489
+
6490
+ ub = "b1";
6491
+ iterator->Seek("z");
6492
+ ASSERT_TRUE(iterator->Valid());
6493
+ ASSERT_EQ("y1", iterator->key().ToString());
6494
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6495
+
6496
+ ub = "b1";
6497
+ iterator->Seek("c");
6498
+ ASSERT_FALSE(iterator->Valid());
6499
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6500
+
6501
+ ub = "b";
6502
+ iterator->Seek("c9");
6503
+ ASSERT_FALSE(iterator->Valid());
6504
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
6505
+ // is "correctly" implemented.
6506
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6507
+
6508
+ ub = "a";
6509
+ iterator->Seek("b9");
6510
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
6511
+ // is "correctly" implemented.
6512
+ ASSERT_TRUE(iterator->Valid());
6513
+ ASSERT_EQ("a1", iterator->key().ToString());
6514
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6515
+
6516
+ ub = "b";
6517
+ iterator->Seek("a");
6518
+ ASSERT_FALSE(iterator->Valid());
6519
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
6520
+ // matches BytewiseComparator::IsSameLengthImmediateSuccessor. Upper
6521
+ // comparing before seek key prevents a real bug from surfacing.
6522
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6523
+
6524
+ ub = "b1";
6525
+ iterator->SeekForPrev("b9");
6526
+ ASSERT_TRUE(iterator->Valid());
6527
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
6528
+ // is "correctly" implemented.
6529
+ ASSERT_EQ("x1", iterator->key().ToString());
6530
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6531
+
6532
+ ub = "a";
6533
+ iterator->SeekToLast();
6534
+ ASSERT_TRUE(iterator->Valid());
6535
+ ASSERT_EQ("a1", iterator->key().ToString());
6536
+
6537
+ iterator->SeekToFirst();
6538
+ ASSERT_TRUE(iterator->Valid());
6539
+ ASSERT_EQ("y1", iterator->key().ToString());
6540
+ }
6541
+
6542
+ // Now something a bit different, related to "short" keys that
6543
+ // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode.
6544
+ options.comparator = BytewiseComparator();
6545
+ for (const auto config : {"fixed:2", "capped:2"}) {
6546
+ ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config,
6547
+ &options.prefix_extractor));
6548
+
6549
+ // FIXME: kHashSearch, etc. requires all keys be InDomain
6550
+ if (StartsWith(config, "fixed") &&
6551
+ (table_options.index_type == BlockBasedTableOptions::kHashSearch ||
6552
+ StartsWith(options.memtable_factory->Name(), "Hash"))) {
6553
+ continue;
6554
+ }
6555
+ DestroyAndReopen(options);
6556
+
6557
+ const char* a_end_stuff = "a\xffXYZ";
6558
+ const char* b_begin_stuff = "b\x00XYZ";
6559
+ ASSERT_OK(Put("a", large_value));
6560
+ ASSERT_OK(Put("b", large_value));
6561
+ ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value));
6562
+ ASSERT_OK(Put("c", large_value));
6563
+ ASSERT_OK(Flush());
6564
+
6565
+ // control showing valid optimization with auto_prefix mode
6566
+ ub = Slice(a_end_stuff, 4);
6567
+ ro.iterate_upper_bound = &ub;
6568
+
6569
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6570
+ iterator->Seek(Slice(a_end_stuff, 2));
6571
+ ASSERT_FALSE(iterator->Valid());
6572
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
6573
+ ASSERT_OK(iterator->status());
6574
+
6575
+ // test, cannot be validly optimized with auto_prefix_mode
6576
+ ub = Slice(b_begin_stuff, 2);
6577
+ ro.iterate_upper_bound = &ub;
6578
+
6579
+ iterator->Seek(Slice(a_end_stuff, 2));
6580
+ // !!! BUG !!! See "BUG" section of auto_prefix_mode.
6581
+ ASSERT_FALSE(iterator->Valid());
6582
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
6583
+ ASSERT_OK(iterator->status());
6584
+
6585
+ // To prove that is the wrong result, now use total order seek
6586
+ ReadOptions tos_ro = ro;
6587
+ tos_ro.total_order_seek = true;
6588
+ tos_ro.auto_prefix_mode = false;
6589
+ iterator.reset(db_->NewIterator(tos_ro));
6590
+ iterator->Seek(Slice(a_end_stuff, 2));
6591
+ ASSERT_TRUE(iterator->Valid());
6592
+ ASSERT_EQ("b", iterator->key().ToString());
6593
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
6594
+ ASSERT_OK(iterator->status());
6595
+ }
6467
6596
  } while (ChangeOptions(kSkipPlainTable));
6468
6597
  }
6469
6598
 
@@ -1744,5 +1744,6 @@ template class TargetCacheChargeTrackingCache<
1744
1744
  CacheEntryRole::kFilterConstruction>;
1745
1745
  template class TargetCacheChargeTrackingCache<
1746
1746
  CacheEntryRole::kBlockBasedTableReader>;
1747
+ template class TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>;
1747
1748
 
1748
1749
  } // namespace ROCKSDB_NAMESPACE
@@ -320,7 +320,7 @@ class InternalKey {
320
320
  }
321
321
 
322
322
  Slice user_key() const { return ExtractUserKey(rep_); }
323
- size_t size() { return rep_.size(); }
323
+ size_t size() const { return rep_.size(); }
324
324
 
325
325
  void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
326
326
  SetFrom(ParsedInternalKey(_user_key, s, t));
@@ -23,6 +23,7 @@
23
23
  #include <utility>
24
24
  #include <vector>
25
25
 
26
+ #include "cache/cache_reservation_manager.h"
26
27
  #include "db/blob/blob_file_meta.h"
27
28
  #include "db/dbformat.h"
28
29
  #include "db/internal_stats.h"
@@ -255,10 +256,13 @@ class VersionBuilder::Rep {
255
256
  // version edits.
256
257
  std::map<uint64_t, MutableBlobFileMetaData> mutable_blob_file_metas_;
257
258
 
259
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
260
+
258
261
  public:
259
262
  Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions,
260
263
  TableCache* table_cache, VersionStorageInfo* base_vstorage,
261
- VersionSet* version_set)
264
+ VersionSet* version_set,
265
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
262
266
  : file_options_(file_options),
263
267
  ioptions_(ioptions),
264
268
  table_cache_(table_cache),
@@ -266,7 +270,8 @@ class VersionBuilder::Rep {
266
270
  version_set_(version_set),
267
271
  num_levels_(base_vstorage->num_levels()),
268
272
  has_invalid_levels_(false),
269
- level_nonzero_cmp_(base_vstorage_->InternalComparator()) {
273
+ level_nonzero_cmp_(base_vstorage_->InternalComparator()),
274
+ file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) {
270
275
  assert(ioptions_);
271
276
 
272
277
  levels_ = new LevelState[num_levels_];
@@ -291,6 +296,12 @@ class VersionBuilder::Rep {
291
296
  table_cache_->ReleaseHandle(f->table_reader_handle);
292
297
  f->table_reader_handle = nullptr;
293
298
  }
299
+
300
+ if (file_metadata_cache_res_mgr_) {
301
+ Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
302
+ f->ApproximateMemoryUsage(), false /* increase */);
303
+ s.PermitUncheckedError();
304
+ }
294
305
  delete f;
295
306
  }
296
307
  }
@@ -763,6 +774,22 @@ class VersionBuilder::Rep {
763
774
  FileMetaData* const f = new FileMetaData(meta);
764
775
  f->refs = 1;
765
776
 
777
+ if (file_metadata_cache_res_mgr_) {
778
+ Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
779
+ f->ApproximateMemoryUsage(), true /* increase */);
780
+ if (!s.ok()) {
781
+ delete f;
782
+ s = Status::MemoryLimit(
783
+ "Can't allocate " +
784
+ kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
785
+ CacheEntryRole::kFileMetadata)] +
786
+ " due to exceeding the memory limit "
787
+ "based on "
788
+ "cache capacity");
789
+ return s;
790
+ }
791
+ }
792
+
766
793
  auto& add_files = level_state.added_files;
767
794
  assert(add_files.find(file_number) == add_files.end());
768
795
  add_files.emplace(file_number, f);
@@ -1239,13 +1266,13 @@ class VersionBuilder::Rep {
1239
1266
  }
1240
1267
  };
1241
1268
 
1242
- VersionBuilder::VersionBuilder(const FileOptions& file_options,
1243
- const ImmutableCFOptions* ioptions,
1244
- TableCache* table_cache,
1245
- VersionStorageInfo* base_vstorage,
1246
- VersionSet* version_set)
1269
+ VersionBuilder::VersionBuilder(
1270
+ const FileOptions& file_options, const ImmutableCFOptions* ioptions,
1271
+ TableCache* table_cache, VersionStorageInfo* base_vstorage,
1272
+ VersionSet* version_set,
1273
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
1247
1274
  : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
1248
- version_set)) {}
1275
+ version_set, file_metadata_cache_res_mgr)) {}
1249
1276
 
1250
1277
  VersionBuilder::~VersionBuilder() = default;
1251
1278
 
@@ -1280,7 +1307,8 @@ BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
1280
1307
  : version_builder_(new VersionBuilder(
1281
1308
  cfd->current()->version_set()->file_options(), cfd->ioptions(),
1282
1309
  cfd->table_cache(), cfd->current()->storage_info(),
1283
- cfd->current()->version_set())),
1310
+ cfd->current()->version_set(),
1311
+ cfd->GetFileMetadataCacheReservationManager())),
1284
1312
  version_(cfd->current()) {
1285
1313
  version_->Ref();
1286
1314
  }
@@ -1289,7 +1317,8 @@ BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
1289
1317
  ColumnFamilyData* cfd, Version* v)
1290
1318
  : version_builder_(new VersionBuilder(
1291
1319
  cfd->current()->version_set()->file_options(), cfd->ioptions(),
1292
- cfd->table_cache(), v->storage_info(), v->version_set())),
1320
+ cfd->table_cache(), v->storage_info(), v->version_set(),
1321
+ cfd->GetFileMetadataCacheReservationManager())),
1293
1322
  version_(v) {
1294
1323
  assert(version_ != cfd->current());
1295
1324
  }
@@ -25,6 +25,7 @@ class InternalStats;
25
25
  class Version;
26
26
  class VersionSet;
27
27
  class ColumnFamilyData;
28
+ class CacheReservationManager;
28
29
 
29
30
  // A helper class so we can efficiently apply a whole sequence
30
31
  // of edits to a particular state without creating intermediate
@@ -33,7 +34,9 @@ class VersionBuilder {
33
34
  public:
34
35
  VersionBuilder(const FileOptions& file_options,
35
36
  const ImmutableCFOptions* ioptions, TableCache* table_cache,
36
- VersionStorageInfo* base_vstorage, VersionSet* version_set);
37
+ VersionStorageInfo* base_vstorage, VersionSet* version_set,
38
+ std::shared_ptr<CacheReservationManager>
39
+ file_metadata_cache_res_mgr = nullptr);
37
40
  ~VersionBuilder();
38
41
 
39
42
  bool CheckConsistencyForNumLevels();
@@ -19,6 +19,7 @@
19
19
  #include "db/dbformat.h"
20
20
  #include "db/wal_edit.h"
21
21
  #include "memory/arena.h"
22
+ #include "port/malloc.h"
22
23
  #include "rocksdb/advanced_options.h"
23
24
  #include "rocksdb/cache.h"
24
25
  #include "table/table_reader.h"
@@ -293,6 +294,25 @@ struct FileMetaData {
293
294
  }
294
295
  return kUnknownFileCreationTime;
295
296
  }
297
+
298
+ // WARNING: manual update to this function is needed
299
+ // whenever a new string property is added to FileMetaData
300
+ // to reduce approximation error.
301
+ //
302
+ // TODO: eliminate the need of manually updating this function
303
+ // for new string properties
304
+ size_t ApproximateMemoryUsage() const {
305
+ size_t usage = 0;
306
+ #ifdef ROCKSDB_MALLOC_USABLE_SIZE
307
+ usage += malloc_usable_size(const_cast<FileMetaData*>(this));
308
+ #else
309
+ usage += sizeof(*this);
310
+ #endif // ROCKSDB_MALLOC_USABLE_SIZE
311
+ usage += smallest.size() + largest.size() + file_checksum.size() +
312
+ file_checksum_func_name.size() + min_timestamp.size() +
313
+ max_timestamp.size();
314
+ return usage;
315
+ }
296
316
  };
297
317
 
298
318
  // A compressed copy of file meta data that just contain minimum data needed
@@ -775,7 +775,8 @@ Version::~Version() {
775
775
  uint32_t path_id = f->fd.GetPathId();
776
776
  assert(path_id < cfd_->ioptions()->cf_paths.size());
777
777
  vset_->obsolete_files_.push_back(
778
- ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path));
778
+ ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path,
779
+ cfd_->GetFileMetadataCacheReservationManager()));
779
780
  }
780
781
  }
781
782
  }