@nxtedition/rocksdb 7.0.0 → 7.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/binding.cc +38 -40
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +1 -1
  3. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +3 -1
  4. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +28 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_test.cc +5 -2
  8. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +48 -60
  9. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +18 -20
  10. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
  11. package/deps/rocksdb/rocksdb/db/c.cc +5 -0
  12. package/deps/rocksdb/rocksdb/db/column_family.cc +20 -0
  13. package/deps/rocksdb/rocksdb/db/column_family.h +9 -0
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +44 -26
  15. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +32 -14
  16. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +73 -44
  17. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +3 -1
  18. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +6 -1
  19. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +10 -5
  20. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +47 -35
  21. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +2 -1
  22. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +54 -32
  23. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +426 -61
  24. package/deps/rocksdb/rocksdb/db/db_options_test.cc +1 -0
  25. package/deps/rocksdb/rocksdb/db/db_test.cc +102 -24
  26. package/deps/rocksdb/rocksdb/db/db_test2.cc +159 -30
  27. package/deps/rocksdb/rocksdb/db/db_test_util.cc +1 -0
  28. package/deps/rocksdb/rocksdb/db/dbformat.h +1 -1
  29. package/deps/rocksdb/rocksdb/db/version_builder.cc +39 -10
  30. package/deps/rocksdb/rocksdb/db/version_builder.h +4 -1
  31. package/deps/rocksdb/rocksdb/db/version_edit.h +20 -0
  32. package/deps/rocksdb/rocksdb/db/version_set.cc +2 -1
  33. package/deps/rocksdb/rocksdb/db/version_set.h +17 -2
  34. package/deps/rocksdb/rocksdb/db/version_set_test.cc +119 -0
  35. package/deps/rocksdb/rocksdb/db/write_batch.cc +96 -0
  36. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -0
  37. package/deps/rocksdb/rocksdb/db/write_thread.cc +1 -0
  38. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -0
  39. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +9 -0
  40. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +18 -2
  41. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -0
  42. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +12 -0
  43. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +1 -1
  44. package/deps/rocksdb/rocksdb/env/fs_posix.cc +96 -6
  45. package/deps/rocksdb/rocksdb/env/io_posix.cc +51 -18
  46. package/deps/rocksdb/rocksdb/env/io_posix.h +2 -0
  47. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +12 -5
  48. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +22 -6
  49. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +99 -8
  50. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +9 -1
  51. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +3 -0
  52. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -0
  53. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +4 -0
  54. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +1 -1
  55. package/deps/rocksdb/rocksdb/include/rocksdb/io_status.h +7 -0
  56. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +11 -1
  57. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +4 -1
  58. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +14 -1
  59. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +6 -0
  60. package/deps/rocksdb/rocksdb/options/cf_options.cc +12 -1
  61. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  62. package/deps/rocksdb/rocksdb/options/options.cc +8 -1
  63. package/deps/rocksdb/rocksdb/options/options_helper.cc +1 -0
  64. package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -1
  65. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +7 -2
  66. package/deps/rocksdb/rocksdb/options/options_test.cc +52 -0
  67. package/deps/rocksdb/rocksdb/port/port_posix.h +10 -1
  68. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +1 -1
  69. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +1 -1
  70. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
  71. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +5 -5
  72. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +16 -10
  73. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +1 -1
  74. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +1 -1
  75. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
  76. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  77. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +39 -12
  78. package/deps/rocksdb/rocksdb/util/comparator.cc +10 -0
  79. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +1 -1
  80. package/deps/rocksdb/rocksdb/util/xxhash.h +2 -1
  81. package/index.js +2 -2
  82. package/package.json +1 -1
  83. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  84. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -699,8 +699,13 @@ struct ObsoleteFileInfo {
699
699
 
700
700
  ObsoleteFileInfo() noexcept
701
701
  : metadata(nullptr), only_delete_metadata(false) {}
702
- ObsoleteFileInfo(FileMetaData* f, const std::string& file_path)
703
- : metadata(f), path(file_path), only_delete_metadata(false) {}
702
+ ObsoleteFileInfo(FileMetaData* f, const std::string& file_path,
703
+ std::shared_ptr<CacheReservationManager>
704
+ file_metadata_cache_res_mgr_arg = nullptr)
705
+ : metadata(f),
706
+ path(file_path),
707
+ only_delete_metadata(false),
708
+ file_metadata_cache_res_mgr(file_metadata_cache_res_mgr_arg) {}
704
709
 
705
710
  ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
706
711
  ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
@@ -713,13 +718,23 @@ struct ObsoleteFileInfo {
713
718
  path = std::move(rhs.path);
714
719
  metadata = rhs.metadata;
715
720
  rhs.metadata = nullptr;
721
+ file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr;
722
+ rhs.file_metadata_cache_res_mgr = nullptr;
716
723
 
717
724
  return *this;
718
725
  }
719
726
  void DeleteMetadata() {
727
+ if (file_metadata_cache_res_mgr) {
728
+ Status s = file_metadata_cache_res_mgr->UpdateCacheReservation(
729
+ metadata->ApproximateMemoryUsage(), false /* increase */);
730
+ s.PermitUncheckedError();
731
+ }
720
732
  delete metadata;
721
733
  metadata = nullptr;
722
734
  }
735
+
736
+ private:
737
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr;
723
738
  };
724
739
 
725
740
  class ObsoleteBlobFileInfo {
@@ -12,6 +12,7 @@
12
12
  #include <algorithm>
13
13
 
14
14
  #include "db/db_impl/db_impl.h"
15
+ #include "db/db_test_util.h"
15
16
  #include "db/log_writer.h"
16
17
  #include "rocksdb/advanced_options.h"
17
18
  #include "rocksdb/convenience.h"
@@ -3446,6 +3447,124 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
3446
3447
  }
3447
3448
  }
3448
3449
 
3450
+ class ChargeFileMetadataTest : public DBTestBase {
3451
+ public:
3452
+ ChargeFileMetadataTest()
3453
+ : DBTestBase("charge_file_metadata_test", /*env_do_fsync=*/true) {}
3454
+ };
3455
+
3456
+ class ChargeFileMetadataTestWithParam
3457
+ : public ChargeFileMetadataTest,
3458
+ public testing::WithParamInterface<CacheEntryRoleOptions::Decision> {
3459
+ public:
3460
+ ChargeFileMetadataTestWithParam() {}
3461
+ };
3462
+
3463
+ #ifndef ROCKSDB_LITE
3464
+ INSTANTIATE_TEST_CASE_P(
3465
+ ChargeFileMetadataTestWithParam, ChargeFileMetadataTestWithParam,
3466
+ ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled,
3467
+ CacheEntryRoleOptions::Decision::kDisabled));
3468
+
3469
+ TEST_P(ChargeFileMetadataTestWithParam, Basic) {
3470
+ Options options;
3471
+ BlockBasedTableOptions table_options;
3472
+ CacheEntryRoleOptions::Decision charge_file_metadata = GetParam();
3473
+ table_options.cache_usage_options.options_overrides.insert(
3474
+ {CacheEntryRole::kFileMetadata, {/*.charged = */ charge_file_metadata}});
3475
+ std::shared_ptr<TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>
3476
+ file_metadata_charge_only_cache = std::make_shared<
3477
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
3478
+ NewLRUCache(
3479
+ 4 * CacheReservationManagerImpl<
3480
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
3481
+ 0 /* num_shard_bits */, true /* strict_capacity_limit */));
3482
+ table_options.block_cache = file_metadata_charge_only_cache;
3483
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3484
+ options.create_if_missing = true;
3485
+ options.disable_auto_compactions = true;
3486
+ DestroyAndReopen(options);
3487
+
3488
+ // Create 128 file metadata, each of which is roughly 1024 bytes.
3489
+ // This results in 1 *
3490
+ // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
3491
+ // cache reservation for file metadata.
3492
+ for (int i = 1; i <= 128; ++i) {
3493
+ ASSERT_OK(Put(std::string(1024, 'a'), "va"));
3494
+ ASSERT_OK(Put("b", "vb"));
3495
+ ASSERT_OK(Flush());
3496
+ }
3497
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
3498
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
3499
+ 1 * CacheReservationManagerImpl<
3500
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
3501
+
3502
+ } else {
3503
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
3504
+ }
3505
+
3506
+ // Create another 128 file metadata.
3507
+ // This increases the file metadata cache reservation to 2 *
3508
+ // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize().
3509
+ for (int i = 1; i <= 128; ++i) {
3510
+ ASSERT_OK(Put(std::string(1024, 'a'), "vva"));
3511
+ ASSERT_OK(Put("b", "vvb"));
3512
+ ASSERT_OK(Flush());
3513
+ }
3514
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
3515
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
3516
+ 2 * CacheReservationManagerImpl<
3517
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
3518
+ } else {
3519
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
3520
+ }
3521
+ // Compaction will create 1 new file metadata, obsolete and delete all 256
3522
+ // file metadata above. This results in 1 *
3523
+ // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
3524
+ // cache reservation for file metadata.
3525
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
3526
+ ASSERT_EQ("0,1", FilesPerLevel(0));
3527
+
3528
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
3529
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
3530
+ 1 * CacheReservationManagerImpl<
3531
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
3532
+ } else {
3533
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
3534
+ }
3535
+
3536
+ // Destroying the db will delete the remaining 1 new file metadata
3537
+ // This results in no cache reservation for file metadata.
3538
+ Destroy(options);
3539
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
3540
+ 0 * CacheReservationManagerImpl<
3541
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
3542
+
3543
+ // Reopen the db with a smaller cache in order to test failure in allocating
3544
+ // file metadata due to memory limit based on cache capacity
3545
+ file_metadata_charge_only_cache = std::make_shared<
3546
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
3547
+ NewLRUCache(1 * CacheReservationManagerImpl<
3548
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
3549
+ 0 /* num_shard_bits */, true /* strict_capacity_limit */));
3550
+ table_options.block_cache = file_metadata_charge_only_cache;
3551
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3552
+ Reopen(options);
3553
+ ASSERT_OK(Put(std::string(1024, 'a'), "va"));
3554
+ ASSERT_OK(Put("b", "vb"));
3555
+ Status s = Flush();
3556
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
3557
+ EXPECT_TRUE(s.IsMemoryLimit());
3558
+ EXPECT_TRUE(s.ToString().find(
3559
+ kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
3560
+ CacheEntryRole::kFileMetadata)]) != std::string::npos);
3561
+ EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
3562
+ std::string::npos);
3563
+ } else {
3564
+ EXPECT_TRUE(s.ok());
3565
+ }
3566
+ }
3567
+ #endif // ROCKSDB_LITE
3449
3568
  } // namespace ROCKSDB_NAMESPACE
3450
3569
 
3451
3570
  int main(int argc, char** argv) {
@@ -1491,6 +1491,94 @@ Status WriteBatch::UpdateTimestamps(
1491
1491
  return s;
1492
1492
  }
1493
1493
 
1494
+ Status WriteBatch::VerifyChecksum() const {
1495
+ if (prot_info_ == nullptr) {
1496
+ return Status::OK();
1497
+ }
1498
+ Slice input(rep_.data() + WriteBatchInternal::kHeader,
1499
+ rep_.size() - WriteBatchInternal::kHeader);
1500
+ Slice key, value, blob, xid;
1501
+ char tag = 0;
1502
+ uint32_t column_family = 0; // default
1503
+ Status s;
1504
+ size_t prot_info_idx = 0;
1505
+ bool checksum_protected = true;
1506
+ while (!input.empty() && prot_info_idx < prot_info_->entries_.size()) {
1507
+ // In case key/value/column_family are not updated by
1508
+ // ReadRecordFromWriteBatch
1509
+ key.clear();
1510
+ value.clear();
1511
+ column_family = 0;
1512
+ s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
1513
+ &blob, &xid);
1514
+ if (!s.ok()) {
1515
+ return s;
1516
+ }
1517
+ checksum_protected = true;
1518
+ // Write batch checksum uses op_type without ColumnFamily (e.g., if op_type
1519
+ // in the write batch is kTypeColumnFamilyValue, kTypeValue is used to
1520
+ // compute the checksum), and encodes column family id separately. See
1521
+ // comment in first `WriteBatchInternal::Put()` for more detail.
1522
+ switch (tag) {
1523
+ case kTypeColumnFamilyValue:
1524
+ case kTypeValue:
1525
+ tag = kTypeValue;
1526
+ break;
1527
+ case kTypeColumnFamilyDeletion:
1528
+ case kTypeDeletion:
1529
+ tag = kTypeDeletion;
1530
+ break;
1531
+ case kTypeColumnFamilySingleDeletion:
1532
+ case kTypeSingleDeletion:
1533
+ tag = kTypeSingleDeletion;
1534
+ break;
1535
+ case kTypeColumnFamilyRangeDeletion:
1536
+ case kTypeRangeDeletion:
1537
+ tag = kTypeRangeDeletion;
1538
+ break;
1539
+ case kTypeColumnFamilyMerge:
1540
+ case kTypeMerge:
1541
+ tag = kTypeMerge;
1542
+ break;
1543
+ case kTypeColumnFamilyBlobIndex:
1544
+ case kTypeBlobIndex:
1545
+ tag = kTypeBlobIndex;
1546
+ break;
1547
+ case kTypeLogData:
1548
+ case kTypeBeginPrepareXID:
1549
+ case kTypeEndPrepareXID:
1550
+ case kTypeCommitXID:
1551
+ case kTypeRollbackXID:
1552
+ case kTypeNoop:
1553
+ case kTypeBeginPersistedPrepareXID:
1554
+ case kTypeBeginUnprepareXID:
1555
+ case kTypeDeletionWithTimestamp:
1556
+ case kTypeCommitXIDAndTimestamp:
1557
+ checksum_protected = false;
1558
+ break;
1559
+ default:
1560
+ return Status::Corruption(
1561
+ "unknown WriteBatch tag",
1562
+ std::to_string(static_cast<unsigned int>(tag)));
1563
+ }
1564
+ if (checksum_protected) {
1565
+ s = prot_info_->entries_[prot_info_idx++]
1566
+ .StripC(column_family)
1567
+ .StripKVO(key, value, static_cast<ValueType>(tag))
1568
+ .GetStatus();
1569
+ if (!s.ok()) {
1570
+ return s;
1571
+ }
1572
+ }
1573
+ }
1574
+
1575
+ if (prot_info_idx != WriteBatchInternal::Count(this)) {
1576
+ return Status::Corruption("WriteBatch has wrong count");
1577
+ }
1578
+ assert(WriteBatchInternal::Count(this) == prot_info_->entries_.size());
1579
+ return Status::OK();
1580
+ }
1581
+
1494
1582
  namespace {
1495
1583
 
1496
1584
  class MemTableInserter : public WriteBatch::Handler {
@@ -2773,6 +2861,14 @@ Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
2773
2861
  const bool wal_only) {
2774
2862
  assert(dst->Count() == 0 ||
2775
2863
  (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr));
2864
+ if ((src->prot_info_ != nullptr &&
2865
+ src->prot_info_->entries_.size() != src->Count()) ||
2866
+ (dst->prot_info_ != nullptr &&
2867
+ dst->prot_info_->entries_.size() != dst->Count())) {
2868
+ return Status::Corruption(
2869
+ "Write batch has inconsistent count and number of checksums");
2870
+ }
2871
+
2776
2872
  size_t src_len;
2777
2873
  int src_count;
2778
2874
  uint32_t src_flags;
@@ -206,6 +206,10 @@ class WriteBatchInternal {
206
206
  bool batch_per_txn = true,
207
207
  bool hint_per_batch = false);
208
208
 
209
+ // Appends src write batch to dst write batch and updates count in dst
210
+ // write batch. Returns OK if the append is successful. Checks number of
211
+ // checksum against count in dst and src write batches, and returns Corruption
212
+ // if the count is inconsistent.
209
213
  static Status Append(WriteBatch* dst, const WriteBatch* src,
210
214
  const bool WAL_only = false);
211
215
 
@@ -389,6 +389,7 @@ void WriteThread::JoinBatchGroup(Writer* w) {
389
389
  }
390
390
 
391
391
  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
392
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait2", w);
392
393
 
393
394
  if (!linked_as_leader) {
394
395
  /**
@@ -139,6 +139,7 @@ DECLARE_bool(cache_index_and_filter_blocks);
139
139
  DECLARE_bool(charge_compression_dictionary_building_buffer);
140
140
  DECLARE_bool(charge_filter_construction);
141
141
  DECLARE_bool(charge_table_reader);
142
+ DECLARE_bool(charge_file_metadata);
142
143
  DECLARE_int32(top_level_index_pinning);
143
144
  DECLARE_int32(partition_pinning);
144
145
  DECLARE_int32(unpartitioned_pinning);
@@ -297,6 +298,8 @@ DECLARE_bool(verify_sst_unique_id_in_manifest);
297
298
 
298
299
  DECLARE_int32(create_timestamped_snapshot_one_in);
299
300
 
301
+ DECLARE_bool(allow_data_in_errors);
302
+
300
303
  constexpr long KB = 1024;
301
304
  constexpr int kRandomValueMaxFactor = 3;
302
305
  constexpr int kValueMaxLen = 100;
@@ -325,6 +325,11 @@ DEFINE_bool(charge_table_reader, false,
325
325
  "CacheEntryRoleOptions::charged of"
326
326
  "CacheEntryRole::kBlockBasedTableReader");
327
327
 
328
+ DEFINE_bool(charge_file_metadata, false,
329
+ "Setting for "
330
+ "CacheEntryRoleOptions::charged of"
331
+ "kFileMetadata");
332
+
328
333
  DEFINE_int32(
329
334
  top_level_index_pinning,
330
335
  static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
@@ -971,4 +976,8 @@ DEFINE_int32(
971
976
  create_timestamped_snapshot_one_in, 0,
972
977
  "On non-zero, create timestamped snapshots upon transaction commits.");
973
978
 
979
+ DEFINE_bool(allow_data_in_errors,
980
+ ROCKSDB_NAMESPACE::Options().allow_data_in_errors,
981
+ "If true, allow logging data, e.g. key, value in LOG files.");
982
+
974
983
  #endif // GFLAGS
@@ -66,6 +66,7 @@ StressTest::StressTest()
66
66
  #ifndef ROCKSDB_LITE
67
67
  txn_db_(nullptr),
68
68
  #endif
69
+ db_aptr_(nullptr),
69
70
  clock_(db_stress_env->GetSystemClock().get()),
70
71
  new_column_family_name_(1),
71
72
  num_times_reopened_(0),
@@ -129,7 +130,9 @@ std::shared_ptr<Cache> StressTest::NewCache(size_t capacity,
129
130
  }
130
131
  return cache;
131
132
  } else if (FLAGS_cache_type == "fast_lru_cache") {
132
- return NewFastLRUCache((size_t)capacity, num_shard_bits);
133
+ return NewFastLRUCache(static_cast<size_t>(capacity), FLAGS_block_size,
134
+ num_shard_bits, false /*strict_capacity_limit*/,
135
+ kDefaultCacheMetadataChargePolicy);
133
136
  } else if (FLAGS_cache_type == "lru_cache") {
134
137
  LRUCacheOptions opts;
135
138
  opts.capacity = capacity;
@@ -2541,7 +2544,13 @@ void StressTest::Open(SharedState* shared) {
2541
2544
  fflush(stderr);
2542
2545
  }
2543
2546
  assert(s.ok());
2544
- db_ = txn_db_;
2547
+
2548
+ // Do not swap the order of the following.
2549
+ {
2550
+ db_ = txn_db_;
2551
+ db_aptr_.store(txn_db_, std::memory_order_release);
2552
+ }
2553
+
2545
2554
  // after a crash, rollback to commit recovered transactions
2546
2555
  std::vector<Transaction*> trans;
2547
2556
  txn_db_->GetAllPreparedTransactions(&trans);
@@ -2757,6 +2766,11 @@ void InitializeOptionsFromFlags(
2757
2766
  {/*.charged = */ FLAGS_charge_table_reader
2758
2767
  ? CacheEntryRoleOptions::Decision::kEnabled
2759
2768
  : CacheEntryRoleOptions::Decision::kDisabled}});
2769
+ block_based_options.cache_usage_options.options_overrides.insert(
2770
+ {CacheEntryRole::kFileMetadata,
2771
+ {/*.charged = */ FLAGS_charge_file_metadata
2772
+ ? CacheEntryRoleOptions::Decision::kEnabled
2773
+ : CacheEntryRoleOptions::Decision::kDisabled}});
2760
2774
  block_based_options.format_version =
2761
2775
  static_cast<uint32_t>(FLAGS_format_version);
2762
2776
  block_based_options.index_block_restart_interval =
@@ -2917,6 +2931,8 @@ void InitializeOptionsFromFlags(
2917
2931
  if (FLAGS_user_timestamp_size > 0) {
2918
2932
  CheckAndSetOptionsForUserTimestamp(options);
2919
2933
  }
2934
+
2935
+ options.allow_data_in_errors = FLAGS_allow_data_in_errors;
2920
2936
  }
2921
2937
 
2922
2938
  void InitializeOptionsGeneral(
@@ -236,6 +236,10 @@ class StressTest {
236
236
  #ifndef ROCKSDB_LITE
237
237
  TransactionDB* txn_db_;
238
238
  #endif
239
+
240
+ // Currently only used in MultiOpsTxnsStressTest
241
+ std::atomic<DB*> db_aptr_;
242
+
239
243
  Options options_;
240
244
  SystemClock* clock_;
241
245
  std::vector<ColumnFamilyHandle*> column_families_;
@@ -1248,7 +1248,19 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const {
1248
1248
  }
1249
1249
  }
1250
1250
 
1251
+ // VerifyPkSkFast() can be called by MultiOpsTxnsStressListener's callbacks
1252
+ // which can be called before TransactionDB::Open() returns to caller.
1253
+ // Therefore, at that time, db_ and txn_db_ may still be nullptr.
1254
+ // Caller has to make sure that the race condition does not happen.
1251
1255
  void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) {
1256
+ DB* const db = db_aptr_.load(std::memory_order_acquire);
1257
+ if (db == nullptr) {
1258
+ return;
1259
+ }
1260
+
1261
+ assert(db_ == db);
1262
+ assert(db_ != nullptr);
1263
+
1252
1264
  const Snapshot* const snapshot = db_->GetSnapshot();
1253
1265
  assert(snapshot);
1254
1266
  ManagedSnapshot snapshot_guard(db_, snapshot);
@@ -9,8 +9,8 @@
9
9
 
10
10
  #ifdef GFLAGS
11
11
  #include "db_stress_tool/db_stress_common.h"
12
- #include "utilities/fault_injection_fs.h"
13
12
  #include "rocksdb/utilities/transaction_db.h"
13
+ #include "utilities/fault_injection_fs.h"
14
14
 
15
15
  namespace ROCKSDB_NAMESPACE {
16
16
  class NonBatchedOpsStressTest : public StressTest {
@@ -1124,17 +1124,107 @@ class PosixFileSystem : public FileSystem {
1124
1124
  #endif
1125
1125
  }
1126
1126
 
1127
- // TODO akanksha: Look into flags and see how to provide support for AbortIO
1128
- // in posix for IOUring requests. Currently it calls Poll to wait for requests
1129
- // to complete the request.
1130
1127
  virtual IOStatus AbortIO(std::vector<void*>& io_handles) override {
1131
- IOStatus s = Poll(io_handles, io_handles.size());
1128
+ #if defined(ROCKSDB_IOURING_PRESENT)
1129
+ // io_uring_queue_init.
1130
+ struct io_uring* iu = nullptr;
1131
+ if (thread_local_io_urings_) {
1132
+ iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
1133
+ }
1134
+
1135
+ // Init failed, platform doesn't support io_uring.
1132
1136
  // If Poll is not supported then it didn't submit any request and it should
1133
1137
  // return OK.
1134
- if (s.IsNotSupported()) {
1138
+ if (iu == nullptr) {
1135
1139
  return IOStatus::OK();
1136
1140
  }
1137
- return s;
1141
+
1142
+ for (size_t i = 0; i < io_handles.size(); i++) {
1143
+ Posix_IOHandle* posix_handle =
1144
+ static_cast<Posix_IOHandle*>(io_handles[i]);
1145
+ if (posix_handle->is_finished == true) {
1146
+ continue;
1147
+ }
1148
+ assert(posix_handle->iu == iu);
1149
+ if (posix_handle->iu != iu) {
1150
+ return IOStatus::IOError("");
1151
+ }
1152
+
1153
+ // Prepare the cancel request.
1154
+ struct io_uring_sqe* sqe;
1155
+ sqe = io_uring_get_sqe(iu);
1156
+ // prep_cancel changed API in liburing, but we need to support both old
1157
+ // and new versions so do it by hand
1158
+ io_uring_prep_cancel(sqe, 0, 0);
1159
+ sqe->addr = reinterpret_cast<uint64_t>(posix_handle);
1160
+ io_uring_sqe_set_data(sqe, posix_handle);
1161
+
1162
+ // submit the request.
1163
+ ssize_t ret = io_uring_submit(iu);
1164
+ if (ret < 0) {
1165
+ fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
1166
+ return IOStatus::IOError("io_uring_submit() requested but returned " +
1167
+ std::to_string(ret));
1168
+ }
1169
+ }
1170
+
1171
+ // After submitting the requests, wait for the requests.
1172
+ for (size_t i = 0; i < io_handles.size(); i++) {
1173
+ if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) {
1174
+ continue;
1175
+ }
1176
+
1177
+ while (true) {
1178
+ struct io_uring_cqe* cqe = nullptr;
1179
+ ssize_t ret = io_uring_wait_cqe(iu, &cqe);
1180
+ if (ret) {
1181
+ // abort as it shouldn't be in indeterminate state and there is no
1182
+ // good way currently to handle this error.
1183
+ abort();
1184
+ }
1185
+ assert(cqe != nullptr);
1186
+
1187
+ Posix_IOHandle* posix_handle =
1188
+ static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
1189
+ assert(posix_handle->iu == iu);
1190
+ if (posix_handle->iu != iu) {
1191
+ return IOStatus::IOError("");
1192
+ }
1193
+ posix_handle->req_count++;
1194
+
1195
+ // Reset cqe data to catch any stray reuse of it
1196
+ static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
1197
+ io_uring_cqe_seen(iu, cqe);
1198
+
1199
+ // - If the request is cancelled successfully, the original request is
1200
+ // completed with -ECANCELED and the cancel request is completed with
1201
+ // a result of 0.
1202
+ // - If the request was already running, the original may or
1203
+ // may not complete in error. The cancel request will complete with
1204
+ // -EALREADY for that case.
1205
+ // - And finally, if the request to cancel wasn't
1206
+ // found, the cancel request is completed with -ENOENT.
1207
+ //
1208
+ // Every handle has to wait for 2 requests completion: original one and
1209
+ // the cancel request which is tracked by PosixHandle::req_count.
1210
+ if (posix_handle->req_count == 2 &&
1211
+ static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
1212
+ posix_handle->is_finished = true;
1213
+ FSReadRequest req;
1214
+ req.status = IOStatus::Aborted();
1215
+ posix_handle->cb(req, posix_handle->cb_arg);
1216
+
1217
+ break;
1218
+ }
1219
+ }
1220
+ }
1221
+ return IOStatus::OK();
1222
+ #else
1223
+ // If Poll is not supported then it didn't submit any request and it should
1224
+ // return OK.
1225
+ (void)io_handles;
1226
+ return IOStatus::OK();
1227
+ #endif
1138
1228
  }
1139
1229
 
1140
1230
  #if defined(ROCKSDB_IOURING_PRESENT)
@@ -88,22 +88,17 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
88
88
  #endif
89
89
  }
90
90
 
91
- int FadviseForHint(int fd, FSRandomAccessFile::AccessPattern pattern) {
92
- switch (pattern) {
93
- case FSRandomAccessFile::AccessPattern::kNormal:
94
- return Fadvise(fd, 0, 0, POSIX_FADV_NORMAL);
95
- case FSRandomAccessFile::AccessPattern::kRandom:
96
- return Fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
97
- case FSRandomAccessFile::AccessPattern::kSequential:
98
- return Fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
99
- case FSRandomAccessFile::AccessPattern::kWillNeed:
100
- return Fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
101
- case FSRandomAccessFile::AccessPattern::kWontNeed:
102
- return Fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
103
- default:
104
- assert(false);
105
- return 1;
106
- }
91
+ // A wrapper for posix_madvise; if the platform doesn't support madvise,
92
+ // it will simply return 0.
93
+ int Madvise(void* addr, size_t len, int advice) {
94
+ #ifdef OS_LINUX
95
+ return posix_madvise(addr, len, advice);
96
+ #else
97
+ (void)addr;
98
+ (void)len;
99
+ (void)advice;
100
+ return 0; // simply do nothing.
101
+ #endif
107
102
  }
108
103
 
109
104
  namespace {
@@ -839,7 +834,26 @@ void PosixRandomAccessFile::Hint(AccessPattern pattern) {
839
834
  if (use_direct_io()) {
840
835
  return;
841
836
  }
842
- FadviseForHint(fd_, pattern);
837
+ switch (pattern) {
838
+ case kNormal:
839
+ Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
840
+ break;
841
+ case kRandom:
842
+ Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
843
+ break;
844
+ case kSequential:
845
+ Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
846
+ break;
847
+ case kWillNeed:
848
+ Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
849
+ break;
850
+ case kWontNeed:
851
+ Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
852
+ break;
853
+ default:
854
+ assert(false);
855
+ break;
856
+ }
843
857
  }
844
858
 
845
859
  IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
@@ -982,7 +996,26 @@ IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
982
996
  }
983
997
 
984
998
  void PosixMmapReadableFile::Hint(AccessPattern pattern) {
985
- FadviseForHint(fd_, pattern);
999
+ switch (pattern) {
1000
+ case kNormal:
1001
+ Madvise(mmapped_region_, length_, POSIX_MADV_NORMAL);
1002
+ break;
1003
+ case kRandom:
1004
+ Madvise(mmapped_region_, length_, POSIX_MADV_RANDOM);
1005
+ break;
1006
+ case kSequential:
1007
+ Madvise(mmapped_region_, length_, POSIX_MADV_SEQUENTIAL);
1008
+ break;
1009
+ case kWillNeed:
1010
+ Madvise(mmapped_region_, length_, POSIX_MADV_WILLNEED);
1011
+ break;
1012
+ case kWontNeed:
1013
+ Madvise(mmapped_region_, length_, POSIX_MADV_DONTNEED);
1014
+ break;
1015
+ default:
1016
+ assert(false);
1017
+ break;
1018
+ }
986
1019
  }
987
1020
 
988
1021
  IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
@@ -62,6 +62,8 @@ struct Posix_IOHandle {
62
62
  size_t len;
63
63
  char* scratch;
64
64
  bool is_finished = false;
65
+ // req_count is used by AbortIO API to keep track of number of requests.
66
+ uint32_t req_count = 0;
65
67
  };
66
68
 
67
69
  inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name,