@nxtedition/rocksdb 7.0.0 → 7.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +38 -40
- package/deps/rocksdb/rocksdb/CMakeLists.txt +1 -1
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +3 -1
- package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +28 -0
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +5 -2
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +48 -60
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +18 -20
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
- package/deps/rocksdb/rocksdb/db/c.cc +5 -0
- package/deps/rocksdb/rocksdb/db/column_family.cc +20 -0
- package/deps/rocksdb/rocksdb/db/column_family.h +9 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +44 -26
- package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +32 -14
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +73 -44
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +3 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +6 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +10 -5
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +47 -35
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +2 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +54 -32
- package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +426 -61
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +1 -0
- package/deps/rocksdb/rocksdb/db/db_test.cc +102 -24
- package/deps/rocksdb/rocksdb/db/db_test2.cc +159 -30
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +1 -0
- package/deps/rocksdb/rocksdb/db/dbformat.h +1 -1
- package/deps/rocksdb/rocksdb/db/version_builder.cc +39 -10
- package/deps/rocksdb/rocksdb/db/version_builder.h +4 -1
- package/deps/rocksdb/rocksdb/db/version_edit.h +20 -0
- package/deps/rocksdb/rocksdb/db/version_set.cc +2 -1
- package/deps/rocksdb/rocksdb/db/version_set.h +17 -2
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +119 -0
- package/deps/rocksdb/rocksdb/db/write_batch.cc +96 -0
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -0
- package/deps/rocksdb/rocksdb/db/write_thread.cc +1 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +9 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +18 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +12 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +1 -1
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +96 -6
- package/deps/rocksdb/rocksdb/env/io_posix.cc +51 -18
- package/deps/rocksdb/rocksdb/env/io_posix.h +2 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +12 -5
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +22 -6
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +99 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +9 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +4 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/io_status.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +11 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +4 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +14 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +6 -0
- package/deps/rocksdb/rocksdb/options/cf_options.cc +12 -1
- package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
- package/deps/rocksdb/rocksdb/options/options.cc +8 -1
- package/deps/rocksdb/rocksdb/options/options_helper.cc +1 -0
- package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -1
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +7 -2
- package/deps/rocksdb/rocksdb/options/options_test.cc +52 -0
- package/deps/rocksdb/rocksdb/port/port_posix.h +10 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +5 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +16 -10
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +39 -12
- package/deps/rocksdb/rocksdb/util/comparator.cc +10 -0
- package/deps/rocksdb/rocksdb/util/ribbon_alg.h +1 -1
- package/deps/rocksdb/rocksdb/util/xxhash.h +2 -1
- package/index.js +2 -2
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -699,8 +699,13 @@ struct ObsoleteFileInfo {
|
|
|
699
699
|
|
|
700
700
|
ObsoleteFileInfo() noexcept
|
|
701
701
|
: metadata(nullptr), only_delete_metadata(false) {}
|
|
702
|
-
ObsoleteFileInfo(FileMetaData* f, const std::string& file_path
|
|
703
|
-
|
|
702
|
+
ObsoleteFileInfo(FileMetaData* f, const std::string& file_path,
|
|
703
|
+
std::shared_ptr<CacheReservationManager>
|
|
704
|
+
file_metadata_cache_res_mgr_arg = nullptr)
|
|
705
|
+
: metadata(f),
|
|
706
|
+
path(file_path),
|
|
707
|
+
only_delete_metadata(false),
|
|
708
|
+
file_metadata_cache_res_mgr(file_metadata_cache_res_mgr_arg) {}
|
|
704
709
|
|
|
705
710
|
ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
|
|
706
711
|
ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
|
|
@@ -713,13 +718,23 @@ struct ObsoleteFileInfo {
|
|
|
713
718
|
path = std::move(rhs.path);
|
|
714
719
|
metadata = rhs.metadata;
|
|
715
720
|
rhs.metadata = nullptr;
|
|
721
|
+
file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr;
|
|
722
|
+
rhs.file_metadata_cache_res_mgr = nullptr;
|
|
716
723
|
|
|
717
724
|
return *this;
|
|
718
725
|
}
|
|
719
726
|
void DeleteMetadata() {
|
|
727
|
+
if (file_metadata_cache_res_mgr) {
|
|
728
|
+
Status s = file_metadata_cache_res_mgr->UpdateCacheReservation(
|
|
729
|
+
metadata->ApproximateMemoryUsage(), false /* increase */);
|
|
730
|
+
s.PermitUncheckedError();
|
|
731
|
+
}
|
|
720
732
|
delete metadata;
|
|
721
733
|
metadata = nullptr;
|
|
722
734
|
}
|
|
735
|
+
|
|
736
|
+
private:
|
|
737
|
+
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr;
|
|
723
738
|
};
|
|
724
739
|
|
|
725
740
|
class ObsoleteBlobFileInfo {
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
#include <algorithm>
|
|
13
13
|
|
|
14
14
|
#include "db/db_impl/db_impl.h"
|
|
15
|
+
#include "db/db_test_util.h"
|
|
15
16
|
#include "db/log_writer.h"
|
|
16
17
|
#include "rocksdb/advanced_options.h"
|
|
17
18
|
#include "rocksdb/convenience.h"
|
|
@@ -3446,6 +3447,124 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
|
|
|
3446
3447
|
}
|
|
3447
3448
|
}
|
|
3448
3449
|
|
|
3450
|
+
class ChargeFileMetadataTest : public DBTestBase {
|
|
3451
|
+
public:
|
|
3452
|
+
ChargeFileMetadataTest()
|
|
3453
|
+
: DBTestBase("charge_file_metadata_test", /*env_do_fsync=*/true) {}
|
|
3454
|
+
};
|
|
3455
|
+
|
|
3456
|
+
class ChargeFileMetadataTestWithParam
|
|
3457
|
+
: public ChargeFileMetadataTest,
|
|
3458
|
+
public testing::WithParamInterface<CacheEntryRoleOptions::Decision> {
|
|
3459
|
+
public:
|
|
3460
|
+
ChargeFileMetadataTestWithParam() {}
|
|
3461
|
+
};
|
|
3462
|
+
|
|
3463
|
+
#ifndef ROCKSDB_LITE
|
|
3464
|
+
INSTANTIATE_TEST_CASE_P(
|
|
3465
|
+
ChargeFileMetadataTestWithParam, ChargeFileMetadataTestWithParam,
|
|
3466
|
+
::testing::Values(CacheEntryRoleOptions::Decision::kEnabled,
|
|
3467
|
+
CacheEntryRoleOptions::Decision::kDisabled));
|
|
3468
|
+
|
|
3469
|
+
TEST_P(ChargeFileMetadataTestWithParam, Basic) {
|
|
3470
|
+
Options options;
|
|
3471
|
+
BlockBasedTableOptions table_options;
|
|
3472
|
+
CacheEntryRoleOptions::Decision charge_file_metadata = GetParam();
|
|
3473
|
+
table_options.cache_usage_options.options_overrides.insert(
|
|
3474
|
+
{CacheEntryRole::kFileMetadata, {/*.charged = */ charge_file_metadata}});
|
|
3475
|
+
std::shared_ptr<TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>
|
|
3476
|
+
file_metadata_charge_only_cache = std::make_shared<
|
|
3477
|
+
TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
|
|
3478
|
+
NewLRUCache(
|
|
3479
|
+
4 * CacheReservationManagerImpl<
|
|
3480
|
+
CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
|
|
3481
|
+
0 /* num_shard_bits */, true /* strict_capacity_limit */));
|
|
3482
|
+
table_options.block_cache = file_metadata_charge_only_cache;
|
|
3483
|
+
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
3484
|
+
options.create_if_missing = true;
|
|
3485
|
+
options.disable_auto_compactions = true;
|
|
3486
|
+
DestroyAndReopen(options);
|
|
3487
|
+
|
|
3488
|
+
// Create 128 file metadata, each of which is roughly 1024 bytes.
|
|
3489
|
+
// This results in 1 *
|
|
3490
|
+
// CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
|
|
3491
|
+
// cache reservation for file metadata.
|
|
3492
|
+
for (int i = 1; i <= 128; ++i) {
|
|
3493
|
+
ASSERT_OK(Put(std::string(1024, 'a'), "va"));
|
|
3494
|
+
ASSERT_OK(Put("b", "vb"));
|
|
3495
|
+
ASSERT_OK(Flush());
|
|
3496
|
+
}
|
|
3497
|
+
if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
3498
|
+
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
|
|
3499
|
+
1 * CacheReservationManagerImpl<
|
|
3500
|
+
CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
|
|
3501
|
+
|
|
3502
|
+
} else {
|
|
3503
|
+
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
|
|
3504
|
+
}
|
|
3505
|
+
|
|
3506
|
+
// Create another 128 file metadata.
|
|
3507
|
+
// This increases the file metadata cache reservation to 2 *
|
|
3508
|
+
// CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize().
|
|
3509
|
+
for (int i = 1; i <= 128; ++i) {
|
|
3510
|
+
ASSERT_OK(Put(std::string(1024, 'a'), "vva"));
|
|
3511
|
+
ASSERT_OK(Put("b", "vvb"));
|
|
3512
|
+
ASSERT_OK(Flush());
|
|
3513
|
+
}
|
|
3514
|
+
if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
3515
|
+
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
|
|
3516
|
+
2 * CacheReservationManagerImpl<
|
|
3517
|
+
CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
|
|
3518
|
+
} else {
|
|
3519
|
+
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
|
|
3520
|
+
}
|
|
3521
|
+
// Compaction will create 1 new file metadata, obsolete and delete all 256
|
|
3522
|
+
// file metadata above. This results in 1 *
|
|
3523
|
+
// CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
|
|
3524
|
+
// cache reservation for file metadata.
|
|
3525
|
+
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
3526
|
+
ASSERT_EQ("0,1", FilesPerLevel(0));
|
|
3527
|
+
|
|
3528
|
+
if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
3529
|
+
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
|
|
3530
|
+
1 * CacheReservationManagerImpl<
|
|
3531
|
+
CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
|
|
3532
|
+
} else {
|
|
3533
|
+
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
|
|
3534
|
+
}
|
|
3535
|
+
|
|
3536
|
+
// Destroying the db will delete the remaining 1 new file metadata.
|
|
3537
|
+
// This results in no cache reservation for file metadata.
|
|
3538
|
+
Destroy(options);
|
|
3539
|
+
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
|
|
3540
|
+
0 * CacheReservationManagerImpl<
|
|
3541
|
+
CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
|
|
3542
|
+
|
|
3543
|
+
// Reopen the db with a smaller cache in order to test failure in allocating
|
|
3544
|
+
// file metadata due to memory limit based on cache capacity
|
|
3545
|
+
file_metadata_charge_only_cache = std::make_shared<
|
|
3546
|
+
TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
|
|
3547
|
+
NewLRUCache(1 * CacheReservationManagerImpl<
|
|
3548
|
+
CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
|
|
3549
|
+
0 /* num_shard_bits */, true /* strict_capacity_limit */));
|
|
3550
|
+
table_options.block_cache = file_metadata_charge_only_cache;
|
|
3551
|
+
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
3552
|
+
Reopen(options);
|
|
3553
|
+
ASSERT_OK(Put(std::string(1024, 'a'), "va"));
|
|
3554
|
+
ASSERT_OK(Put("b", "vb"));
|
|
3555
|
+
Status s = Flush();
|
|
3556
|
+
if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
3557
|
+
EXPECT_TRUE(s.IsMemoryLimit());
|
|
3558
|
+
EXPECT_TRUE(s.ToString().find(
|
|
3559
|
+
kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
|
|
3560
|
+
CacheEntryRole::kFileMetadata)]) != std::string::npos);
|
|
3561
|
+
EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
|
|
3562
|
+
std::string::npos);
|
|
3563
|
+
} else {
|
|
3564
|
+
EXPECT_TRUE(s.ok());
|
|
3565
|
+
}
|
|
3566
|
+
}
|
|
3567
|
+
#endif // ROCKSDB_LITE
|
|
3449
3568
|
} // namespace ROCKSDB_NAMESPACE
|
|
3450
3569
|
|
|
3451
3570
|
int main(int argc, char** argv) {
|
|
@@ -1491,6 +1491,94 @@ Status WriteBatch::UpdateTimestamps(
|
|
|
1491
1491
|
return s;
|
|
1492
1492
|
}
|
|
1493
1493
|
|
|
1494
|
+
Status WriteBatch::VerifyChecksum() const {
|
|
1495
|
+
if (prot_info_ == nullptr) {
|
|
1496
|
+
return Status::OK();
|
|
1497
|
+
}
|
|
1498
|
+
Slice input(rep_.data() + WriteBatchInternal::kHeader,
|
|
1499
|
+
rep_.size() - WriteBatchInternal::kHeader);
|
|
1500
|
+
Slice key, value, blob, xid;
|
|
1501
|
+
char tag = 0;
|
|
1502
|
+
uint32_t column_family = 0; // default
|
|
1503
|
+
Status s;
|
|
1504
|
+
size_t prot_info_idx = 0;
|
|
1505
|
+
bool checksum_protected = true;
|
|
1506
|
+
while (!input.empty() && prot_info_idx < prot_info_->entries_.size()) {
|
|
1507
|
+
// In case key/value/column_family are not updated by
|
|
1508
|
+
// ReadRecordFromWriteBatch
|
|
1509
|
+
key.clear();
|
|
1510
|
+
value.clear();
|
|
1511
|
+
column_family = 0;
|
|
1512
|
+
s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
|
|
1513
|
+
&blob, &xid);
|
|
1514
|
+
if (!s.ok()) {
|
|
1515
|
+
return s;
|
|
1516
|
+
}
|
|
1517
|
+
checksum_protected = true;
|
|
1518
|
+
// Write batch checksum uses op_type without ColumnFamily (e.g., if op_type
|
|
1519
|
+
// in the write batch is kTypeColumnFamilyValue, kTypeValue is used to
|
|
1520
|
+
// compute the checksum), and encodes column family id separately. See
|
|
1521
|
+
// comment in first `WriteBatchInternal::Put()` for more detail.
|
|
1522
|
+
switch (tag) {
|
|
1523
|
+
case kTypeColumnFamilyValue:
|
|
1524
|
+
case kTypeValue:
|
|
1525
|
+
tag = kTypeValue;
|
|
1526
|
+
break;
|
|
1527
|
+
case kTypeColumnFamilyDeletion:
|
|
1528
|
+
case kTypeDeletion:
|
|
1529
|
+
tag = kTypeDeletion;
|
|
1530
|
+
break;
|
|
1531
|
+
case kTypeColumnFamilySingleDeletion:
|
|
1532
|
+
case kTypeSingleDeletion:
|
|
1533
|
+
tag = kTypeSingleDeletion;
|
|
1534
|
+
break;
|
|
1535
|
+
case kTypeColumnFamilyRangeDeletion:
|
|
1536
|
+
case kTypeRangeDeletion:
|
|
1537
|
+
tag = kTypeRangeDeletion;
|
|
1538
|
+
break;
|
|
1539
|
+
case kTypeColumnFamilyMerge:
|
|
1540
|
+
case kTypeMerge:
|
|
1541
|
+
tag = kTypeMerge;
|
|
1542
|
+
break;
|
|
1543
|
+
case kTypeColumnFamilyBlobIndex:
|
|
1544
|
+
case kTypeBlobIndex:
|
|
1545
|
+
tag = kTypeBlobIndex;
|
|
1546
|
+
break;
|
|
1547
|
+
case kTypeLogData:
|
|
1548
|
+
case kTypeBeginPrepareXID:
|
|
1549
|
+
case kTypeEndPrepareXID:
|
|
1550
|
+
case kTypeCommitXID:
|
|
1551
|
+
case kTypeRollbackXID:
|
|
1552
|
+
case kTypeNoop:
|
|
1553
|
+
case kTypeBeginPersistedPrepareXID:
|
|
1554
|
+
case kTypeBeginUnprepareXID:
|
|
1555
|
+
case kTypeDeletionWithTimestamp:
|
|
1556
|
+
case kTypeCommitXIDAndTimestamp:
|
|
1557
|
+
checksum_protected = false;
|
|
1558
|
+
break;
|
|
1559
|
+
default:
|
|
1560
|
+
return Status::Corruption(
|
|
1561
|
+
"unknown WriteBatch tag",
|
|
1562
|
+
std::to_string(static_cast<unsigned int>(tag)));
|
|
1563
|
+
}
|
|
1564
|
+
if (checksum_protected) {
|
|
1565
|
+
s = prot_info_->entries_[prot_info_idx++]
|
|
1566
|
+
.StripC(column_family)
|
|
1567
|
+
.StripKVO(key, value, static_cast<ValueType>(tag))
|
|
1568
|
+
.GetStatus();
|
|
1569
|
+
if (!s.ok()) {
|
|
1570
|
+
return s;
|
|
1571
|
+
}
|
|
1572
|
+
}
|
|
1573
|
+
}
|
|
1574
|
+
|
|
1575
|
+
if (prot_info_idx != WriteBatchInternal::Count(this)) {
|
|
1576
|
+
return Status::Corruption("WriteBatch has wrong count");
|
|
1577
|
+
}
|
|
1578
|
+
assert(WriteBatchInternal::Count(this) == prot_info_->entries_.size());
|
|
1579
|
+
return Status::OK();
|
|
1580
|
+
}
|
|
1581
|
+
|
|
1494
1582
|
namespace {
|
|
1495
1583
|
|
|
1496
1584
|
class MemTableInserter : public WriteBatch::Handler {
|
|
@@ -2773,6 +2861,14 @@ Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
|
|
|
2773
2861
|
const bool wal_only) {
|
|
2774
2862
|
assert(dst->Count() == 0 ||
|
|
2775
2863
|
(dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr));
|
|
2864
|
+
if ((src->prot_info_ != nullptr &&
|
|
2865
|
+
src->prot_info_->entries_.size() != src->Count()) ||
|
|
2866
|
+
(dst->prot_info_ != nullptr &&
|
|
2867
|
+
dst->prot_info_->entries_.size() != dst->Count())) {
|
|
2868
|
+
return Status::Corruption(
|
|
2869
|
+
"Write batch has inconsistent count and number of checksums");
|
|
2870
|
+
}
|
|
2871
|
+
|
|
2776
2872
|
size_t src_len;
|
|
2777
2873
|
int src_count;
|
|
2778
2874
|
uint32_t src_flags;
|
|
@@ -206,6 +206,10 @@ class WriteBatchInternal {
|
|
|
206
206
|
bool batch_per_txn = true,
|
|
207
207
|
bool hint_per_batch = false);
|
|
208
208
|
|
|
209
|
+
// Appends src write batch to dst write batch and updates count in dst
|
|
210
|
+
// write batch. Returns OK if the append is successful. Checks number of
|
|
211
|
+
// checksums against count in dst and src write batches, and returns Corruption
|
|
212
|
+
// if the count is inconsistent.
|
|
209
213
|
static Status Append(WriteBatch* dst, const WriteBatch* src,
|
|
210
214
|
const bool WAL_only = false);
|
|
211
215
|
|
|
@@ -139,6 +139,7 @@ DECLARE_bool(cache_index_and_filter_blocks);
|
|
|
139
139
|
DECLARE_bool(charge_compression_dictionary_building_buffer);
|
|
140
140
|
DECLARE_bool(charge_filter_construction);
|
|
141
141
|
DECLARE_bool(charge_table_reader);
|
|
142
|
+
DECLARE_bool(charge_file_metadata);
|
|
142
143
|
DECLARE_int32(top_level_index_pinning);
|
|
143
144
|
DECLARE_int32(partition_pinning);
|
|
144
145
|
DECLARE_int32(unpartitioned_pinning);
|
|
@@ -297,6 +298,8 @@ DECLARE_bool(verify_sst_unique_id_in_manifest);
|
|
|
297
298
|
|
|
298
299
|
DECLARE_int32(create_timestamped_snapshot_one_in);
|
|
299
300
|
|
|
301
|
+
DECLARE_bool(allow_data_in_errors);
|
|
302
|
+
|
|
300
303
|
constexpr long KB = 1024;
|
|
301
304
|
constexpr int kRandomValueMaxFactor = 3;
|
|
302
305
|
constexpr int kValueMaxLen = 100;
|
|
@@ -325,6 +325,11 @@ DEFINE_bool(charge_table_reader, false,
|
|
|
325
325
|
"CacheEntryRoleOptions::charged of"
|
|
326
326
|
"CacheEntryRole::kBlockBasedTableReader");
|
|
327
327
|
|
|
328
|
+
DEFINE_bool(charge_file_metadata, false,
|
|
329
|
+
"Setting for "
|
|
330
|
+
"CacheEntryRoleOptions::charged of"
|
|
331
|
+
"kFileMetadata");
|
|
332
|
+
|
|
328
333
|
DEFINE_int32(
|
|
329
334
|
top_level_index_pinning,
|
|
330
335
|
static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
|
|
@@ -971,4 +976,8 @@ DEFINE_int32(
|
|
|
971
976
|
create_timestamped_snapshot_one_in, 0,
|
|
972
977
|
"On non-zero, create timestamped snapshots upon transaction commits.");
|
|
973
978
|
|
|
979
|
+
DEFINE_bool(allow_data_in_errors,
|
|
980
|
+
ROCKSDB_NAMESPACE::Options().allow_data_in_errors,
|
|
981
|
+
"If true, allow logging data, e.g. key, value in LOG files.");
|
|
982
|
+
|
|
974
983
|
#endif // GFLAGS
|
|
@@ -66,6 +66,7 @@ StressTest::StressTest()
|
|
|
66
66
|
#ifndef ROCKSDB_LITE
|
|
67
67
|
txn_db_(nullptr),
|
|
68
68
|
#endif
|
|
69
|
+
db_aptr_(nullptr),
|
|
69
70
|
clock_(db_stress_env->GetSystemClock().get()),
|
|
70
71
|
new_column_family_name_(1),
|
|
71
72
|
num_times_reopened_(0),
|
|
@@ -129,7 +130,9 @@ std::shared_ptr<Cache> StressTest::NewCache(size_t capacity,
|
|
|
129
130
|
}
|
|
130
131
|
return cache;
|
|
131
132
|
} else if (FLAGS_cache_type == "fast_lru_cache") {
|
|
132
|
-
return NewFastLRUCache((
|
|
133
|
+
return NewFastLRUCache(static_cast<size_t>(capacity), FLAGS_block_size,
|
|
134
|
+
num_shard_bits, false /*strict_capacity_limit*/,
|
|
135
|
+
kDefaultCacheMetadataChargePolicy);
|
|
133
136
|
} else if (FLAGS_cache_type == "lru_cache") {
|
|
134
137
|
LRUCacheOptions opts;
|
|
135
138
|
opts.capacity = capacity;
|
|
@@ -2541,7 +2544,13 @@ void StressTest::Open(SharedState* shared) {
|
|
|
2541
2544
|
fflush(stderr);
|
|
2542
2545
|
}
|
|
2543
2546
|
assert(s.ok());
|
|
2544
|
-
|
|
2547
|
+
|
|
2548
|
+
// Do not swap the order: db_ must be set before db_aptr_ is published.
|
|
2549
|
+
{
|
|
2550
|
+
db_ = txn_db_;
|
|
2551
|
+
db_aptr_.store(txn_db_, std::memory_order_release);
|
|
2552
|
+
}
|
|
2553
|
+
|
|
2545
2554
|
// after a crash, rollback to commit recovered transactions
|
|
2546
2555
|
std::vector<Transaction*> trans;
|
|
2547
2556
|
txn_db_->GetAllPreparedTransactions(&trans);
|
|
@@ -2757,6 +2766,11 @@ void InitializeOptionsFromFlags(
|
|
|
2757
2766
|
{/*.charged = */ FLAGS_charge_table_reader
|
|
2758
2767
|
? CacheEntryRoleOptions::Decision::kEnabled
|
|
2759
2768
|
: CacheEntryRoleOptions::Decision::kDisabled}});
|
|
2769
|
+
block_based_options.cache_usage_options.options_overrides.insert(
|
|
2770
|
+
{CacheEntryRole::kFileMetadata,
|
|
2771
|
+
{/*.charged = */ FLAGS_charge_file_metadata
|
|
2772
|
+
? CacheEntryRoleOptions::Decision::kEnabled
|
|
2773
|
+
: CacheEntryRoleOptions::Decision::kDisabled}});
|
|
2760
2774
|
block_based_options.format_version =
|
|
2761
2775
|
static_cast<uint32_t>(FLAGS_format_version);
|
|
2762
2776
|
block_based_options.index_block_restart_interval =
|
|
@@ -2917,6 +2931,8 @@ void InitializeOptionsFromFlags(
|
|
|
2917
2931
|
if (FLAGS_user_timestamp_size > 0) {
|
|
2918
2932
|
CheckAndSetOptionsForUserTimestamp(options);
|
|
2919
2933
|
}
|
|
2934
|
+
|
|
2935
|
+
options.allow_data_in_errors = FLAGS_allow_data_in_errors;
|
|
2920
2936
|
}
|
|
2921
2937
|
|
|
2922
2938
|
void InitializeOptionsGeneral(
|
|
@@ -236,6 +236,10 @@ class StressTest {
|
|
|
236
236
|
#ifndef ROCKSDB_LITE
|
|
237
237
|
TransactionDB* txn_db_;
|
|
238
238
|
#endif
|
|
239
|
+
|
|
240
|
+
// Currently only used in MultiOpsTxnsStressTest
|
|
241
|
+
std::atomic<DB*> db_aptr_;
|
|
242
|
+
|
|
239
243
|
Options options_;
|
|
240
244
|
SystemClock* clock_;
|
|
241
245
|
std::vector<ColumnFamilyHandle*> column_families_;
|
|
@@ -1248,7 +1248,19 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const {
|
|
|
1248
1248
|
}
|
|
1249
1249
|
}
|
|
1250
1250
|
|
|
1251
|
+
// VerifyPkSkFast() can be called by MultiOpsTxnsStressListener's callbacks
|
|
1252
|
+
// which can be called before TransactionDB::Open() returns to caller.
|
|
1253
|
+
// Therefore, at that time, db_ and txn_db_ may still be nullptr.
|
|
1254
|
+
// Caller has to make sure that the race condition does not happen.
|
|
1251
1255
|
void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) {
|
|
1256
|
+
DB* const db = db_aptr_.load(std::memory_order_acquire);
|
|
1257
|
+
if (db == nullptr) {
|
|
1258
|
+
return;
|
|
1259
|
+
}
|
|
1260
|
+
|
|
1261
|
+
assert(db_ == db);
|
|
1262
|
+
assert(db_ != nullptr);
|
|
1263
|
+
|
|
1252
1264
|
const Snapshot* const snapshot = db_->GetSnapshot();
|
|
1253
1265
|
assert(snapshot);
|
|
1254
1266
|
ManagedSnapshot snapshot_guard(db_, snapshot);
|
|
@@ -9,8 +9,8 @@
|
|
|
9
9
|
|
|
10
10
|
#ifdef GFLAGS
|
|
11
11
|
#include "db_stress_tool/db_stress_common.h"
|
|
12
|
-
#include "utilities/fault_injection_fs.h"
|
|
13
12
|
#include "rocksdb/utilities/transaction_db.h"
|
|
13
|
+
#include "utilities/fault_injection_fs.h"
|
|
14
14
|
|
|
15
15
|
namespace ROCKSDB_NAMESPACE {
|
|
16
16
|
class NonBatchedOpsStressTest : public StressTest {
|
|
@@ -1124,17 +1124,107 @@ class PosixFileSystem : public FileSystem {
|
|
|
1124
1124
|
#endif
|
|
1125
1125
|
}
|
|
1126
1126
|
|
|
1127
|
-
// TODO akanksha: Look into flags and see how to provide support for AbortIO
|
|
1128
|
-
// in posix for IOUring requests. Currently it calls Poll to wait for requests
|
|
1129
|
-
// to complete the request.
|
|
1130
1127
|
virtual IOStatus AbortIO(std::vector<void*>& io_handles) override {
|
|
1131
|
-
|
|
1128
|
+
#if defined(ROCKSDB_IOURING_PRESENT)
|
|
1129
|
+
// Fetch the io_uring from thread_local_io_urings_ (cf. io_uring_queue_init).
|
|
1130
|
+
struct io_uring* iu = nullptr;
|
|
1131
|
+
if (thread_local_io_urings_) {
|
|
1132
|
+
iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
|
|
1133
|
+
}
|
|
1134
|
+
|
|
1135
|
+
// Init failed, platform doesn't support io_uring.
|
|
1132
1136
|
// If Poll is not supported then it didn't submit any request and it should
|
|
1133
1137
|
// return OK.
|
|
1134
|
-
if (
|
|
1138
|
+
if (iu == nullptr) {
|
|
1135
1139
|
return IOStatus::OK();
|
|
1136
1140
|
}
|
|
1137
|
-
|
|
1141
|
+
|
|
1142
|
+
for (size_t i = 0; i < io_handles.size(); i++) {
|
|
1143
|
+
Posix_IOHandle* posix_handle =
|
|
1144
|
+
static_cast<Posix_IOHandle*>(io_handles[i]);
|
|
1145
|
+
if (posix_handle->is_finished == true) {
|
|
1146
|
+
continue;
|
|
1147
|
+
}
|
|
1148
|
+
assert(posix_handle->iu == iu);
|
|
1149
|
+
if (posix_handle->iu != iu) {
|
|
1150
|
+
return IOStatus::IOError("");
|
|
1151
|
+
}
|
|
1152
|
+
|
|
1153
|
+
// Prepare the cancel request.
|
|
1154
|
+
struct io_uring_sqe* sqe;
|
|
1155
|
+
sqe = io_uring_get_sqe(iu);
|
|
1156
|
+
// prep_cancel changed API in liburing, but we need to support both old
|
|
1157
|
+
// and new versions so do it by hand
|
|
1158
|
+
io_uring_prep_cancel(sqe, 0, 0);
|
|
1159
|
+
sqe->addr = reinterpret_cast<uint64_t>(posix_handle);
|
|
1160
|
+
io_uring_sqe_set_data(sqe, posix_handle);
|
|
1161
|
+
|
|
1162
|
+
// submit the request.
|
|
1163
|
+
ssize_t ret = io_uring_submit(iu);
|
|
1164
|
+
if (ret < 0) {
|
|
1165
|
+
fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
|
|
1166
|
+
return IOStatus::IOError("io_uring_submit() requested but returned " +
|
|
1167
|
+
std::to_string(ret));
|
|
1168
|
+
}
|
|
1169
|
+
}
|
|
1170
|
+
|
|
1171
|
+
// After submitting the requests, wait for the requests.
|
|
1172
|
+
for (size_t i = 0; i < io_handles.size(); i++) {
|
|
1173
|
+
if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) {
|
|
1174
|
+
continue;
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
while (true) {
|
|
1178
|
+
struct io_uring_cqe* cqe = nullptr;
|
|
1179
|
+
ssize_t ret = io_uring_wait_cqe(iu, &cqe);
|
|
1180
|
+
if (ret) {
|
|
1181
|
+
// abort as it shouldn't be in indeterminate state and there is no
|
|
1182
|
+
// good way currently to handle this error.
|
|
1183
|
+
abort();
|
|
1184
|
+
}
|
|
1185
|
+
assert(cqe != nullptr);
|
|
1186
|
+
|
|
1187
|
+
Posix_IOHandle* posix_handle =
|
|
1188
|
+
static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
|
|
1189
|
+
assert(posix_handle->iu == iu);
|
|
1190
|
+
if (posix_handle->iu != iu) {
|
|
1191
|
+
return IOStatus::IOError("");
|
|
1192
|
+
}
|
|
1193
|
+
posix_handle->req_count++;
|
|
1194
|
+
|
|
1195
|
+
// Reset cqe data to catch any stray reuse of it
|
|
1196
|
+
static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
|
|
1197
|
+
io_uring_cqe_seen(iu, cqe);
|
|
1198
|
+
|
|
1199
|
+
// - If the request is cancelled successfully, the original request is
|
|
1200
|
+
// completed with -ECANCELED and the cancel request is completed with
|
|
1201
|
+
// a result of 0.
|
|
1202
|
+
// - If the request was already running, the original may or
|
|
1203
|
+
// may not complete in error. The cancel request will complete with
|
|
1204
|
+
// -EALREADY for that case.
|
|
1205
|
+
// - And finally, if the request to cancel wasn't
|
|
1206
|
+
// found, the cancel request is completed with -ENOENT.
|
|
1207
|
+
//
|
|
1208
|
+
// Every handle has to wait for 2 requests completion: original one and
|
|
1209
|
+
// the cancel request which is tracked by Posix_IOHandle::req_count.
|
|
1210
|
+
if (posix_handle->req_count == 2 &&
|
|
1211
|
+
static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
|
|
1212
|
+
posix_handle->is_finished = true;
|
|
1213
|
+
FSReadRequest req;
|
|
1214
|
+
req.status = IOStatus::Aborted();
|
|
1215
|
+
posix_handle->cb(req, posix_handle->cb_arg);
|
|
1216
|
+
|
|
1217
|
+
break;
|
|
1218
|
+
}
|
|
1219
|
+
}
|
|
1220
|
+
}
|
|
1221
|
+
return IOStatus::OK();
|
|
1222
|
+
#else
|
|
1223
|
+
// If Poll is not supported then it didn't submit any request and it should
|
|
1224
|
+
// return OK.
|
|
1225
|
+
(void)io_handles;
|
|
1226
|
+
return IOStatus::OK();
|
|
1227
|
+
#endif
|
|
1138
1228
|
}
|
|
1139
1229
|
|
|
1140
1230
|
#if defined(ROCKSDB_IOURING_PRESENT)
|
|
@@ -88,22 +88,17 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
|
|
|
88
88
|
#endif
|
|
89
89
|
}
|
|
90
90
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
return Fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
|
|
103
|
-
default:
|
|
104
|
-
assert(false);
|
|
105
|
-
return 1;
|
|
106
|
-
}
|
|
91
|
+
// A wrapper for posix_madvise; if the platform doesn't support madvise,
|
|
92
|
+
// it will simply return 0.
|
|
93
|
+
int Madvise(void* addr, size_t len, int advice) {
|
|
94
|
+
#ifdef OS_LINUX
|
|
95
|
+
return posix_madvise(addr, len, advice);
|
|
96
|
+
#else
|
|
97
|
+
(void)addr;
|
|
98
|
+
(void)len;
|
|
99
|
+
(void)advice;
|
|
100
|
+
return 0; // simply do nothing.
|
|
101
|
+
#endif
|
|
107
102
|
}
|
|
108
103
|
|
|
109
104
|
namespace {
|
|
@@ -839,7 +834,26 @@ void PosixRandomAccessFile::Hint(AccessPattern pattern) {
|
|
|
839
834
|
if (use_direct_io()) {
|
|
840
835
|
return;
|
|
841
836
|
}
|
|
842
|
-
|
|
837
|
+
switch (pattern) {
|
|
838
|
+
case kNormal:
|
|
839
|
+
Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
|
|
840
|
+
break;
|
|
841
|
+
case kRandom:
|
|
842
|
+
Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
|
|
843
|
+
break;
|
|
844
|
+
case kSequential:
|
|
845
|
+
Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
|
|
846
|
+
break;
|
|
847
|
+
case kWillNeed:
|
|
848
|
+
Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
|
|
849
|
+
break;
|
|
850
|
+
case kWontNeed:
|
|
851
|
+
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
|
|
852
|
+
break;
|
|
853
|
+
default:
|
|
854
|
+
assert(false);
|
|
855
|
+
break;
|
|
856
|
+
}
|
|
843
857
|
}
|
|
844
858
|
|
|
845
859
|
IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
|
|
@@ -982,7 +996,26 @@ IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
|
|
|
982
996
|
}
|
|
983
997
|
|
|
984
998
|
void PosixMmapReadableFile::Hint(AccessPattern pattern) {
|
|
985
|
-
|
|
999
|
+
switch (pattern) {
|
|
1000
|
+
case kNormal:
|
|
1001
|
+
Madvise(mmapped_region_, length_, POSIX_MADV_NORMAL);
|
|
1002
|
+
break;
|
|
1003
|
+
case kRandom:
|
|
1004
|
+
Madvise(mmapped_region_, length_, POSIX_MADV_RANDOM);
|
|
1005
|
+
break;
|
|
1006
|
+
case kSequential:
|
|
1007
|
+
Madvise(mmapped_region_, length_, POSIX_MADV_SEQUENTIAL);
|
|
1008
|
+
break;
|
|
1009
|
+
case kWillNeed:
|
|
1010
|
+
Madvise(mmapped_region_, length_, POSIX_MADV_WILLNEED);
|
|
1011
|
+
break;
|
|
1012
|
+
case kWontNeed:
|
|
1013
|
+
Madvise(mmapped_region_, length_, POSIX_MADV_DONTNEED);
|
|
1014
|
+
break;
|
|
1015
|
+
default:
|
|
1016
|
+
assert(false);
|
|
1017
|
+
break;
|
|
1018
|
+
}
|
|
986
1019
|
}
|
|
987
1020
|
|
|
988
1021
|
IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
|
|
@@ -62,6 +62,8 @@ struct Posix_IOHandle {
|
|
|
62
62
|
size_t len;
|
|
63
63
|
char* scratch;
|
|
64
64
|
bool is_finished = false;
|
|
65
|
+
// req_count is used by AbortIO API to keep track of number of requests.
|
|
66
|
+
uint32_t req_count = 0;
|
|
65
67
|
};
|
|
66
68
|
|
|
67
69
|
inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name,
|