@nxtedition/rocksdb 7.1.4 → 7.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/deps/rocksdb/iostats.patch +19 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +15 -1
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +93 -58
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +88 -40
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +57 -32
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +103 -28
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +33 -1
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +177 -38
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +3 -1
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +2 -2
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +125 -71
- package/deps/rocksdb/rocksdb/crash_test.mk +15 -1
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +2 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_index.h +1 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_log_format.cc +3 -5
- package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.cc +25 -19
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
- package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +36 -0
- package/deps/rocksdb/rocksdb/db/column_family.cc +2 -15
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +17 -4
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +8 -8
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +0 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +5 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +50 -52
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +33 -11
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +41 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +1 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +143 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +43 -18
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +48 -65
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +1 -0
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +73 -4
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +17 -8
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +71 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +144 -33
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +18 -35
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +11 -5
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +7 -7
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +15 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +2 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +3 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +11 -0
- package/deps/rocksdb/rocksdb/db/db_iter.cc +69 -11
- package/deps/rocksdb/rocksdb/db/db_iter.h +16 -0
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +42 -0
- package/deps/rocksdb/rocksdb/db/db_test.cc +61 -28
- package/deps/rocksdb/rocksdb/db/db_test2.cc +18 -7
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +17 -0
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +61 -0
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +130 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +7 -8
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +1 -2
- package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -7
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -1
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +4 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +1 -1
- package/deps/rocksdb/rocksdb/db/log_reader.cc +48 -11
- package/deps/rocksdb/rocksdb/db/log_reader.h +8 -2
- package/deps/rocksdb/rocksdb/db/log_test.cc +10 -1
- package/deps/rocksdb/rocksdb/db/log_writer.cc +7 -1
- package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/memtable.cc +49 -14
- package/deps/rocksdb/rocksdb/db/memtable.h +60 -14
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +14 -8
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +30 -10
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +5 -5
- package/deps/rocksdb/rocksdb/db/pinned_iterators_manager.h +5 -0
- package/deps/rocksdb/rocksdb/db/repair.cc +2 -3
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +3 -7
- package/deps/rocksdb/rocksdb/db/table_cache.cc +72 -0
- package/deps/rocksdb/rocksdb/db/table_cache.h +19 -1
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +8 -14
- package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +2 -2
- package/deps/rocksdb/rocksdb/db/version_builder_test.cc +35 -64
- package/deps/rocksdb/rocksdb/db/version_edit.cc +3 -32
- package/deps/rocksdb/rocksdb/db/version_edit.h +2 -12
- package/deps/rocksdb/rocksdb/db/version_edit_test.cc +10 -23
- package/deps/rocksdb/rocksdb/db/version_set.cc +34 -10
- package/deps/rocksdb/rocksdb/db/version_set.h +3 -3
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -6
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +17 -15
- package/deps/rocksdb/rocksdb/db/wal_manager.cc +0 -4
- package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +137 -42
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +21 -0
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +1 -0
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/write_callback_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/write_thread.cc +51 -46
- package/deps/rocksdb/rocksdb/db/write_thread.h +0 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +6 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +6 -0
- package/deps/rocksdb/rocksdb/env/env_posix.cc +1 -1
- package/deps/rocksdb/rocksdb/env/env_test.cc +38 -8
- package/deps/rocksdb/rocksdb/env/file_system.cc +20 -0
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +2 -46
- package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -0
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +110 -5
- package/deps/rocksdb/rocksdb/file/writable_file_writer.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +14 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +4 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +10 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +2 -0
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +12 -0
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +9 -13
- package/deps/rocksdb/rocksdb/logging/env_logger.h +39 -13
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +1 -1
- package/deps/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc +1 -1
- package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +6 -0
- package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +4 -1
- package/deps/rocksdb/rocksdb/options/cf_options.cc +6 -3
- package/deps/rocksdb/rocksdb/options/cf_options.h +6 -5
- package/deps/rocksdb/rocksdb/options/options_helper.cc +2 -1
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +1 -0
- package/deps/rocksdb/rocksdb/options/options_test.cc +4 -2
- package/deps/rocksdb/rocksdb/port/util_logger.h +1 -3
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +50 -8
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +7 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +28 -10
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +5 -2
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +1 -0
- package/deps/rocksdb/rocksdb/table/get_context.cc +16 -6
- package/deps/rocksdb/rocksdb/table/table_reader.h +9 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +2 -1
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +14 -1
- package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +5 -2
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +7 -8
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +6 -6
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +1 -1
- package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +2 -0
- package/deps/rocksdb/rocksdb/util/stderr_logger.h +13 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +55 -46
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/counted_fs.cc +10 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +2 -2
- package/index.js +2 -2
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
- package/deps/rocksdb/rocksdb/logging/posix_logger.h +0 -179
|
@@ -6676,7 +6676,7 @@ TEST_P(RenameCurrentTest, Compaction) {
|
|
|
6676
6676
|
ASSERT_EQ("d_value", Get("d"));
|
|
6677
6677
|
}
|
|
6678
6678
|
|
|
6679
|
-
TEST_F(DBTest2, BottommostTemperature) {
|
|
6679
|
+
TEST_F(DBTest2, LastLevelTemperature) {
|
|
6680
6680
|
class TestListener : public EventListener {
|
|
6681
6681
|
public:
|
|
6682
6682
|
void OnFileReadFinish(const FileOperationInfo& info) override {
|
|
@@ -6730,11 +6730,16 @@ TEST_F(DBTest2, BottommostTemperature) {
|
|
|
6730
6730
|
port::Mutex mutex_;
|
|
6731
6731
|
};
|
|
6732
6732
|
|
|
6733
|
+
const int kNumLevels = 7;
|
|
6734
|
+
const int kLastLevel = kNumLevels - 1;
|
|
6735
|
+
|
|
6733
6736
|
auto* listener = new TestListener();
|
|
6734
6737
|
|
|
6735
6738
|
Options options = CurrentOptions();
|
|
6736
6739
|
options.bottommost_temperature = Temperature::kWarm;
|
|
6737
6740
|
options.level0_file_num_compaction_trigger = 2;
|
|
6741
|
+
options.level_compaction_dynamic_level_bytes = true;
|
|
6742
|
+
options.num_levels = kNumLevels;
|
|
6738
6743
|
options.statistics = CreateDBStatistics();
|
|
6739
6744
|
options.listeners.emplace_back(listener);
|
|
6740
6745
|
Reopen(options);
|
|
@@ -6760,7 +6765,7 @@ TEST_F(DBTest2, BottommostTemperature) {
|
|
|
6760
6765
|
ColumnFamilyMetaData metadata;
|
|
6761
6766
|
db_->GetColumnFamilyMetaData(&metadata);
|
|
6762
6767
|
ASSERT_EQ(1, metadata.file_count);
|
|
6763
|
-
SstFileMetaData meta = metadata.levels[
|
|
6768
|
+
SstFileMetaData meta = metadata.levels[kLastLevel].files[0];
|
|
6764
6769
|
ASSERT_EQ(Temperature::kWarm, meta.temperature);
|
|
6765
6770
|
uint64_t number;
|
|
6766
6771
|
FileType type;
|
|
@@ -6818,7 +6823,7 @@ TEST_F(DBTest2, BottommostTemperature) {
|
|
|
6818
6823
|
ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
|
|
6819
6824
|
ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
|
|
6820
6825
|
|
|
6821
|
-
meta = metadata.levels[
|
|
6826
|
+
meta = metadata.levels[kLastLevel].files[0];
|
|
6822
6827
|
ASSERT_EQ(Temperature::kWarm, meta.temperature);
|
|
6823
6828
|
ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
|
|
6824
6829
|
ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
|
|
@@ -6837,7 +6842,7 @@ TEST_F(DBTest2, BottommostTemperature) {
|
|
|
6837
6842
|
ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
|
|
6838
6843
|
ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
|
|
6839
6844
|
|
|
6840
|
-
meta = metadata.levels[
|
|
6845
|
+
meta = metadata.levels[kLastLevel].files[0];
|
|
6841
6846
|
ASSERT_EQ(Temperature::kWarm, meta.temperature);
|
|
6842
6847
|
ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
|
|
6843
6848
|
ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
|
|
@@ -6865,13 +6870,13 @@ TEST_F(DBTest2, BottommostTemperature) {
|
|
|
6865
6870
|
ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
|
|
6866
6871
|
ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
|
|
6867
6872
|
|
|
6868
|
-
meta = metadata.levels[
|
|
6873
|
+
meta = metadata.levels[kLastLevel].files[0];
|
|
6869
6874
|
ASSERT_EQ(Temperature::kWarm, meta.temperature);
|
|
6870
6875
|
ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
|
|
6871
6876
|
ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
|
|
6872
6877
|
}
|
|
6873
6878
|
|
|
6874
|
-
TEST_F(DBTest2, BottommostTemperatureUniversal) {
|
|
6879
|
+
TEST_F(DBTest2, LastLevelTemperatureUniversal) {
|
|
6875
6880
|
const int kTriggerNum = 3;
|
|
6876
6881
|
const int kNumLevels = 5;
|
|
6877
6882
|
const int kBottommostLevel = kNumLevels - 1;
|
|
@@ -6997,7 +7002,7 @@ TEST_F(DBTest2, BottommostTemperatureUniversal) {
|
|
|
6997
7002
|
ASSERT_EQ(std::atoi(prop.c_str()), 0);
|
|
6998
7003
|
|
|
6999
7004
|
// Update bottommost temperature dynamically with SetOptions
|
|
7000
|
-
auto s = db_->SetOptions({{"
|
|
7005
|
+
auto s = db_->SetOptions({{"last_level_temperature", "kCold"}});
|
|
7001
7006
|
ASSERT_OK(s);
|
|
7002
7007
|
ASSERT_EQ(db_->GetOptions().bottommost_temperature, Temperature::kCold);
|
|
7003
7008
|
db_->GetColumnFamilyMetaData(&metadata);
|
|
@@ -7097,6 +7102,9 @@ TEST_F(DBTest2, CheckpointFileTemperature) {
|
|
|
7097
7102
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
|
|
7098
7103
|
Options options = CurrentOptions();
|
|
7099
7104
|
options.bottommost_temperature = Temperature::kWarm;
|
|
7105
|
+
// set dynamic_level to true so the compaction would compact the data to the
|
|
7106
|
+
// last level directly which will have the last_level_temperature
|
|
7107
|
+
options.level_compaction_dynamic_level_bytes = true;
|
|
7100
7108
|
options.level0_file_num_compaction_trigger = 2;
|
|
7101
7109
|
options.env = env.get();
|
|
7102
7110
|
Reopen(options);
|
|
@@ -7153,6 +7161,9 @@ TEST_F(DBTest2, FileTemperatureManifestFixup) {
|
|
|
7153
7161
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
|
|
7154
7162
|
Options options = CurrentOptions();
|
|
7155
7163
|
options.bottommost_temperature = Temperature::kWarm;
|
|
7164
|
+
// set dynamic_level to true so the compaction would compact the data to the
|
|
7165
|
+
// last level directly which will have the last_level_temperature
|
|
7166
|
+
options.level_compaction_dynamic_level_bytes = true;
|
|
7156
7167
|
options.level0_file_num_compaction_trigger = 2;
|
|
7157
7168
|
options.env = env.get();
|
|
7158
7169
|
std::vector<std::string> cfs = {/*"default",*/ "test1", "test2"};
|
|
@@ -1523,8 +1523,25 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
|
|
|
1523
1523
|
/*wait=*/false, /*allow_write_stall=*/true, handles_[0]));
|
|
1524
1524
|
|
|
1525
1525
|
bool called = false;
|
|
1526
|
+
std::atomic<int> bg_flush_threads{0};
|
|
1527
|
+
std::atomic<bool> wal_synced{false};
|
|
1526
1528
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
1527
1529
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
1530
|
+
SyncPoint::GetInstance()->SetCallBack(
|
|
1531
|
+
"DBImpl::BackgroundCallFlush:start", [&](void* /*arg*/) {
|
|
1532
|
+
int cur = bg_flush_threads.load();
|
|
1533
|
+
int desired = cur + 1;
|
|
1534
|
+
if (cur > 0 ||
|
|
1535
|
+
!bg_flush_threads.compare_exchange_strong(cur, desired)) {
|
|
1536
|
+
while (!wal_synced.load()) {
|
|
1537
|
+
// Wait until the other bg flush thread finishes committing WAL sync
|
|
1538
|
+
// operation to the MANIFEST.
|
|
1539
|
+
}
|
|
1540
|
+
}
|
|
1541
|
+
});
|
|
1542
|
+
SyncPoint::GetInstance()->SetCallBack(
|
|
1543
|
+
"DBImpl::FlushMemTableToOutputFile:CommitWal:1",
|
|
1544
|
+
[&](void* /*arg*/) { wal_synced.store(true); });
|
|
1528
1545
|
// This callback will be called when the first bg flush thread reaches the
|
|
1529
1546
|
// point before entering the MANIFEST write queue after flushing the SST
|
|
1530
1547
|
// file.
|
|
@@ -109,6 +109,67 @@ TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) {
|
|
|
109
109
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
+
TEST_F(TimestampCompatibleCompactionTest, MultipleSubCompactions) {
|
|
113
|
+
Options options = CurrentOptions();
|
|
114
|
+
options.env = env_;
|
|
115
|
+
options.compaction_style = kCompactionStyleUniversal;
|
|
116
|
+
options.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
|
117
|
+
options.level0_file_num_compaction_trigger = 3;
|
|
118
|
+
options.max_subcompactions = 3;
|
|
119
|
+
options.target_file_size_base = 1024;
|
|
120
|
+
options.statistics = CreateDBStatistics();
|
|
121
|
+
DestroyAndReopen(options);
|
|
122
|
+
|
|
123
|
+
uint64_t ts = 100;
|
|
124
|
+
uint64_t key = 0;
|
|
125
|
+
WriteOptions write_opts;
|
|
126
|
+
|
|
127
|
+
// Write keys 0, 1, ..., 499 with ts from 100 to 599.
|
|
128
|
+
{
|
|
129
|
+
for (; key <= 499; ++key, ++ts) {
|
|
130
|
+
std::string ts_str = Timestamp(ts);
|
|
131
|
+
ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str,
|
|
132
|
+
"foo_" + std::to_string(key)));
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// Write keys 500, ..., 999 with ts from 600 to 1099.
|
|
137
|
+
{
|
|
138
|
+
for (; key <= 999; ++key, ++ts) {
|
|
139
|
+
std::string ts_str = Timestamp(ts);
|
|
140
|
+
ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str,
|
|
141
|
+
"foo_" + std::to_string(key)));
|
|
142
|
+
}
|
|
143
|
+
ASSERT_OK(Flush());
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
// Wait for compaction to finish
|
|
147
|
+
{
|
|
148
|
+
ASSERT_OK(dbfull()->RunManualCompaction(
|
|
149
|
+
static_cast_with_check<ColumnFamilyHandleImpl>(
|
|
150
|
+
db_->DefaultColumnFamily())
|
|
151
|
+
->cfd(),
|
|
152
|
+
0 /* input_level */, 1 /* output_level */, CompactRangeOptions(),
|
|
153
|
+
nullptr /* begin */, nullptr /* end */, true /* exclusive */,
|
|
154
|
+
true /* disallow_trivial_move */,
|
|
155
|
+
std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
|
|
156
|
+
"" /*trim_ts*/));
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
// Check stats to make sure multiple subcompactions were scheduled for
|
|
160
|
+
// boundaries not to be nullptr.
|
|
161
|
+
{
|
|
162
|
+
HistogramData num_sub_compactions;
|
|
163
|
+
options.statistics->histogramData(NUM_SUBCOMPACTIONS_SCHEDULED,
|
|
164
|
+
&num_sub_compactions);
|
|
165
|
+
ASSERT_GT(num_sub_compactions.sum, 1);
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
for (key = 0; key <= 999; ++key) {
|
|
169
|
+
ASSERT_EQ("foo_" + std::to_string(key), Get(Key1(key), ts));
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
|
|
112
173
|
} // namespace ROCKSDB_NAMESPACE
|
|
113
174
|
|
|
114
175
|
int main(int argc, char** argv) {
|
|
@@ -31,6 +31,12 @@ class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
|
|
|
31
31
|
void Open() { DBTestBase::Reopen(GetOptions()); }
|
|
32
32
|
};
|
|
33
33
|
|
|
34
|
+
class DBWriteTestUnparameterized : public DBTestBase {
|
|
35
|
+
public:
|
|
36
|
+
explicit DBWriteTestUnparameterized()
|
|
37
|
+
: DBTestBase("pipelined_write_test", /*env_do_fsync=*/false) {}
|
|
38
|
+
};
|
|
39
|
+
|
|
34
40
|
// It is invalid to do sync write while disabling WAL.
|
|
35
41
|
TEST_P(DBWriteTest, SyncAndDisableWAL) {
|
|
36
42
|
WriteOptions write_options;
|
|
@@ -318,6 +324,130 @@ TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
|
|
|
318
324
|
Close();
|
|
319
325
|
}
|
|
320
326
|
|
|
327
|
+
TEST_F(DBWriteTestUnparameterized, PipelinedWriteRace) {
|
|
328
|
+
// This test was written to trigger a race in ExitAsBatchGroupLeader in case
|
|
329
|
+
// enable_pipelined_write_ was true.
|
|
330
|
+
// Writers for which ShouldWriteToMemtable() evaluates to false are removed
|
|
331
|
+
// from the write_group via CompleteFollower/ CompleteLeader. Writers in the
|
|
332
|
+
// middle of the group are fully unlinked, but if that writer is the
|
|
333
|
+
// last_writer, then we did not update the predecessor's link_older, i.e.,
|
|
334
|
+
// this writer was still reachable via newest_writer_.
|
|
335
|
+
//
|
|
336
|
+
// But the problem was, that CompleteFollower already wakes up the thread
|
|
337
|
+
// owning that writer before the writer has been removed. This resulted in a
|
|
338
|
+
// race - if the leader thread was fast enough, then everything was fine.
|
|
339
|
+
// However, if the woken up thread finished the current write operation and
|
|
340
|
+
// then performed yet another write, then a new writer instance was added
|
|
341
|
+
// to newest_writer_. It is possible that the new writer is located on the
|
|
342
|
+
// same address on stack, and if this happened, then we had a problem,
|
|
343
|
+
// because the old code tried to find the last_writer in the list to unlink
|
|
344
|
+
// it, which in this case produced a cycle in the list.
|
|
345
|
+
// Whether two invocations of PipelinedWriteImpl() by the same thread actually
|
|
346
|
+
// allocate the writer on the same address depends on the OS and/or compiler,
|
|
347
|
+
// so it is rather hard to create a deterministic test for this.
|
|
348
|
+
|
|
349
|
+
Options options = GetDefaultOptions();
|
|
350
|
+
options.create_if_missing = true;
|
|
351
|
+
options.enable_pipelined_write = true;
|
|
352
|
+
std::vector<port::Thread> threads;
|
|
353
|
+
|
|
354
|
+
std::atomic<int> write_counter{0};
|
|
355
|
+
std::atomic<int> active_writers{0};
|
|
356
|
+
std::atomic<bool> second_write_starting{false};
|
|
357
|
+
std::atomic<bool> second_write_in_progress{false};
|
|
358
|
+
std::atomic<WriteThread::Writer*> leader{nullptr};
|
|
359
|
+
std::atomic<bool> finished_WAL_write{false};
|
|
360
|
+
|
|
361
|
+
DestroyAndReopen(options);
|
|
362
|
+
|
|
363
|
+
auto write_one_doc = [&]() {
|
|
364
|
+
int a = write_counter.fetch_add(1);
|
|
365
|
+
std::string key = "foo" + std::to_string(a);
|
|
366
|
+
WriteOptions wo;
|
|
367
|
+
ASSERT_OK(dbfull()->Put(wo, key, "bar"));
|
|
368
|
+
--active_writers;
|
|
369
|
+
};
|
|
370
|
+
|
|
371
|
+
auto write_two_docs = [&]() {
|
|
372
|
+
write_one_doc();
|
|
373
|
+
second_write_starting = true;
|
|
374
|
+
write_one_doc();
|
|
375
|
+
};
|
|
376
|
+
|
|
377
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
378
|
+
"WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
|
|
379
|
+
if (second_write_starting.load()) {
|
|
380
|
+
second_write_in_progress = true;
|
|
381
|
+
return;
|
|
382
|
+
}
|
|
383
|
+
auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
|
|
384
|
+
if (w->state == WriteThread::STATE_GROUP_LEADER) {
|
|
385
|
+
active_writers++;
|
|
386
|
+
if (leader.load() == nullptr) {
|
|
387
|
+
leader.store(w);
|
|
388
|
+
while (active_writers.load() < 2) {
|
|
389
|
+
// wait for another thread to join the write_group
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
} else {
|
|
393
|
+
// we disable the memtable for all followers so that they are
|
|
394
|
+
// removed from the write_group before enqueuing it for the memtable
|
|
395
|
+
// write
|
|
396
|
+
w->disable_memtable = true;
|
|
397
|
+
active_writers++;
|
|
398
|
+
}
|
|
399
|
+
});
|
|
400
|
+
|
|
401
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
402
|
+
"WriteThread::ExitAsBatchGroupLeader:Start", [&](void* arg) {
|
|
403
|
+
auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
|
|
404
|
+
if (wg->leader == leader && !finished_WAL_write) {
|
|
405
|
+
finished_WAL_write = true;
|
|
406
|
+
while (active_writers.load() < 3) {
|
|
407
|
+
// wait for the new writer to be enqueued
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
});
|
|
411
|
+
|
|
412
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
413
|
+
"WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters",
|
|
414
|
+
[&](void* arg) {
|
|
415
|
+
auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
|
|
416
|
+
if (wg->leader == leader) {
|
|
417
|
+
while (!second_write_in_progress.load()) {
|
|
418
|
+
// wait for the old follower thread to start the next write
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
});
|
|
422
|
+
|
|
423
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
424
|
+
|
|
425
|
+
// start leader + one follower
|
|
426
|
+
threads.emplace_back(write_one_doc);
|
|
427
|
+
while (leader.load() == nullptr) {
|
|
428
|
+
// wait for leader
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
// we perform two writes in the follower, so that for the second write
|
|
432
|
+
// the thread reinserts a Writer with the same address
|
|
433
|
+
threads.emplace_back(write_two_docs);
|
|
434
|
+
|
|
435
|
+
// wait for the leader to enter ExitAsBatchGroupLeader
|
|
436
|
+
while (!finished_WAL_write.load()) {
|
|
437
|
+
// wait for write_group to have finished the WAL writes
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
// start another writer thread to be enqueued before the leader can
|
|
441
|
+
// complete the writers from its write_group
|
|
442
|
+
threads.emplace_back(write_one_doc);
|
|
443
|
+
|
|
444
|
+
for (auto& t : threads) {
|
|
445
|
+
t.join();
|
|
446
|
+
}
|
|
447
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
448
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
449
|
+
}
|
|
450
|
+
|
|
321
451
|
TEST_P(DBWriteTest, ManualWalFlushInEffect) {
|
|
322
452
|
Options options = GetOptions();
|
|
323
453
|
Reopen(options);
|
|
@@ -105,14 +105,13 @@ Status UpdateManifestForFilesState(
|
|
|
105
105
|
// Current state inconsistent with manifest
|
|
106
106
|
++files_updated;
|
|
107
107
|
edit.DeleteFile(level, number);
|
|
108
|
-
edit.AddFile(
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
lf->min_timestamp, lf->max_timestamp, lf->unique_id);
|
|
108
|
+
edit.AddFile(
|
|
109
|
+
level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(),
|
|
110
|
+
lf->smallest, lf->largest, lf->fd.smallest_seqno,
|
|
111
|
+
lf->fd.largest_seqno, lf->marked_for_compaction, temp,
|
|
112
|
+
lf->oldest_blob_file_number, lf->oldest_ancester_time,
|
|
113
|
+
lf->file_creation_time, lf->file_checksum,
|
|
114
|
+
lf->file_checksum_func_name, lf->unique_id);
|
|
116
115
|
}
|
|
117
116
|
}
|
|
118
117
|
} else {
|
|
@@ -450,8 +450,7 @@ Status ExternalSstFileIngestionJob::Run() {
|
|
|
450
450
|
f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
|
|
451
451
|
f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber,
|
|
452
452
|
oldest_ancester_time, current_time, f.file_checksum,
|
|
453
|
-
f.file_checksum_func_name,
|
|
454
|
-
f.unique_id);
|
|
453
|
+
f.file_checksum_func_name, f.unique_id);
|
|
455
454
|
f_metadata.temperature = f.file_temperature;
|
|
456
455
|
edit_.AddFile(f.picked_level, f_metadata);
|
|
457
456
|
}
|
|
@@ -390,7 +390,8 @@ Status FlushJob::MemPurge() {
|
|
|
390
390
|
range_del_iters;
|
|
391
391
|
for (MemTable* m : mems_) {
|
|
392
392
|
memtables.push_back(m->NewIterator(ro, &arena));
|
|
393
|
-
auto* range_del_iter = m->NewRangeTombstoneIterator(
|
|
393
|
+
auto* range_del_iter = m->NewRangeTombstoneIterator(
|
|
394
|
+
ro, kMaxSequenceNumber, true /* immutable_memtable */);
|
|
394
395
|
if (range_del_iter != nullptr) {
|
|
395
396
|
range_del_iters.emplace_back(range_del_iter);
|
|
396
397
|
}
|
|
@@ -585,6 +586,8 @@ Status FlushJob::MemPurge() {
|
|
|
585
586
|
// as in need of being flushed.
|
|
586
587
|
if (new_mem->ApproximateMemoryUsage() < maxSize &&
|
|
587
588
|
!(new_mem->ShouldFlushNow())) {
|
|
589
|
+
// Construct fragmented memtable range tombstones without mutex
|
|
590
|
+
new_mem->ConstructFragmentedRangeTombstones();
|
|
588
591
|
db_mutex_->Lock();
|
|
589
592
|
uint64_t new_mem_id = mems_[0]->GetID();
|
|
590
593
|
|
|
@@ -732,7 +735,8 @@ bool FlushJob::MemPurgeDecider(double threshold) {
|
|
|
732
735
|
|
|
733
736
|
// Estimate if the sample entry is valid or not.
|
|
734
737
|
get_res = mt->Get(lkey, &vget, nullptr, &mget_s, &merge_context,
|
|
735
|
-
&max_covering_tombstone_seq, &sqno, ro);
|
|
738
|
+
&max_covering_tombstone_seq, &sqno, ro,
|
|
739
|
+
true /* immutable_memtable */);
|
|
736
740
|
if (!get_res) {
|
|
737
741
|
ROCKS_LOG_WARN(
|
|
738
742
|
db_options_.info_log,
|
|
@@ -773,7 +777,8 @@ bool FlushJob::MemPurgeDecider(double threshold) {
|
|
|
773
777
|
next_mem_iter != std::end(mems_); next_mem_iter++) {
|
|
774
778
|
if ((*next_mem_iter)
|
|
775
779
|
->Get(lkey, &vget, nullptr, &mget_s, &merge_context,
|
|
776
|
-
&max_covering_tombstone_seq, &sqno, ro)) {
|
|
780
|
+
&max_covering_tombstone_seq, &sqno, ro,
|
|
781
|
+
true /* immutable_memtable */)) {
|
|
777
782
|
not_in_next_mems = false;
|
|
778
783
|
break;
|
|
779
784
|
}
|
|
@@ -857,8 +862,8 @@ Status FlushJob::WriteLevel0Table() {
|
|
|
857
862
|
"[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
|
|
858
863
|
cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
|
|
859
864
|
memtables.push_back(m->NewIterator(ro, &arena));
|
|
860
|
-
auto* range_del_iter =
|
|
861
|
-
|
|
865
|
+
auto* range_del_iter = m->NewRangeTombstoneIterator(
|
|
866
|
+
ro, kMaxSequenceNumber, true /* immutable_memtable */);
|
|
862
867
|
if (range_del_iter != nullptr) {
|
|
863
868
|
range_del_iters.emplace_back(range_del_iter);
|
|
864
869
|
}
|
|
@@ -1000,8 +1005,7 @@ Status FlushJob::WriteLevel0Table() {
|
|
|
1000
1005
|
meta_.marked_for_compaction, meta_.temperature,
|
|
1001
1006
|
meta_.oldest_blob_file_number, meta_.oldest_ancester_time,
|
|
1002
1007
|
meta_.file_creation_time, meta_.file_checksum,
|
|
1003
|
-
meta_.file_checksum_func_name, meta_.min_timestamp,
|
|
1004
|
-
meta_.max_timestamp, meta_.unique_id);
|
|
1008
|
+
meta_.file_checksum_func_name, meta_.unique_id);
|
|
1005
1009
|
|
|
1006
1010
|
edit_->SetBlobFileAdditions(std::move(blob_file_additions));
|
|
1007
1011
|
}
|
|
@@ -242,6 +242,7 @@ TEST_F(FlushJobTest, NonEmpty) {
|
|
|
242
242
|
mock::SortKVVector(&inserted_keys);
|
|
243
243
|
|
|
244
244
|
autovector<MemTable*> to_delete;
|
|
245
|
+
new_mem->ConstructFragmentedRangeTombstones();
|
|
245
246
|
cfd->imm()->Add(new_mem, &to_delete);
|
|
246
247
|
for (auto& m : to_delete) {
|
|
247
248
|
delete m;
|
|
@@ -303,6 +304,7 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
|
|
|
303
304
|
|
|
304
305
|
autovector<MemTable*> to_delete;
|
|
305
306
|
for (auto mem : new_mems) {
|
|
307
|
+
mem->ConstructFragmentedRangeTombstones();
|
|
306
308
|
cfd->imm()->Add(mem, &to_delete);
|
|
307
309
|
}
|
|
308
310
|
|
|
@@ -372,7 +374,7 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
|
|
|
372
374
|
ASSERT_OK(mem->Add(curr_seqno++, kTypeValue, key, value,
|
|
373
375
|
nullptr /* kv_prot_info */));
|
|
374
376
|
}
|
|
375
|
-
|
|
377
|
+
mem->ConstructFragmentedRangeTombstones();
|
|
376
378
|
cfd->imm()->Add(mem, &to_delete);
|
|
377
379
|
}
|
|
378
380
|
largest_seqs.push_back(curr_seqno - 1);
|
|
@@ -505,6 +507,7 @@ TEST_F(FlushJobTest, Snapshots) {
|
|
|
505
507
|
mock::SortKVVector(&inserted_keys);
|
|
506
508
|
|
|
507
509
|
autovector<MemTable*> to_delete;
|
|
510
|
+
new_mem->ConstructFragmentedRangeTombstones();
|
|
508
511
|
cfd->imm()->Add(new_mem, &to_delete);
|
|
509
512
|
for (auto& m : to_delete) {
|
|
510
513
|
delete m;
|
|
@@ -559,6 +562,7 @@ TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) {
|
|
|
559
562
|
|
|
560
563
|
autovector<MemTable*> to_delete;
|
|
561
564
|
for (auto mem : new_mems) {
|
|
565
|
+
mem->ConstructFragmentedRangeTombstones();
|
|
562
566
|
cfd->imm()->Add(mem, &to_delete);
|
|
563
567
|
}
|
|
564
568
|
|
|
@@ -638,6 +642,7 @@ TEST_F(FlushJobTimestampTest, AllKeysExpired) {
|
|
|
638
642
|
SequenceNumber seq = (curr_seq_++);
|
|
639
643
|
AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
|
|
640
644
|
ValueType::kTypeDeletionWithTimestamp, "");
|
|
645
|
+
new_mem->ConstructFragmentedRangeTombstones();
|
|
641
646
|
cfd->imm()->Add(new_mem, &to_delete);
|
|
642
647
|
}
|
|
643
648
|
|
|
@@ -690,6 +695,7 @@ TEST_F(FlushJobTimestampTest, NoKeyExpired) {
|
|
|
690
695
|
AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
|
|
691
696
|
ValueType::kTypeValue, "0_value");
|
|
692
697
|
}
|
|
698
|
+
new_mem->ConstructFragmentedRangeTombstones();
|
|
693
699
|
cfd->imm()->Add(new_mem, &to_delete);
|
|
694
700
|
}
|
|
695
701
|
|
|
@@ -668,7 +668,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
|
|
|
668
668
|
if (!read_options_.ignore_range_deletions) {
|
|
669
669
|
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
|
|
670
670
|
sv_->mem->NewRangeTombstoneIterator(
|
|
671
|
-
read_options_, sv_->current->version_set()->LastSequence()
|
|
671
|
+
read_options_, sv_->current->version_set()->LastSequence(),
|
|
672
|
+
false /* immutable_memtable */));
|
|
672
673
|
range_del_agg.AddTombstones(std::move(range_del_iter));
|
|
673
674
|
// Always return Status::OK().
|
|
674
675
|
Status temp_s = sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_,
|
|
@@ -733,7 +734,8 @@ void ForwardIterator::RenewIterators() {
|
|
|
733
734
|
if (!read_options_.ignore_range_deletions) {
|
|
734
735
|
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
|
|
735
736
|
svnew->mem->NewRangeTombstoneIterator(
|
|
736
|
-
read_options_, sv_->current->version_set()->LastSequence()
|
|
737
|
+
read_options_, sv_->current->version_set()->LastSequence(),
|
|
738
|
+
false /* immutable_memtable */));
|
|
737
739
|
range_del_agg.AddTombstones(std::move(range_del_iter));
|
|
738
740
|
// Always return Status::OK().
|
|
739
741
|
Status temp_s = svnew->imm->AddRangeTombstoneIterators(
|
|
@@ -160,7 +160,7 @@ Status ImportColumnFamilyJob::Run() {
|
|
|
160
160
|
file_metadata.largest_seqno, false, file_metadata.temperature,
|
|
161
161
|
kInvalidBlobFileNumber, oldest_ancester_time, current_time,
|
|
162
162
|
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
|
|
163
|
-
|
|
163
|
+
f.unique_id);
|
|
164
164
|
|
|
165
165
|
// If incoming sequence number is higher, update local sequence number.
|
|
166
166
|
if (file_metadata.largest_seqno > versions_->LastSequence()) {
|
|
@@ -44,7 +44,8 @@ Reader::Reader(std::shared_ptr<Logger> info_log,
|
|
|
44
44
|
compression_type_(kNoCompression),
|
|
45
45
|
compression_type_record_read_(false),
|
|
46
46
|
uncompress_(nullptr),
|
|
47
|
-
hash_state_(nullptr)
|
|
47
|
+
hash_state_(nullptr),
|
|
48
|
+
uncompress_hash_state_(nullptr){};
|
|
48
49
|
|
|
49
50
|
Reader::~Reader() {
|
|
50
51
|
delete[] backing_store_;
|
|
@@ -54,6 +55,9 @@ Reader::~Reader() {
|
|
|
54
55
|
if (hash_state_) {
|
|
55
56
|
XXH3_freeState(hash_state_);
|
|
56
57
|
}
|
|
58
|
+
if (uncompress_hash_state_) {
|
|
59
|
+
XXH3_freeState(uncompress_hash_state_);
|
|
60
|
+
}
|
|
57
61
|
}
|
|
58
62
|
|
|
59
63
|
// For kAbsoluteConsistency, on clean shutdown we don't expect any error
|
|
@@ -64,10 +68,11 @@ Reader::~Reader() {
|
|
|
64
68
|
// TODO krad: Evaluate if we need to move to a more strict mode where we
|
|
65
69
|
// restrict the inconsistency to only the last log
|
|
66
70
|
bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|
67
|
-
WALRecoveryMode wal_recovery_mode,
|
|
71
|
+
WALRecoveryMode wal_recovery_mode,
|
|
72
|
+
uint64_t* record_checksum) {
|
|
68
73
|
scratch->clear();
|
|
69
74
|
record->clear();
|
|
70
|
-
if (
|
|
75
|
+
if (record_checksum != nullptr) {
|
|
71
76
|
if (hash_state_ == nullptr) {
|
|
72
77
|
hash_state_ = XXH3_createState();
|
|
73
78
|
}
|
|
@@ -85,7 +90,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|
|
85
90
|
while (true) {
|
|
86
91
|
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
|
|
87
92
|
size_t drop_size = 0;
|
|
88
|
-
const unsigned int record_type =
|
|
93
|
+
const unsigned int record_type =
|
|
94
|
+
ReadPhysicalRecord(&fragment, &drop_size, record_checksum);
|
|
89
95
|
switch (record_type) {
|
|
90
96
|
case kFullType:
|
|
91
97
|
case kRecyclableFullType:
|
|
@@ -96,9 +102,12 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|
|
96
102
|
// at the beginning of the next block.
|
|
97
103
|
ReportCorruption(scratch->size(), "partial record without end(1)");
|
|
98
104
|
}
|
|
99
|
-
|
|
105
|
+
// No need to compute record_checksum since the record
|
|
106
|
+
// consists of a single fragment and the checksum is computed
|
|
107
|
+
// in ReadPhysicalRecord() if WAL compression is enabled
|
|
108
|
+
if (record_checksum != nullptr && uncompress_ == nullptr) {
|
|
100
109
|
// No need to stream since the record is a single fragment
|
|
101
|
-
*
|
|
110
|
+
*record_checksum = XXH3_64bits(fragment.data(), fragment.size());
|
|
102
111
|
}
|
|
103
112
|
prospective_record_offset = physical_record_offset;
|
|
104
113
|
scratch->clear();
|
|
@@ -117,7 +126,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|
|
117
126
|
ReportCorruption(scratch->size(), "partial record without end(2)");
|
|
118
127
|
XXH3_64bits_reset(hash_state_);
|
|
119
128
|
}
|
|
120
|
-
if (
|
|
129
|
+
if (record_checksum != nullptr) {
|
|
121
130
|
XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
|
|
122
131
|
}
|
|
123
132
|
prospective_record_offset = physical_record_offset;
|
|
@@ -131,7 +140,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|
|
131
140
|
ReportCorruption(fragment.size(),
|
|
132
141
|
"missing start of fragmented record(1)");
|
|
133
142
|
} else {
|
|
134
|
-
if (
|
|
143
|
+
if (record_checksum != nullptr) {
|
|
135
144
|
XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
|
|
136
145
|
}
|
|
137
146
|
scratch->append(fragment.data(), fragment.size());
|
|
@@ -144,9 +153,9 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
|
|
144
153
|
ReportCorruption(fragment.size(),
|
|
145
154
|
"missing start of fragmented record(2)");
|
|
146
155
|
} else {
|
|
147
|
-
if (
|
|
156
|
+
if (record_checksum != nullptr) {
|
|
148
157
|
XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
|
|
149
|
-
*
|
|
158
|
+
*record_checksum = XXH3_64bits_digest(hash_state_);
|
|
150
159
|
}
|
|
151
160
|
scratch->append(fragment.data(), fragment.size());
|
|
152
161
|
*record = Slice(*scratch);
|
|
@@ -417,7 +426,8 @@ bool Reader::ReadMore(size_t* drop_size, int *error) {
|
|
|
417
426
|
}
|
|
418
427
|
}
|
|
419
428
|
|
|
420
|
-
unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size
|
|
429
|
+
unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size,
|
|
430
|
+
uint64_t* fragment_checksum) {
|
|
421
431
|
while (true) {
|
|
422
432
|
// We need at least the minimum header size
|
|
423
433
|
if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
|
|
@@ -500,6 +510,13 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
|
|
|
500
510
|
} else {
|
|
501
511
|
// Uncompress compressed records
|
|
502
512
|
uncompressed_record_.clear();
|
|
513
|
+
if (fragment_checksum != nullptr) {
|
|
514
|
+
if (uncompress_hash_state_ == nullptr) {
|
|
515
|
+
uncompress_hash_state_ = XXH3_createState();
|
|
516
|
+
}
|
|
517
|
+
XXH3_64bits_reset(uncompress_hash_state_);
|
|
518
|
+
}
|
|
519
|
+
|
|
503
520
|
size_t uncompressed_size = 0;
|
|
504
521
|
int remaining = 0;
|
|
505
522
|
do {
|
|
@@ -511,10 +528,30 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
|
|
|
511
528
|
return kBadRecord;
|
|
512
529
|
}
|
|
513
530
|
if (uncompressed_size > 0) {
|
|
531
|
+
if (fragment_checksum != nullptr) {
|
|
532
|
+
XXH3_64bits_update(uncompress_hash_state_,
|
|
533
|
+
uncompressed_buffer_.get(), uncompressed_size);
|
|
534
|
+
}
|
|
514
535
|
uncompressed_record_.append(uncompressed_buffer_.get(),
|
|
515
536
|
uncompressed_size);
|
|
516
537
|
}
|
|
517
538
|
} while (remaining > 0 || uncompressed_size == kBlockSize);
|
|
539
|
+
|
|
540
|
+
if (fragment_checksum != nullptr) {
|
|
541
|
+
// We can remove this check by updating hash_state_ directly,
|
|
542
|
+
// but that requires resetting hash_state_ for full and first types
|
|
543
|
+
// for edge cases like consecutive first type records.
|
|
544
|
+
// Leaving the check as is since it is cleaner and can revert to the
|
|
545
|
+
// above approach if it causes performance impact.
|
|
546
|
+
*fragment_checksum = XXH3_64bits_digest(uncompress_hash_state_);
|
|
547
|
+
uint64_t actual_checksum = XXH3_64bits(uncompressed_record_.data(),
|
|
548
|
+
uncompressed_record_.size());
|
|
549
|
+
if (*fragment_checksum != actual_checksum) {
|
|
550
|
+
// uncompressed_record_ contains bad content that does not match
|
|
551
|
+
// actual decompressed content
|
|
552
|
+
return kBadRecord;
|
|
553
|
+
}
|
|
554
|
+
}
|
|
518
555
|
*result = Slice(uncompressed_record_);
|
|
519
556
|
return type;
|
|
520
557
|
}
|