@nxtedition/rocksdb 8.1.17 → 8.2.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +32 -2
- package/binding.gyp +8 -0
- package/deps/liburing/liburing.gyp +20 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
- package/deps/rocksdb/rocksdb/TARGETS +7 -0
- package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
- package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
- package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
- package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
- package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
- package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
- package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
- package/deps/rocksdb/rocksdb/db/c.cc +90 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
- package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
- package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
- package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
- package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
- package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
- package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
- package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
- package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
- package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
- package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
- package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
- package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
- package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
- package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
- package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
- package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
- package/deps/rocksdb/rocksdb/src.mk +4 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
- package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
- package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
- package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
- package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
- package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
- package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
- package/deps/rocksdb/rocksdb.gyp +7 -1
- package/package.json +1 -1
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -167,9 +167,15 @@ class CompactionOutputs {
|
|
|
167
167
|
current_output_file_size_ = 0;
|
|
168
168
|
}
|
|
169
169
|
|
|
170
|
-
// Add range
|
|
170
|
+
// Add range deletions from the range_del_agg_ to the current output file.
|
|
171
|
+
// Input parameters, `range_tombstone_lower_bound_` and current output's
|
|
172
|
+
// metadata determine the bounds on range deletions to add. Updates output
|
|
173
|
+
// file metadata boundary if extended by range tombstones.
|
|
174
|
+
//
|
|
171
175
|
// @param comp_start_user_key and comp_end_user_key include timestamp if
|
|
172
|
-
// user-defined timestamp is enabled.
|
|
176
|
+
// user-defined timestamp is enabled. Their timestamp should be max timestamp.
|
|
177
|
+
// @param next_table_min_key internal key lower bound for the next compaction
|
|
178
|
+
// output.
|
|
173
179
|
// @param full_history_ts_low used for range tombstone garbage collection.
|
|
174
180
|
Status AddRangeDels(const Slice* comp_start_user_key,
|
|
175
181
|
const Slice* comp_end_user_key,
|
|
@@ -314,6 +320,7 @@ class CompactionOutputs {
|
|
|
314
320
|
std::unique_ptr<SstPartitioner> partitioner_;
|
|
315
321
|
|
|
316
322
|
// A flag determines if this subcompaction has been split by the cursor
|
|
323
|
+
// for RoundRobin compaction
|
|
317
324
|
bool is_split_ = false;
|
|
318
325
|
|
|
319
326
|
// We also maintain the output split key for each subcompaction to avoid
|
|
@@ -345,6 +352,10 @@ class CompactionOutputs {
|
|
|
345
352
|
// for the current output file, how many file boundaries has it crossed,
|
|
346
353
|
// basically number of files overlapped * 2
|
|
347
354
|
size_t grandparent_boundary_switched_num_ = 0;
|
|
355
|
+
|
|
356
|
+
// The smallest key of the current output file, this is set when current
|
|
357
|
+
// output file's smallest key is a range tombstone start key.
|
|
358
|
+
InternalKey range_tombstone_lower_bound_;
|
|
348
359
|
};
|
|
349
360
|
|
|
350
361
|
// helper struct to concatenate the last level and penultimate level outputs
|
|
@@ -84,6 +84,11 @@ class SubcompactionState {
|
|
|
84
84
|
// Assign range dels aggregator, for each range_del, it can only be assigned
|
|
85
85
|
// to one output level, for per_key_placement, it's going to be the
|
|
86
86
|
// penultimate level.
|
|
87
|
+
// TODO: This does not work for per_key_placement + user-defined timestamp +
|
|
88
|
+
// DeleteRange() combo. If user-defined timestamp is enabled,
|
|
89
|
+
// it is possible for a range tombstone to belong to bottommost level (
|
|
90
|
+
// seqno < earliest snapshot) without being dropped (garbage collection
|
|
91
|
+
// for user-defined timestamp).
|
|
87
92
|
void AssignRangeDelAggregator(
|
|
88
93
|
std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
|
|
89
94
|
if (compaction->SupportsPerKeyPlacement()) {
|
|
@@ -2302,9 +2302,7 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL0) {
|
|
|
2302
2302
|
ASSERT_EQ(multiget_io_batch_size.count, 3);
|
|
2303
2303
|
}
|
|
2304
2304
|
#else // ROCKSDB_IOURING_PRESENT
|
|
2305
|
-
|
|
2306
|
-
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
|
|
2307
|
-
}
|
|
2305
|
+
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
|
|
2308
2306
|
#endif // ROCKSDB_IOURING_PRESENT
|
|
2309
2307
|
}
|
|
2310
2308
|
|
|
@@ -2338,16 +2336,18 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1) {
|
|
|
2338
2336
|
ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
|
|
2339
2337
|
ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
|
|
2340
2338
|
|
|
2341
|
-
#ifdef ROCKSDB_IOURING_PRESENT
|
|
2342
2339
|
HistogramData multiget_io_batch_size;
|
|
2343
2340
|
|
|
2344
2341
|
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
|
|
2345
2342
|
|
|
2343
|
+
#ifdef ROCKSDB_IOURING_PRESENT
|
|
2346
2344
|
// A batch of 3 async IOs is expected, one for each overlapping file in L1
|
|
2347
2345
|
ASSERT_EQ(multiget_io_batch_size.count, 1);
|
|
2348
2346
|
ASSERT_EQ(multiget_io_batch_size.max, 3);
|
|
2349
|
-
#endif // ROCKSDB_IOURING_PRESENT
|
|
2350
2347
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
|
|
2348
|
+
#else // ROCKSDB_IOURING_PRESENT
|
|
2349
|
+
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
|
|
2350
|
+
#endif // ROCKSDB_IOURING_PRESENT
|
|
2351
2351
|
}
|
|
2352
2352
|
|
|
2353
2353
|
#ifdef ROCKSDB_IOURING_PRESENT
|
|
@@ -2531,8 +2531,12 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
|
|
|
2531
2531
|
ASSERT_EQ(values[0], "val_l2_" + std::to_string(19));
|
|
2532
2532
|
ASSERT_EQ(values[1], "val_l2_" + std::to_string(26));
|
|
2533
2533
|
|
|
2534
|
+
#ifdef ROCKSDB_IOURING_PRESENT
|
|
2534
2535
|
// Bloom filters in L0/L1 will avoid the coroutine calls in those levels
|
|
2535
2536
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
|
|
2537
|
+
#else // ROCKSDB_IOURING_PRESENT
|
|
2538
|
+
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
|
|
2539
|
+
#endif // ROCKSDB_IOURING_PRESENT
|
|
2536
2540
|
}
|
|
2537
2541
|
|
|
2538
2542
|
#ifdef ROCKSDB_IOURING_PRESENT
|
|
@@ -2623,18 +2627,17 @@ TEST_P(DBMultiGetAsyncIOTest, GetNoIOUring) {
|
|
|
2623
2627
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
2624
2628
|
keys.data(), values.data(), statuses.data());
|
|
2625
2629
|
ASSERT_EQ(values.size(), 3);
|
|
2626
|
-
ASSERT_EQ(statuses[0], Status::
|
|
2627
|
-
ASSERT_EQ(statuses[1], Status::
|
|
2628
|
-
ASSERT_EQ(statuses[2], Status::
|
|
2630
|
+
ASSERT_EQ(statuses[0], Status::OK());
|
|
2631
|
+
ASSERT_EQ(statuses[1], Status::OK());
|
|
2632
|
+
ASSERT_EQ(statuses[2], Status::OK());
|
|
2629
2633
|
|
|
2630
|
-
HistogramData
|
|
2634
|
+
HistogramData async_read_bytes;
|
|
2631
2635
|
|
|
2632
|
-
statistics()->histogramData(
|
|
2636
|
+
statistics()->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
|
2633
2637
|
|
|
2634
2638
|
// No async reads or coroutines are expected in this test (GetNoIOUring)
|
|
2635
|
-
ASSERT_EQ(
|
|
2636
|
-
ASSERT_EQ(
|
|
2637
|
-
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
|
|
2639
|
+
ASSERT_EQ(async_read_bytes.count, 0);
|
|
2640
|
+
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
|
|
2638
2641
|
}
|
|
2639
2642
|
|
|
2640
2643
|
INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, DBMultiGetAsyncIOTest,
|
|
@@ -3626,11 +3629,11 @@ class DBBasicTestMultiGet : public DBTestBase {
|
|
|
3626
3629
|
|
|
3627
3630
|
Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
|
|
3628
3631
|
CreateContext* create_context,
|
|
3629
|
-
Priority priority = Priority::LOW,
|
|
3632
|
+
Priority priority = Priority::LOW,
|
|
3630
3633
|
Statistics* stats = nullptr) override {
|
|
3631
3634
|
num_lookups_++;
|
|
3632
3635
|
Handle* handle =
|
|
3633
|
-
target_->Lookup(key, helper, create_context, priority,
|
|
3636
|
+
target_->Lookup(key, helper, create_context, priority, stats);
|
|
3634
3637
|
if (handle != nullptr) {
|
|
3635
3638
|
num_found_++;
|
|
3636
3639
|
}
|
|
@@ -4491,6 +4494,63 @@ TEST_F(DBBasicTest, VerifyFileChecksums) {
|
|
|
4491
4494
|
ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
|
|
4492
4495
|
}
|
|
4493
4496
|
|
|
4497
|
+
TEST_F(DBBasicTest, VerifyFileChecksumsReadahead) {
|
|
4498
|
+
Options options = GetDefaultOptions();
|
|
4499
|
+
options.create_if_missing = true;
|
|
4500
|
+
options.env = env_;
|
|
4501
|
+
options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
|
|
4502
|
+
DestroyAndReopen(options);
|
|
4503
|
+
|
|
4504
|
+
Random rnd(301);
|
|
4505
|
+
int alignment = 256 * 1024;
|
|
4506
|
+
for (int i = 0; i < 16; ++i) {
|
|
4507
|
+
ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(alignment)));
|
|
4508
|
+
}
|
|
4509
|
+
ASSERT_OK(Flush());
|
|
4510
|
+
|
|
4511
|
+
std::vector<std::string> filenames;
|
|
4512
|
+
int sst_cnt = 0;
|
|
4513
|
+
std::string sst_name;
|
|
4514
|
+
uint64_t sst_size;
|
|
4515
|
+
uint64_t number;
|
|
4516
|
+
FileType type;
|
|
4517
|
+
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
|
|
4518
|
+
for (auto name : filenames) {
|
|
4519
|
+
if (ParseFileName(name, &number, &type)) {
|
|
4520
|
+
if (type == kTableFile) {
|
|
4521
|
+
sst_cnt++;
|
|
4522
|
+
sst_name = name;
|
|
4523
|
+
}
|
|
4524
|
+
}
|
|
4525
|
+
}
|
|
4526
|
+
ASSERT_EQ(sst_cnt, 1);
|
|
4527
|
+
ASSERT_OK(env_->GetFileSize(dbname_ + '/' + sst_name, &sst_size));
|
|
4528
|
+
|
|
4529
|
+
bool last_read = false;
|
|
4530
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
4531
|
+
"GenerateOneFileChecksum::Chunk:0", [&](void* /*arg*/) {
|
|
4532
|
+
if (env_->random_read_bytes_counter_.load() == sst_size) {
|
|
4533
|
+
EXPECT_FALSE(last_read);
|
|
4534
|
+
last_read = true;
|
|
4535
|
+
} else {
|
|
4536
|
+
ASSERT_EQ(env_->random_read_bytes_counter_.load() & (alignment - 1),
|
|
4537
|
+
0);
|
|
4538
|
+
}
|
|
4539
|
+
});
|
|
4540
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
4541
|
+
env_->count_random_reads_ = true;
|
|
4542
|
+
env_->random_read_bytes_counter_ = 0;
|
|
4543
|
+
env_->random_read_counter_.Reset();
|
|
4544
|
+
|
|
4545
|
+
ReadOptions ro;
|
|
4546
|
+
ro.readahead_size = alignment;
|
|
4547
|
+
ASSERT_OK(db_->VerifyFileChecksums(ro));
|
|
4548
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
4549
|
+
ASSERT_TRUE(last_read);
|
|
4550
|
+
ASSERT_EQ(env_->random_read_counter_.Read(),
|
|
4551
|
+
(sst_size + alignment - 1) / (alignment));
|
|
4552
|
+
}
|
|
4553
|
+
|
|
4494
4554
|
// TODO: re-enable after we provide finer-grained control for WAL tracking to
|
|
4495
4555
|
// meet the needs of different use cases, durability levels and recovery modes.
|
|
4496
4556
|
TEST_F(DBBasicTest, DISABLED_ManualWalSync) {
|
|
@@ -717,7 +717,7 @@ class LookupLiarCache : public CacheWrapper {
|
|
|
717
717
|
|
|
718
718
|
Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr,
|
|
719
719
|
CreateContext* create_context = nullptr,
|
|
720
|
-
Priority priority = Priority::LOW,
|
|
720
|
+
Priority priority = Priority::LOW,
|
|
721
721
|
Statistics* stats = nullptr) override {
|
|
722
722
|
if (nth_lookup_not_found_ == 1) {
|
|
723
723
|
nth_lookup_not_found_ = 0;
|
|
@@ -726,8 +726,7 @@ class LookupLiarCache : public CacheWrapper {
|
|
|
726
726
|
if (nth_lookup_not_found_ > 1) {
|
|
727
727
|
--nth_lookup_not_found_;
|
|
728
728
|
}
|
|
729
|
-
return CacheWrapper::Lookup(key, helper, create_context, priority,
|
|
730
|
-
stats);
|
|
729
|
+
return CacheWrapper::Lookup(key, helper, create_context, priority, stats);
|
|
731
730
|
}
|
|
732
731
|
|
|
733
732
|
// 1 == next lookup, 2 == after next, etc.
|
|
@@ -34,11 +34,8 @@ Status DBImpl::FlushForGetLiveFiles() {
|
|
|
34
34
|
// flush all dirty data to disk.
|
|
35
35
|
Status status;
|
|
36
36
|
if (immutable_db_options_.atomic_flush) {
|
|
37
|
-
autovector<ColumnFamilyData*> cfds;
|
|
38
|
-
SelectColumnFamiliesForAtomicFlush(&cfds);
|
|
39
37
|
mutex_.Unlock();
|
|
40
|
-
status =
|
|
41
|
-
AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles);
|
|
38
|
+
status = AtomicFlushMemTables(FlushOptions(), FlushReason::kGetLiveFiles);
|
|
42
39
|
if (status.IsColumnFamilyDropped()) {
|
|
43
40
|
status = Status::OK();
|
|
44
41
|
}
|
|
@@ -437,4 +434,3 @@ Status DBImpl::GetLiveFilesStorageInfo(
|
|
|
437
434
|
}
|
|
438
435
|
|
|
439
436
|
} // namespace ROCKSDB_NAMESPACE
|
|
440
|
-
|
|
@@ -740,7 +740,97 @@ class TestFlushListener : public EventListener {
|
|
|
740
740
|
DBFlushTest* test_;
|
|
741
741
|
};
|
|
742
742
|
|
|
743
|
-
|
|
743
|
+
TEST_F(
|
|
744
|
+
DBFlushTest,
|
|
745
|
+
FixUnrecoverableWriteDuringAtomicFlushWaitUntilFlushWouldNotStallWrites) {
|
|
746
|
+
Options options = CurrentOptions();
|
|
747
|
+
options.atomic_flush = true;
|
|
748
|
+
|
|
749
|
+
// To simulate a real-life crash where we can't flush during db's shutdown
|
|
750
|
+
options.avoid_flush_during_shutdown = true;
|
|
751
|
+
|
|
752
|
+
// Set 3 low thresholds (while `disable_auto_compactions=false`) here so flush
|
|
753
|
+
// adding one more L0 file during `GetLiveFiles()` will have to wait till such
|
|
754
|
+
// flush will not stall writes
|
|
755
|
+
options.level0_stop_writes_trigger = 2;
|
|
756
|
+
options.level0_slowdown_writes_trigger = 2;
|
|
757
|
+
// Disable level-0 compaction triggered by number of files to avoid
|
|
758
|
+
// stalling check being skipped (resulting in the flush mentioned above didn't
|
|
759
|
+
// wait)
|
|
760
|
+
options.level0_file_num_compaction_trigger = -1;
|
|
761
|
+
|
|
762
|
+
CreateAndReopenWithCF({"cf1"}, options);
|
|
763
|
+
|
|
764
|
+
// Manually pause compaction thread to ensure enough L0 files as
|
|
765
|
+
// `disable_auto_compactions=false` is needed, in order to meet the 3 low
|
|
766
|
+
// thresholds above
|
|
767
|
+
std::unique_ptr<test::SleepingBackgroundTask> sleeping_task_;
|
|
768
|
+
sleeping_task_.reset(new test::SleepingBackgroundTask());
|
|
769
|
+
env_->SetBackgroundThreads(1, Env::LOW);
|
|
770
|
+
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
|
|
771
|
+
sleeping_task_.get(), Env::Priority::LOW);
|
|
772
|
+
sleeping_task_->WaitUntilSleeping();
|
|
773
|
+
|
|
774
|
+
// Create some initial file to help meet the 3 low thresholds above
|
|
775
|
+
ASSERT_OK(Put(1, "dontcare", "dontcare"));
|
|
776
|
+
ASSERT_OK(Flush(1));
|
|
777
|
+
|
|
778
|
+
// Insert some initial data so we have something to atomic-flush later
|
|
779
|
+
// triggered by `GetLiveFiles()`
|
|
780
|
+
WriteOptions write_opts;
|
|
781
|
+
write_opts.disableWAL = true;
|
|
782
|
+
ASSERT_OK(Put(1, "k1", "v1", write_opts));
|
|
783
|
+
|
|
784
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({{
|
|
785
|
+
"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
|
|
786
|
+
"DBFlushTest::"
|
|
787
|
+
"UnrecoverableWriteInAtomicFlushWaitUntilFlushWouldNotStallWrites::Write",
|
|
788
|
+
}});
|
|
789
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
790
|
+
|
|
791
|
+
// Write to db when atomic flush releases the lock to wait on write stall
|
|
792
|
+
// condition to be gone in `WaitUntilFlushWouldNotStallWrites()`
|
|
793
|
+
port::Thread write_thread([&] {
|
|
794
|
+
TEST_SYNC_POINT(
|
|
795
|
+
"DBFlushTest::"
|
|
796
|
+
"UnrecoverableWriteInAtomicFlushWaitUntilFlushWouldNotStallWrites::"
|
|
797
|
+
"Write");
|
|
798
|
+
// Before the fix, the empty default CF would've been prematurely excluded
|
|
799
|
+
// from this atomic flush. The following two writes together make default CF
|
|
800
|
+
// later contain data that should've been included in the atomic flush.
|
|
801
|
+
ASSERT_OK(Put(0, "k2", "v2", write_opts));
|
|
802
|
+
// The following write increases the max seqno of this atomic flush to be 3,
|
|
803
|
+
// which is greater than the seqno of default CF's data. This then violates
|
|
804
|
+
// the invariant that all entries of seqno less than the max seqno
|
|
805
|
+
// of this atomic flush should've been flushed by the time of this atomic
|
|
806
|
+
// flush finishes.
|
|
807
|
+
ASSERT_OK(Put(1, "k3", "v3", write_opts));
|
|
808
|
+
|
|
809
|
+
// Resume compaction threads and reduce L0 files so `GetLiveFiles()` can
|
|
810
|
+
// resume from the wait
|
|
811
|
+
sleeping_task_->WakeUp();
|
|
812
|
+
sleeping_task_->WaitUntilDone();
|
|
813
|
+
MoveFilesToLevel(1, 1);
|
|
814
|
+
});
|
|
815
|
+
|
|
816
|
+
// Trigger an atomic flush by `GetLiveFiles()`
|
|
817
|
+
std::vector<std::string> files;
|
|
818
|
+
uint64_t manifest_file_size;
|
|
819
|
+
ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
|
|
820
|
+
|
|
821
|
+
write_thread.join();
|
|
822
|
+
|
|
823
|
+
ReopenWithColumnFamilies({"default", "cf1"}, options);
|
|
824
|
+
|
|
825
|
+
ASSERT_EQ(Get(1, "k3"), "v3");
|
|
826
|
+
// Prior to the fix, `Get()` will return `NotFound` as "k2" entry in default CF
|
|
827
|
+
// can't be recovered from a crash right after the atomic flush finishes,
|
|
828
|
+
// resulting in a "recovery hole" as "k3" can be recovered. It's due to the
|
|
829
|
+
// invariant violation described above.
|
|
830
|
+
ASSERT_EQ(Get(0, "k2"), "v2");
|
|
831
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
832
|
+
}
|
|
833
|
+
|
|
744
834
|
TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) {
|
|
745
835
|
Options options = CurrentOptions();
|
|
746
836
|
options.atomic_flush = true;
|
|
@@ -387,10 +387,8 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
|
|
|
387
387
|
// We allow flush to stall write since we are trying to resume from error.
|
|
388
388
|
flush_opts.allow_write_stall = true;
|
|
389
389
|
if (immutable_db_options_.atomic_flush) {
|
|
390
|
-
autovector<ColumnFamilyData*> cfds;
|
|
391
|
-
SelectColumnFamiliesForAtomicFlush(&cfds);
|
|
392
390
|
mutex_.Unlock();
|
|
393
|
-
s = AtomicFlushMemTables(
|
|
391
|
+
s = AtomicFlushMemTables(flush_opts, context.flush_reason);
|
|
394
392
|
mutex_.Lock();
|
|
395
393
|
} else {
|
|
396
394
|
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
|
|
@@ -507,11 +505,8 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
|
|
|
507
505
|
has_unpersisted_data_.load(std::memory_order_relaxed) &&
|
|
508
506
|
!mutable_db_options_.avoid_flush_during_shutdown) {
|
|
509
507
|
if (immutable_db_options_.atomic_flush) {
|
|
510
|
-
autovector<ColumnFamilyData*> cfds;
|
|
511
|
-
SelectColumnFamiliesForAtomicFlush(&cfds);
|
|
512
508
|
mutex_.Unlock();
|
|
513
|
-
Status s =
|
|
514
|
-
AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown);
|
|
509
|
+
Status s = AtomicFlushMemTables(FlushOptions(), FlushReason::kShutDown);
|
|
515
510
|
s.PermitUncheckedError(); //**TODO: What to do on error?
|
|
516
511
|
mutex_.Lock();
|
|
517
512
|
} else {
|
|
@@ -5350,12 +5345,10 @@ Status DBImpl::IngestExternalFiles(
|
|
|
5350
5345
|
FlushOptions flush_opts;
|
|
5351
5346
|
flush_opts.allow_write_stall = true;
|
|
5352
5347
|
if (immutable_db_options_.atomic_flush) {
|
|
5353
|
-
autovector<ColumnFamilyData*> cfds_to_flush;
|
|
5354
|
-
SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
|
|
5355
5348
|
mutex_.Unlock();
|
|
5356
|
-
status = AtomicFlushMemTables(
|
|
5357
|
-
|
|
5358
|
-
|
|
5349
|
+
status = AtomicFlushMemTables(
|
|
5350
|
+
flush_opts, FlushReason::kExternalFileIngestion,
|
|
5351
|
+
{} /* provided_candidate_cfds */, true /* entered_write_thread */);
|
|
5359
5352
|
mutex_.Lock();
|
|
5360
5353
|
} else {
|
|
5361
5354
|
for (size_t i = 0; i != num_cfs; ++i) {
|
|
@@ -1081,8 +1081,9 @@ class DBImpl : public DB {
|
|
|
1081
1081
|
// is because in certain cases, we can flush column families, wait for the
|
|
1082
1082
|
// flush to complete, but delete the column family handle before the wait
|
|
1083
1083
|
// finishes. For example in CompactRange.
|
|
1084
|
-
Status TEST_AtomicFlushMemTables(
|
|
1085
|
-
|
|
1084
|
+
Status TEST_AtomicFlushMemTables(
|
|
1085
|
+
const autovector<ColumnFamilyData*>& provided_candidate_cfds,
|
|
1086
|
+
const FlushOptions& flush_opts);
|
|
1086
1087
|
|
|
1087
1088
|
// Wait for background threads to complete scheduled work.
|
|
1088
1089
|
Status TEST_WaitForBackgroundWork();
|
|
@@ -1886,16 +1887,27 @@ class DBImpl : public DB {
|
|
|
1886
1887
|
|
|
1887
1888
|
Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
|
|
1888
1889
|
|
|
1889
|
-
|
|
1890
|
+
// Select and output column families qualified for atomic flush in
|
|
1891
|
+
// `selected_cfds`. If `provided_candidate_cfds` is non-empty, it will be used
|
|
1892
|
+
// as candidate CFs to select qualified ones from. Otherwise, all column
|
|
1893
|
+
// families are used as candidate to select from.
|
|
1894
|
+
//
|
|
1895
|
+
// REQUIRES: mutex held
|
|
1896
|
+
void SelectColumnFamiliesForAtomicFlush(
|
|
1897
|
+
autovector<ColumnFamilyData*>* selected_cfds,
|
|
1898
|
+
const autovector<ColumnFamilyData*>& provided_candidate_cfds = {});
|
|
1890
1899
|
|
|
1891
1900
|
// Force current memtable contents to be flushed.
|
|
1892
1901
|
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
|
|
1893
1902
|
FlushReason flush_reason,
|
|
1894
1903
|
bool entered_write_thread = false);
|
|
1895
1904
|
|
|
1905
|
+
// Atomic-flush memtables from qualified CFs among `provided_candidate_cfds`
|
|
1906
|
+
// (if non-empty) or among all column families and atomically record the
|
|
1907
|
+
// result to the MANIFEST.
|
|
1896
1908
|
Status AtomicFlushMemTables(
|
|
1897
|
-
const autovector<ColumnFamilyData*>& column_family_datas,
|
|
1898
1909
|
const FlushOptions& options, FlushReason flush_reason,
|
|
1910
|
+
const autovector<ColumnFamilyData*>& provided_candidate_cfds = {},
|
|
1899
1911
|
bool entered_write_thread = false);
|
|
1900
1912
|
|
|
1901
1913
|
// Wait until flushing this column family won't stall writes
|
|
@@ -414,7 +414,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
|
|
|
414
414
|
assert(cfd->imm()->NumNotFlushed() != 0);
|
|
415
415
|
assert(cfd->imm()->IsFlushPending());
|
|
416
416
|
}
|
|
417
|
-
for (const auto bg_flush_arg : bg_flush_args) {
|
|
417
|
+
for (const auto& bg_flush_arg : bg_flush_args) {
|
|
418
418
|
assert(bg_flush_arg.flush_reason_ == bg_flush_args[0].flush_reason_);
|
|
419
419
|
}
|
|
420
420
|
#endif /* !NDEBUG */
|
|
@@ -1031,15 +1031,9 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
|
|
|
1031
1031
|
FlushOptions fo;
|
|
1032
1032
|
fo.allow_write_stall = options.allow_write_stall;
|
|
1033
1033
|
if (immutable_db_options_.atomic_flush) {
|
|
1034
|
-
|
|
1035
|
-
mutex_.Lock();
|
|
1036
|
-
SelectColumnFamiliesForAtomicFlush(&cfds);
|
|
1037
|
-
mutex_.Unlock();
|
|
1038
|
-
s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
|
|
1039
|
-
false /* entered_write_thread */);
|
|
1034
|
+
s = AtomicFlushMemTables(fo, FlushReason::kManualCompaction);
|
|
1040
1035
|
} else {
|
|
1041
|
-
s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction
|
|
1042
|
-
false /* entered_write_thread */);
|
|
1036
|
+
s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction);
|
|
1043
1037
|
}
|
|
1044
1038
|
if (!s.ok()) {
|
|
1045
1039
|
LogFlush(immutable_db_options_.info_log);
|
|
@@ -1800,8 +1794,8 @@ Status DBImpl::Flush(const FlushOptions& flush_options,
|
|
|
1800
1794
|
cfh->GetName().c_str());
|
|
1801
1795
|
Status s;
|
|
1802
1796
|
if (immutable_db_options_.atomic_flush) {
|
|
1803
|
-
s = AtomicFlushMemTables(
|
|
1804
|
-
|
|
1797
|
+
s = AtomicFlushMemTables(flush_options, FlushReason::kManualFlush,
|
|
1798
|
+
{cfh->cfd()});
|
|
1805
1799
|
} else {
|
|
1806
1800
|
s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush);
|
|
1807
1801
|
}
|
|
@@ -1839,7 +1833,7 @@ Status DBImpl::Flush(const FlushOptions& flush_options,
|
|
|
1839
1833
|
auto cfh = static_cast<ColumnFamilyHandleImpl*>(elem);
|
|
1840
1834
|
cfds.emplace_back(cfh->cfd());
|
|
1841
1835
|
});
|
|
1842
|
-
s = AtomicFlushMemTables(
|
|
1836
|
+
s = AtomicFlushMemTables(flush_options, FlushReason::kManualFlush, cfds);
|
|
1843
1837
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
1844
1838
|
"Manual atomic flush finished, status: %s\n"
|
|
1845
1839
|
"=====Column families:=====",
|
|
@@ -2223,11 +2217,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
|
|
|
2223
2217
|
return s;
|
|
2224
2218
|
}
|
|
2225
2219
|
|
|
2226
|
-
// Flush all elements in 'column_family_datas'
|
|
2227
|
-
// and atomically record the result to the MANIFEST.
|
|
2228
2220
|
Status DBImpl::AtomicFlushMemTables(
|
|
2229
|
-
const autovector<ColumnFamilyData*>& column_family_datas,
|
|
2230
2221
|
const FlushOptions& flush_options, FlushReason flush_reason,
|
|
2222
|
+
const autovector<ColumnFamilyData*>& provided_candidate_cfds,
|
|
2231
2223
|
bool entered_write_thread) {
|
|
2232
2224
|
assert(immutable_db_options_.atomic_flush);
|
|
2233
2225
|
if (!flush_options.wait && write_controller_.IsStopped()) {
|
|
@@ -2237,18 +2229,48 @@ Status DBImpl::AtomicFlushMemTables(
|
|
|
2237
2229
|
return Status::TryAgain(oss.str());
|
|
2238
2230
|
}
|
|
2239
2231
|
Status s;
|
|
2232
|
+
autovector<ColumnFamilyData*> candidate_cfds;
|
|
2233
|
+
if (provided_candidate_cfds.empty()) {
|
|
2234
|
+
// Generate candidate cfds if not provided
|
|
2235
|
+
{
|
|
2236
|
+
InstrumentedMutexLock l(&mutex_);
|
|
2237
|
+
for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
|
|
2238
|
+
if (!cfd->IsDropped() && cfd->initialized()) {
|
|
2239
|
+
cfd->Ref();
|
|
2240
|
+
candidate_cfds.push_back(cfd);
|
|
2241
|
+
}
|
|
2242
|
+
}
|
|
2243
|
+
}
|
|
2244
|
+
} else {
|
|
2245
|
+
candidate_cfds = provided_candidate_cfds;
|
|
2246
|
+
}
|
|
2247
|
+
|
|
2240
2248
|
if (!flush_options.allow_write_stall) {
|
|
2241
2249
|
int num_cfs_to_flush = 0;
|
|
2242
|
-
for (auto cfd :
|
|
2250
|
+
for (auto cfd : candidate_cfds) {
|
|
2243
2251
|
bool flush_needed = true;
|
|
2244
2252
|
s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
|
|
2245
2253
|
if (!s.ok()) {
|
|
2254
|
+
// Unref the newly generated candidate cfds (when not provided) in
|
|
2255
|
+
// `candidate_cfds`
|
|
2256
|
+
if (provided_candidate_cfds.empty()) {
|
|
2257
|
+
for (auto candidate_cfd : candidate_cfds) {
|
|
2258
|
+
candidate_cfd->UnrefAndTryDelete();
|
|
2259
|
+
}
|
|
2260
|
+
}
|
|
2246
2261
|
return s;
|
|
2247
2262
|
} else if (flush_needed) {
|
|
2248
2263
|
++num_cfs_to_flush;
|
|
2249
2264
|
}
|
|
2250
2265
|
}
|
|
2251
2266
|
if (0 == num_cfs_to_flush) {
|
|
2267
|
+
// Unref the newly generated candidate cfds (when not provided) in
|
|
2268
|
+
// `candidate_cfds`
|
|
2269
|
+
if (provided_candidate_cfds.empty()) {
|
|
2270
|
+
for (auto candidate_cfd : candidate_cfds) {
|
|
2271
|
+
candidate_cfd->UnrefAndTryDelete();
|
|
2272
|
+
}
|
|
2273
|
+
}
|
|
2252
2274
|
return s;
|
|
2253
2275
|
}
|
|
2254
2276
|
}
|
|
@@ -2269,15 +2291,16 @@ Status DBImpl::AtomicFlushMemTables(
|
|
|
2269
2291
|
}
|
|
2270
2292
|
WaitForPendingWrites();
|
|
2271
2293
|
|
|
2272
|
-
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
2278
|
-
|
|
2294
|
+
SelectColumnFamiliesForAtomicFlush(&cfds, candidate_cfds);
|
|
2295
|
+
|
|
2296
|
+
// Unref the newly generated candidate cfds (when not provided) in
|
|
2297
|
+
// `candidate_cfds`
|
|
2298
|
+
if (provided_candidate_cfds.empty()) {
|
|
2299
|
+
for (auto candidate_cfd : candidate_cfds) {
|
|
2300
|
+
candidate_cfd->UnrefAndTryDelete();
|
|
2279
2301
|
}
|
|
2280
2302
|
}
|
|
2303
|
+
|
|
2281
2304
|
for (auto cfd : cfds) {
|
|
2282
2305
|
if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) ||
|
|
2283
2306
|
flush_reason == FlushReason::kErrorRecoveryRetryFlush) {
|
|
@@ -2908,7 +2931,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
|
|
|
2908
2931
|
// All the CFD/bg_flush_arg in the FlushReq must have the same flush reason, so
|
|
2909
2932
|
// just grab the first one
|
|
2910
2933
|
#ifndef NDEBUG
|
|
2911
|
-
for (const auto bg_flush_arg : bg_flush_args) {
|
|
2934
|
+
for (const auto& bg_flush_arg : bg_flush_args) {
|
|
2912
2935
|
assert(bg_flush_arg.flush_reason_ == bg_flush_args[0].flush_reason_);
|
|
2913
2936
|
}
|
|
2914
2937
|
#endif /* !NDEBUG */
|
|
@@ -155,8 +155,10 @@ Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
|
|
|
155
155
|
}
|
|
156
156
|
|
|
157
157
|
Status DBImpl::TEST_AtomicFlushMemTables(
|
|
158
|
-
const autovector<ColumnFamilyData*>&
|
|
159
|
-
|
|
158
|
+
const autovector<ColumnFamilyData*>& provided_candidate_cfds,
|
|
159
|
+
const FlushOptions& flush_opts) {
|
|
160
|
+
return AtomicFlushMemTables(flush_opts, FlushReason::kTest,
|
|
161
|
+
provided_candidate_cfds);
|
|
160
162
|
}
|
|
161
163
|
|
|
162
164
|
Status DBImpl::TEST_WaitForBackgroundWork() {
|
|
@@ -216,7 +216,7 @@ Status ValidateOptionsByTable(
|
|
|
216
216
|
const DBOptions& db_opts,
|
|
217
217
|
const std::vector<ColumnFamilyDescriptor>& column_families) {
|
|
218
218
|
Status s;
|
|
219
|
-
for (auto cf : column_families) {
|
|
219
|
+
for (auto& cf : column_families) {
|
|
220
220
|
s = ValidateOptions(db_opts, cf.options);
|
|
221
221
|
if (!s.ok()) {
|
|
222
222
|
return s;
|
|
@@ -1211,6 +1211,9 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
1211
1211
|
// exceeded at this point so no new write (including current one) will go
|
|
1212
1212
|
// through until memory usage is decreased.
|
|
1213
1213
|
if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) {
|
|
1214
|
+
default_cf_internal_stats_->AddDBStats(
|
|
1215
|
+
InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts, 1,
|
|
1216
|
+
true /* concurrent */);
|
|
1214
1217
|
if (write_options.no_slowdown) {
|
|
1215
1218
|
status = Status::Incomplete("Write stall");
|
|
1216
1219
|
} else {
|
|
@@ -1543,14 +1546,40 @@ Status DBImpl::WriteRecoverableState() {
|
|
|
1543
1546
|
}
|
|
1544
1547
|
|
|
1545
1548
|
void DBImpl::SelectColumnFamiliesForAtomicFlush(
|
|
1546
|
-
autovector<ColumnFamilyData*>*
|
|
1547
|
-
|
|
1549
|
+
autovector<ColumnFamilyData*>* selected_cfds,
|
|
1550
|
+
const autovector<ColumnFamilyData*>& provided_candidate_cfds) {
|
|
1551
|
+
mutex_.AssertHeld();
|
|
1552
|
+
assert(selected_cfds);
|
|
1553
|
+
|
|
1554
|
+
autovector<ColumnFamilyData*> candidate_cfds;
|
|
1555
|
+
|
|
1556
|
+
// Generate candidate cfds if not provided
|
|
1557
|
+
if (provided_candidate_cfds.empty()) {
|
|
1558
|
+
for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
|
|
1559
|
+
if (!cfd->IsDropped() && cfd->initialized()) {
|
|
1560
|
+
cfd->Ref();
|
|
1561
|
+
candidate_cfds.push_back(cfd);
|
|
1562
|
+
}
|
|
1563
|
+
}
|
|
1564
|
+
} else {
|
|
1565
|
+
candidate_cfds = provided_candidate_cfds;
|
|
1566
|
+
}
|
|
1567
|
+
|
|
1568
|
+
for (ColumnFamilyData* cfd : candidate_cfds) {
|
|
1548
1569
|
if (cfd->IsDropped()) {
|
|
1549
1570
|
continue;
|
|
1550
1571
|
}
|
|
1551
1572
|
if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
|
|
1552
1573
|
!cached_recoverable_state_empty_.load()) {
|
|
1553
|
-
|
|
1574
|
+
selected_cfds->push_back(cfd);
|
|
1575
|
+
}
|
|
1576
|
+
}
|
|
1577
|
+
|
|
1578
|
+
// Unref the newly generated candidate cfds (when not provided) in
|
|
1579
|
+
// `candidate_cfds`
|
|
1580
|
+
if (provided_candidate_cfds.empty()) {
|
|
1581
|
+
for (auto candidate_cfd : candidate_cfds) {
|
|
1582
|
+
candidate_cfd->UnrefAndTryDelete();
|
|
1554
1583
|
}
|
|
1555
1584
|
}
|
|
1556
1585
|
}
|