@nxtedition/rocksdb 7.0.22 → 7.0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. package/binding.cc +9 -4
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +5 -0
  3. package/deps/rocksdb/rocksdb/Makefile +6 -2
  4. package/deps/rocksdb/rocksdb/TARGETS +14 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +4 -1
  6. package/deps/rocksdb/rocksdb/cache/cache_helpers.h +20 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +2 -2
  8. package/deps/rocksdb/rocksdb/cache/cache_test.cc +44 -31
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +491 -722
  10. package/deps/rocksdb/rocksdb/cache/clock_cache.h +468 -2
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -1
  12. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +51 -52
  13. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +28 -16
  14. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +12 -1
  15. package/deps/rocksdb/rocksdb/cache/lru_cache.h +1 -0
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +170 -36
  17. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +63 -36
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +4 -6
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +57 -38
  21. package/deps/rocksdb/rocksdb/db/blob/blob_read_request.h +58 -0
  22. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +164 -74
  23. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +42 -29
  24. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +419 -62
  25. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +208 -8
  26. package/deps/rocksdb/rocksdb/db/c.cc +68 -0
  27. package/deps/rocksdb/rocksdb/db/c_test.c +95 -2
  28. package/deps/rocksdb/rocksdb/db/column_family.cc +12 -3
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +92 -15
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +76 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +52 -1
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +30 -1
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +126 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +203 -1584
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +93 -26
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +87 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +314 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +328 -0
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -6
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +4 -1
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +7 -3
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +174 -33
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +474 -7
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -2
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +825 -0
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +46 -0
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +42 -0
  48. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +223 -0
  49. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +255 -0
  50. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +1253 -0
  51. package/deps/rocksdb/rocksdb/db/corruption_test.cc +32 -8
  52. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +3 -1
  53. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +13 -8
  54. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +376 -0
  55. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +103 -78
  56. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +4 -6
  57. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +0 -8
  58. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +10 -3
  59. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +21 -6
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +19 -1
  61. package/deps/rocksdb/rocksdb/db/db_iter.cc +91 -14
  62. package/deps/rocksdb/rocksdb/db/db_iter.h +5 -0
  63. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +33 -0
  64. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +79 -0
  65. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +2 -0
  66. package/deps/rocksdb/rocksdb/db/db_test2.cc +1 -1
  67. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +5 -2
  68. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +185 -0
  69. package/deps/rocksdb/rocksdb/db/dbformat.cc +1 -4
  70. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -8
  71. package/deps/rocksdb/rocksdb/db/internal_stats.cc +71 -29
  72. package/deps/rocksdb/rocksdb/db/internal_stats.h +160 -5
  73. package/deps/rocksdb/rocksdb/db/log_reader.cc +29 -3
  74. package/deps/rocksdb/rocksdb/db/log_reader.h +12 -3
  75. package/deps/rocksdb/rocksdb/db/repair_test.cc +1 -3
  76. package/deps/rocksdb/rocksdb/db/version_edit.cc +6 -0
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +93 -129
  78. package/deps/rocksdb/rocksdb/db/version_set.h +4 -4
  79. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
  80. package/deps/rocksdb/rocksdb/db/version_set_test.cc +42 -35
  81. package/deps/rocksdb/rocksdb/db/write_batch.cc +10 -2
  82. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +10 -4
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -3
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +3 -2
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -0
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +5 -1
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +140 -8
  89. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +12 -0
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +46 -7
  91. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +7 -0
  92. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +27 -7
  93. package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +8 -0
  94. package/deps/rocksdb/rocksdb/env/env_posix.cc +14 -0
  95. package/deps/rocksdb/rocksdb/env/env_test.cc +130 -1
  96. package/deps/rocksdb/rocksdb/env/fs_posix.cc +7 -1
  97. package/deps/rocksdb/rocksdb/env/io_posix.cc +18 -50
  98. package/deps/rocksdb/rocksdb/env/io_posix.h +53 -6
  99. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +8 -10
  100. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +3 -7
  101. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +239 -259
  102. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +84 -19
  103. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +24 -4
  104. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +1 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +31 -1
  106. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +11 -7
  107. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +2 -0
  108. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +14 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +20 -0
  110. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +37 -13
  111. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +7 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +14 -0
  113. package/deps/rocksdb/rocksdb/include/rocksdb/threadpool.h +9 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +13 -13
  115. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +12 -2
  116. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +38 -0
  117. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +7 -1
  118. package/deps/rocksdb/rocksdb/port/win/env_win.cc +17 -0
  119. package/deps/rocksdb/rocksdb/port/win/env_win.h +8 -0
  120. package/deps/rocksdb/rocksdb/port/win/io_win.cc +6 -3
  121. package/deps/rocksdb/rocksdb/prebuilds/linux-x64/node.napi.node +0 -0
  122. package/deps/rocksdb/rocksdb/src.mk +5 -0
  123. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -2
  124. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1 -1
  125. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +5 -2
  126. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
  127. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +15 -12
  128. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +5 -4
  129. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +2 -1
  130. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +1 -1
  131. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
  132. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +1 -2
  133. package/deps/rocksdb/rocksdb/table/get_context.cc +1 -0
  134. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -2
  135. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +24 -4
  136. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
  137. package/deps/rocksdb/rocksdb/util/compression.h +2 -0
  138. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +18 -1
  139. package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +67 -4
  140. package/deps/rocksdb/rocksdb/util/threadpool_imp.h +8 -0
  141. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +15 -12
  142. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -2
  143. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +1 -1
  144. package/deps/rocksdb/rocksdb.gyp +5 -1
  145. package/package.json +1 -1
  146. package/prebuilds/darwin-arm64/node.napi.node +0 -0
@@ -1907,114 +1907,62 @@ Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,

  void Version::MultiGetBlob(
  const ReadOptions& read_options, MultiGetRange& range,
- std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs) {
- if (read_options.read_tier == kBlockCacheTier) {
- Status s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
- for (const auto& elem : blob_rqs) {
- for (const auto& blob_rq : elem.second) {
- const KeyContext& key_context = blob_rq.second;
- assert(key_context.s);
- assert(key_context.s->ok());
- *(key_context.s) = s;
- assert(key_context.get_context);
- auto& get_context = *(key_context.get_context);
- get_context.MarkKeyMayExist();
- }
- }
- return;
- }
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs) {
+ assert(!blob_ctxs.empty());

- assert(!blob_rqs.empty());
- Status status;
+ autovector<BlobFileReadRequests> blob_reqs;

- for (auto& elem : blob_rqs) {
- const uint64_t blob_file_number = elem.first;
+ for (auto& ctx : blob_ctxs) {
+ const auto file_number = ctx.first;
+ const auto blob_file_meta = storage_info_.GetBlobFileMetaData(file_number);

- if (!storage_info_.GetBlobFileMetaData(blob_file_number)) {
- auto& blobs_in_file = elem.second;
- for (const auto& blob : blobs_in_file) {
- const KeyContext& key_context = blob.second;
- *(key_context.s) = Status::Corruption("Invalid blob file number");
- }
- continue;
- }
+ autovector<BlobReadRequest> blob_reqs_in_file;
+ BlobReadContexts& blobs_in_file = ctx.second;
+ for (const auto& blob : blobs_in_file) {
+ const BlobIndex& blob_index = blob.first;
+ const KeyContext& key_context = blob.second;

- CacheHandleGuard<BlobFileReader> blob_file_reader;
- assert(blob_source_);
- status =
- blob_source_->GetBlobFileReader(blob_file_number, &blob_file_reader);
- assert(!status.ok() || blob_file_reader.GetValue());
+ if (!blob_file_meta) {
+ *key_context.s = Status::Corruption("Invalid blob file number");
+ continue;
+ }

- auto& blobs_in_file = elem.second;
- if (!status.ok()) {
- for (const auto& blob : blobs_in_file) {
- const KeyContext& key_context = blob.second;
- *(key_context.s) = status;
+ if (blob_index.HasTTL() || blob_index.IsInlined()) {
+ *key_context.s =
+ Status::Corruption("Unexpected TTL/inlined blob index");
+ continue;
  }
- continue;
- }

- assert(blob_file_reader.GetValue());
- const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
- const CompressionType compression =
- blob_file_reader.GetValue()->GetCompressionType();
+ key_context.value->Reset();
+ blob_reqs_in_file.emplace_back(
+ key_context.ukey_with_ts, blob_index.offset(), blob_index.size(),
+ blob_index.compression(), key_context.value, key_context.s);
+ }
+ if (blob_reqs_in_file.size() > 0) {
+ const auto file_size = blob_file_meta->GetBlobFileSize();
+ blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file);
+ }
+ }

- // sort blobs_in_file by file offset.
- std::sort(
- blobs_in_file.begin(), blobs_in_file.end(),
- [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool {
- assert(lhs.first.file_number() == rhs.first.file_number());
- return lhs.first.offset() < rhs.first.offset();
- });
+ if (blob_reqs.size() > 0) {
+ blob_source_->MultiGetBlob(read_options, blob_reqs, /*bytes_read=*/nullptr);
+ }

- autovector<std::reference_wrapper<const KeyContext>> blob_read_key_contexts;
- autovector<std::reference_wrapper<const Slice>> user_keys;
- autovector<uint64_t> offsets;
- autovector<uint64_t> value_sizes;
- autovector<Status*> statuses;
- autovector<PinnableSlice*> values;
+ for (auto& ctx : blob_ctxs) {
+ BlobReadContexts& blobs_in_file = ctx.second;

  for (const auto& blob : blobs_in_file) {
- const auto& blob_index = blob.first;
  const KeyContext& key_context = blob.second;
- if (blob_index.HasTTL() || blob_index.IsInlined()) {
- *(key_context.s) =
- Status::Corruption("Unexpected TTL/inlined blob index");
- continue;
- }
- const uint64_t key_size = key_context.ukey_with_ts.size();
- const uint64_t offset = blob_index.offset();
- const uint64_t value_size = blob_index.size();
- if (!IsValidBlobOffset(offset, key_size, value_size, file_size)) {
- *(key_context.s) = Status::Corruption("Invalid blob offset");
- continue;
- }
- if (blob_index.compression() != compression) {
- *(key_context.s) =
- Status::Corruption("Compression type mismatch when reading a blob");
- continue;
- }
- blob_read_key_contexts.emplace_back(std::cref(key_context));
- user_keys.emplace_back(std::cref(key_context.ukey_with_ts));
- offsets.push_back(blob_index.offset());
- value_sizes.push_back(blob_index.size());
- statuses.push_back(key_context.s);
- values.push_back(key_context.value);
- }
- blob_file_reader.GetValue()->MultiGetBlob(read_options, user_keys, offsets,
- value_sizes, statuses, values,
- /*bytes_read=*/nullptr);
- size_t num = blob_read_key_contexts.size();
- assert(num == user_keys.size());
- assert(num == offsets.size());
- assert(num == value_sizes.size());
- assert(num == statuses.size());
- assert(num == values.size());
- for (size_t i = 0; i < num; ++i) {
- if (statuses[i]->ok()) {
- range.AddValueSize(blob_read_key_contexts[i].get().value->size());
+ if (key_context.s->ok()) {
+ range.AddValueSize(key_context.value->size());
  if (range.GetValueSize() > read_options.value_size_soft_limit) {
- *(blob_read_key_contexts[i].get().s) = Status::Aborted();
+ *key_context.s = Status::Aborted();
  }
+ } else if (key_context.s->IsIncomplete()) {
+ // read_options.read_tier == kBlockCacheTier
+ // Cannot read blob(s): no disk I/O allowed
+ assert(key_context.get_context);
+ auto& get_context = *(key_context.get_context);
+ get_context.MarkKeyMayExist();
  }
  }
  }
@@ -2253,7 +2201,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,

  MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
  // blob_file => [[blob_idx, it], ...]
- std::unordered_map<uint64_t, BlobReadRequests> blob_rqs;
+ std::unordered_map<uint64_t, BlobReadContexts> blob_ctxs;
  int prev_level = -1;

  while (!fp.IsSearchEnded()) {
@@ -2270,7 +2218,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
  // Call MultiGetFromSST for looking up a single file
  s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
  fp.GetHitFileLevel(), fp.IsHitFileLastInLevel(), f,
- blob_rqs, num_filter_read, num_index_read,
+ blob_ctxs, num_filter_read, num_index_read,
  num_sst_read);
  if (fp.GetHitFileLevel() == 0) {
  dump_stats_for_l0_file = true;
@@ -2285,7 +2233,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
  while (f != nullptr) {
  mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
  read_options, fp.CurrentFileRange(), fp.GetHitFileLevel(),
- fp.IsHitFileLastInLevel(), f, blob_rqs, num_filter_read,
+ fp.IsHitFileLastInLevel(), f, blob_ctxs, num_filter_read,
  num_index_read, num_sst_read));
  if (fp.KeyMaySpanNextFile()) {
  break;
@@ -2358,8 +2306,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
  num_level_read);
  }

- if (s.ok() && !blob_rqs.empty()) {
- MultiGetBlob(read_options, keys_with_blobs_range, blob_rqs);
+ if (s.ok() && !blob_ctxs.empty()) {
+ MultiGetBlob(read_options, keys_with_blobs_range, blob_ctxs);
  }

  // Process any left over keys
@@ -2709,6 +2657,16 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
  void VersionStorageInfo::ComputeCompactionScore(
  const ImmutableOptions& immutable_options,
  const MutableCFOptions& mutable_cf_options) {
+ double total_downcompact_bytes = 0.0;
+ // Historically, score is defined as actual bytes in a level divided by
+ // the level's target size, and 1.0 is the threshold for triggering
+ // compaction. Higher score means higher prioritization.
+ // Now we keep the compaction triggering condition, but consider more
+ // factors for priorization, while still keeping the 1.0 threshold.
+ // In order to provide flexibility for reducing score while still
+ // maintaining it to be over 1.0, we scale the original score by 10x
+ // if it is larger than 1.0.
+ const double kScoreScale = 10.0;
  for (int level = 0; level <= MaxInputLevel(); level++) {
  double score;
  if (level == 0) {
@@ -2726,6 +2684,7 @@ void VersionStorageInfo::ComputeCompactionScore(
  int num_sorted_runs = 0;
  uint64_t total_size = 0;
  for (auto* f : files_[level]) {
+ total_downcompact_bytes += static_cast<double>(f->fd.GetFileSize());
  if (!f->being_compacted) {
  total_size += f->compensated_file_size;
  num_sorted_runs++;
@@ -2789,18 +2748,40 @@ void VersionStorageInfo::ComputeCompactionScore(
  }
  score =
  std::max(score, static_cast<double>(total_size) / l0_target_size);
+ if (immutable_options.level_compaction_dynamic_level_bytes &&
+ score > 1.0) {
+ score *= kScoreScale;
+ }
  }
  }
  } else {
  // Compute the ratio of current size to size limit.
  uint64_t level_bytes_no_compacting = 0;
+ uint64_t level_total_bytes = 0;
  for (auto f : files_[level]) {
+ level_total_bytes += f->fd.GetFileSize();
  if (!f->being_compacted) {
  level_bytes_no_compacting += f->compensated_file_size;
  }
  }
- score = static_cast<double>(level_bytes_no_compacting) /
- MaxBytesForLevel(level);
+ if (!immutable_options.level_compaction_dynamic_level_bytes ||
+ level_bytes_no_compacting < MaxBytesForLevel(level)) {
+ score = static_cast<double>(level_bytes_no_compacting) /
+ MaxBytesForLevel(level);
+ } else {
+ // If there are a large mount of data being compacted down to the
+ // current level soon, we would de-prioritize compaction from
+ // a level where the incoming data would be a large ratio. We do
+ // it by dividing level size not by target level size, but
+ // the target size and the incoming compaction bytes.
+ score = static_cast<double>(level_bytes_no_compacting) /
+ (MaxBytesForLevel(level) + total_downcompact_bytes) *
+ kScoreScale;
+ }
+ if (level_total_bytes > MaxBytesForLevel(level)) {
+ total_downcompact_bytes +=
+ static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
+ }
  }
  compaction_level_[level] = level;
  compaction_score_[level] = score;
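The scoring rule added above divides a level's uncompacted bytes by its target size plus the bytes expected to be compacted down into it, then multiplies by kScoreScale so the result stays comparable to the 1.0 trigger threshold. A minimal standalone sketch of that arithmetic (illustrative only; the free function and parameter names are hypothetical stand-ins for the bookkeeping inside VersionStorageInfo::ComputeCompactionScore):

```cpp
#include <cstdint>

// Illustrative sketch of the adjusted level-score arithmetic from the diff
// above. All names here are hypothetical stand-ins for the real per-level
// bookkeeping in VersionStorageInfo::ComputeCompactionScore.
double AdjustedLevelScore(uint64_t level_bytes_no_compacting,
                          uint64_t level_target_bytes,
                          double incoming_downcompact_bytes,
                          bool dynamic_level_bytes) {
  const double kScoreScale = 10.0;  // same constant introduced by the diff
  if (!dynamic_level_bytes ||
      level_bytes_no_compacting < level_target_bytes) {
    // Classic definition: actual bytes divided by the level's target size.
    return static_cast<double>(level_bytes_no_compacting) /
           static_cast<double>(level_target_bytes);
  }
  // Over-target level: also account for bytes that upper levels are about to
  // compact down into this level, then rescale so the score remains
  // comparable to (and above) the 1.0 trigger threshold.
  return static_cast<double>(level_bytes_no_compacting) /
         (static_cast<double>(level_target_bytes) +
          incoming_downcompact_bytes) *
         kScoreScale;
}
```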
@@ -3199,6 +3180,15 @@ void SortFileByOverlappingRatio(

  std::partial_sort(temp->begin(), temp->begin() + num_to_sort, temp->end(),
  [&](const Fsize& f1, const Fsize& f2) -> bool {
+ // If score is the same, pick file with smaller keys.
+ // This makes the algorithm more deterministic, and also
+ // help the trivial move case to have more files to
+ // extend.
+ if (file_to_order[f1.file->fd.GetNumber()] ==
+ file_to_order[f2.file->fd.GetNumber()]) {
+ return icmp.Compare(f1.file->smallest,
+ f2.file->smallest) < 0;
+ }
  return file_to_order[f1.file->fd.GetNumber()] <
  file_to_order[f2.file->fd.GetNumber()];
  });
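The comparator change above adds a secondary ordering criterion so ties no longer depend on input order. A simplified standalone sketch of the same idea, with hypothetical types in place of RocksDB's Fsize/FileMetaData:

```cpp
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

// Hypothetical stand-in for the per-file data used by the real comparator.
struct FileEntry {
  uint64_t order;        // primary priority, e.g. overlapping-ratio order
  std::string smallest;  // smallest key in the file
};

// Sort by the primary order, breaking ties on the smallest key so the
// resulting order is deterministic regardless of input order.
void SortFilesDeterministically(std::vector<FileEntry>* files) {
  std::sort(files->begin(), files->end(),
            [](const FileEntry& a, const FileEntry& b) {
              if (a.order == b.order) {
                return a.smallest < b.smallest;
              }
              return a.order < b.order;
            });
}
```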
@@ -3827,13 +3817,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
  // No compaction from L1+ needs to be scheduled.
  base_level_ = num_levels_ - 1;
  } else {
- uint64_t l0_size = 0;
- for (const auto& f : files_[0]) {
- l0_size += f->fd.GetFileSize();
- }
-
- uint64_t base_bytes_max =
- std::max(options.max_bytes_for_level_base, l0_size);
+ uint64_t base_bytes_max = options.max_bytes_for_level_base;
  uint64_t base_bytes_min = static_cast<uint64_t>(
  base_bytes_max / options.max_bytes_for_level_multiplier);

@@ -3875,26 +3859,6 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,

  level_multiplier_ = options.max_bytes_for_level_multiplier;
  assert(base_level_size > 0);
- if (l0_size > base_level_size &&
- (l0_size > options.max_bytes_for_level_base ||
- static_cast<int>(files_[0].size() / 2) >=
- options.level0_file_num_compaction_trigger)) {
- // We adjust the base level according to actual L0 size, and adjust
- // the level multiplier accordingly, when:
- // 1. the L0 size is larger than level size base, or
- // 2. number of L0 files reaches twice the L0->L1 compaction trigger
- // We don't do this otherwise to keep the LSM-tree structure stable
- // unless the L0 compaction is backlogged.
- base_level_size = l0_size;
- if (base_level_ == num_levels_ - 1) {
- level_multiplier_ = 1.0;
- } else {
- level_multiplier_ = std::pow(
- static_cast<double>(max_level_size) /
- static_cast<double>(base_level_size),
- 1.0 / static_cast<double>(num_levels_ - base_level_ - 1));
- }
- }

  uint64_t level_size = base_level_size;
  for (int i = base_level_; i < num_levels_; i++) {
@@ -860,11 +860,11 @@ class Version {
  FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
  uint64_t* bytes_read) const;

- using BlobReadRequest =
+ using BlobReadContext =
  std::pair<BlobIndex, std::reference_wrapper<const KeyContext>>;
- using BlobReadRequests = std::vector<BlobReadRequest>;
+ using BlobReadContexts = std::vector<BlobReadContext>;
  void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range,
- std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs);
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs);

  // Loads some stats information from files (if update_stats is set) and
  // populates derived data structures. Call without mutex held. It needs to be
@@ -989,7 +989,7 @@ class Version {
  /* ret_type */ Status, /* func_name */ MultiGetFromSST,
  const ReadOptions& read_options, MultiGetRange file_range,
  int hit_file_level, bool is_hit_file_last_in_level, FdWithKeyRange* f,
- std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
  uint64_t& num_filter_read, uint64_t& num_index_read,
  uint64_t& num_sst_read);

@@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE {
  DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
  (const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
  bool is_hit_file_last_in_level, FdWithKeyRange* f,
- std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
  uint64_t& num_filter_read, uint64_t& num_index_read, uint64_t& num_sst_read) {
  bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
  get_perf_context()->per_level_perf_context_enabled;
@@ -110,7 +110,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
  Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
  if (tmp_s.ok()) {
  const uint64_t blob_file_num = blob_index.file_number();
- blob_rqs[blob_file_num].emplace_back(
+ blob_ctxs[blob_file_num].emplace_back(
  std::make_pair(blob_index, std::cref(*iter)));
  } else {
  *(iter->s) = tmp_s;
@@ -376,73 +376,80 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
  ASSERT_EQ(2, vstorage_.base_level());
  // level multiplier should be 3.5
  ASSERT_EQ(vstorage_.level_multiplier(), 5.0);
- // Level size should be around 30,000, 105,000, 367,500
  ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2));
  ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
  ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+ vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ // Only L0 hits compaction.
+ ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0);
  }

  TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
  ioptions_.level_compaction_dynamic_level_bytes = true;
  mutable_cf_options_.max_bytes_for_level_base = 10000;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
- mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;

  Add(0, 11U, "1", "2", 10000U);
  Add(0, 12U, "1", "2", 10000U);
  Add(0, 13U, "1", "2", 10000U);

+ // Level size should be around 10,000, 10,290, 51,450, 257,250
  Add(5, 4U, "1", "2", 1286250U);
- Add(4, 5U, "1", "2", 200000U);
- Add(3, 6U, "1", "2", 40000U);
- Add(2, 7U, "1", "2", 8000U);
+ Add(4, 5U, "1", "2", 258000U); // unadjusted score 1.003
+ Add(3, 6U, "1", "2", 53000U); // unadjusted score 1.03
+ Add(2, 7U, "1", "2", 20000U); // unadjusted score 1.94

  UpdateVersionStorageInfo();

  ASSERT_EQ(0, logger_->log_count);
- ASSERT_EQ(2, vstorage_.base_level());
- // level multiplier should be 3.5
- ASSERT_LT(vstorage_.level_multiplier(), 3.6);
- ASSERT_GT(vstorage_.level_multiplier(), 3.4);
- // Level size should be around 30,000, 105,000, 367,500
- ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2));
- ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U);
- ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U);
- ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U);
- ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U);
+ ASSERT_EQ(1, vstorage_.base_level());
+ ASSERT_EQ(10000U, vstorage_.MaxBytesForLevel(1));
+ ASSERT_EQ(10290U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+ ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+ vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ // Although L2 and l3 have higher unadjusted compaction score, considering
+ // a relatively large L0 being compacted down soon, L4 is picked up for
+ // compaction.
+ // L0 is still picked up for oversizing.
+ ASSERT_EQ(0, vstorage_.CompactionScoreLevel(0));
+ ASSERT_EQ(4, vstorage_.CompactionScoreLevel(1));
  }

  TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
  ioptions_.level_compaction_dynamic_level_bytes = true;
- mutable_cf_options_.max_bytes_for_level_base = 10000;
+ mutable_cf_options_.max_bytes_for_level_base = 20000;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
- mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 5;

- Add(0, 11U, "1", "2", 5000U);
- Add(0, 12U, "1", "2", 5000U);
- Add(0, 13U, "1", "2", 5000U);
- Add(0, 14U, "1", "2", 5000U);
- Add(0, 15U, "1", "2", 5000U);
- Add(0, 16U, "1", "2", 5000U);
+ Add(0, 11U, "1", "2", 2500U);
+ Add(0, 12U, "1", "2", 2500U);
+ Add(0, 13U, "1", "2", 2500U);
+ Add(0, 14U, "1", "2", 2500U);

+ // Level size should be around 20,000, 53000, 258000
  Add(5, 4U, "1", "2", 1286250U);
- Add(4, 5U, "1", "2", 200000U);
- Add(3, 6U, "1", "2", 40000U);
- Add(2, 7U, "1", "2", 8000U);
+ Add(4, 5U, "1", "2", 260000U); // Unadjusted score 1.01, adjusted about 4.3
+ Add(3, 6U, "1", "2", 85000U); // Unadjusted score 1.42, adjusted about 11.6
+ Add(2, 7U, "1", "2", 30000); // Unadjusted score 1.5, adjusted about 10.0

  UpdateVersionStorageInfo();

  ASSERT_EQ(0, logger_->log_count);
  ASSERT_EQ(2, vstorage_.base_level());
- // level multiplier should be 3.5
- ASSERT_LT(vstorage_.level_multiplier(), 3.6);
- ASSERT_GT(vstorage_.level_multiplier(), 3.4);
- // Level size should be around 30,000, 105,000, 367,500
- ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2));
- ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U);
- ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U);
- ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U);
- ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U);
+ ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2));
+
+ vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ // Although L2 has higher unadjusted compaction score, considering
+ // a relatively large L0 being compacted down soon, L3 is picked up for
+ // compaction.
+
+ ASSERT_EQ(3, vstorage_.CompactionScoreLevel(0));
+ ASSERT_EQ(2, vstorage_.CompactionScoreLevel(1));
+ ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2));
  }

  TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
@@ -3063,7 +3063,8 @@ size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
  }

  Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
- size_t bytes_per_key) {
+ size_t bytes_per_key,
+ uint64_t* checksum) {
  if (bytes_per_key == 0) {
  if (wb->prot_info_ != nullptr) {
  wb->prot_info_.reset();
@@ -3076,7 +3077,14 @@ Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
  if (wb->prot_info_ == nullptr) {
  wb->prot_info_.reset(new WriteBatch::ProtectionInfo());
  ProtectionInfoUpdater prot_info_updater(wb->prot_info_.get());
- return wb->Iterate(&prot_info_updater);
+ Status s = wb->Iterate(&prot_info_updater);
+ if (s.ok() && checksum != nullptr) {
+ uint64_t expected_hash = XXH3_64bits(wb->rep_.data(), wb->rep_.size());
+ if (expected_hash != *checksum) {
+ return Status::Corruption("Write batch content corrupted.");
+ }
+ }
+ return s;
  } else {
  // Already protected.
  return Status::OK();
@@ -240,7 +240,10 @@ class WriteBatchInternal {
  return wb.has_key_with_ts_;
  }

- static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key);
+ // Update per-key value protection information on this write batch.
+ // If checksum is provided, the batch content is verfied against the checksum.
+ static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key,
+ uint64_t* checksum = nullptr);
  };

  // LocalSavePoint is similar to a scope guard
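The checksum path added above recomputes an XXH3 hash over the batch's serialized contents and compares it with the checksum supplied by the caller. A minimal sketch of that verification step, assuming an xxHash build that exposes XXH3_64bits(); `rep` is a hypothetical stand-in for WriteBatch's internal serialized representation (wb->rep_), not the RocksDB API itself:

```cpp
#include <cstdint>
#include <string>

#include "xxhash.h"  // assumes xxHash with XXH3 support is available

// Recompute the 64-bit XXH3 hash of the serialized batch contents and compare
// it against the checksum the caller captured earlier. Returns true when the
// contents still match, i.e. the batch was not corrupted in transit.
bool BatchContentMatchesChecksum(const std::string& rep,
                                 uint64_t expected_checksum) {
  const uint64_t actual = XXH3_64bits(rep.data(), rep.size());
  return actual == expected_checksum;
}
```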
@@ -148,7 +148,7 @@ void DbVerificationThread(void* v) {
  }
  }

- void SnapshotGcThread(void* v) {
+ void TimestampedSnapshotsThread(void* v) {
  assert(FLAGS_create_timestamped_snapshot_one_in > 0);
  auto* thread = reinterpret_cast<ThreadState*>(v);
  assert(thread);
@@ -169,6 +169,14 @@ void SnapshotGcThread(void* v) {
  }

  uint64_t now = db_stress_env->NowNanos();
+ std::pair<Status, std::shared_ptr<const Snapshot>> res =
+ stress_test->CreateTimestampedSnapshot(now);
+ if (res.first.ok()) {
+ assert(res.second);
+ assert(res.second->GetTimestamp() == now);
+ } else {
+ assert(!res.second);
+ }
  constexpr uint64_t time_diff = static_cast<uint64_t>(1000) * 1000 * 1000;
  stress_test->ReleaseOldTimestampedSnapshots(now - time_diff);

@@ -267,15 +275,13 @@ uint32_t GetValueBase(Slice s) {
  return res;
  }

- std::string NowNanosStr() {
+ std::string GetNowNanos() {
  uint64_t t = db_stress_env->NowNanos();
  std::string ret;
  PutFixed64(&ret, t);
  return ret;
  }

- std::string GenerateTimestampForRead() { return NowNanosStr(); }
-
  namespace {

  class MyXXH64Checksum : public FileChecksumGenerator {
@@ -110,6 +110,7 @@ DECLARE_int32(open_files);
  DECLARE_int64(compressed_cache_size);
  DECLARE_int32(compressed_cache_numshardbits);
  DECLARE_int32(compaction_style);
+ DECLARE_int32(compaction_pri);
  DECLARE_int32(num_levels);
  DECLARE_int32(level0_file_num_compaction_trigger);
  DECLARE_int32(level0_slowdown_writes_trigger);
@@ -592,7 +593,7 @@ extern void PoolSizeChangeThread(void* v);

  extern void DbVerificationThread(void* v);

- extern void SnapshotGcThread(void* v);
+ extern void TimestampedSnapshotsThread(void* v);

  extern void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz);

@@ -612,8 +613,7 @@ extern void CheckAndSetOptionsForMultiOpsTxnStressTest();
  extern void InitializeHotKeyGenerator(double alpha);
  extern int64_t GetOneHotKeyID(double rand_seed, int64_t max_key);

- extern std::string GenerateTimestampForRead();
- extern std::string NowNanosStr();
+ extern std::string GetNowNanos();

  std::shared_ptr<FileChecksumGenFactory> GetFileChecksumImpl(
  const std::string& name);
@@ -105,9 +105,10 @@ bool RunStressTest(StressTest* stress) {
  &continuous_verification_thread);
  }

- ThreadState snapshots_gc_thread(0, &shared);
+ ThreadState timestamped_snapshots_thread(0, &shared);
  if (FLAGS_create_timestamped_snapshot_one_in > 0) {
- db_stress_env->StartThread(SnapshotGcThread, &snapshots_gc_thread);
+ db_stress_env->StartThread(TimestampedSnapshotsThread,
+ &timestamped_snapshots_thread);
  }

  // Each thread goes through the following states:
@@ -200,6 +200,10 @@ DEFINE_int32(
  DEFINE_int32(compaction_style, ROCKSDB_NAMESPACE::Options().compaction_style,
  "");

+ DEFINE_int32(compaction_pri, ROCKSDB_NAMESPACE::Options().compaction_pri,
+ "Which file from a level should be picked to merge to the next "
+ "level in level-based compaction");
+

  DEFINE_int32(num_levels, ROCKSDB_NAMESPACE::Options().num_levels,
  "Number of levels in the DB");
@@ -75,7 +75,8 @@ class SharedState {
  should_stop_test_(false),
  no_overwrite_ids_(GenerateNoOverwriteIds()),
  expected_state_manager_(nullptr),
- printing_verification_results_(false) {
+ printing_verification_results_(false),
+ start_timestamp_(Env::Default()->NowNanos()) {
  Status status;
  // TODO: We should introduce a way to explicitly disable verification
  // during shutdown. When that is disabled and FLAGS_expected_values_dir
@@ -303,6 +304,8 @@ class SharedState {
  printing_verification_results_.store(false, std::memory_order_relaxed);
  }

+ uint64_t GetStartTimestamp() const { return start_timestamp_; }
+
  private:
  static void IgnoreReadErrorCallback(void*) {
  ignore_read_error = true;
@@ -365,6 +368,7 @@ class SharedState {
  // and storing it in the container may require copying depending on the impl.
  std::vector<std::unique_ptr<port::Mutex[]>> key_locks_;
  std::atomic<bool> printing_verification_results_;
+ const uint64_t start_timestamp_;
  };

  // Per-thread state for concurrent executions of the same benchmark.