@nxtedition/rocksdb 7.1.2 → 7.1.5

This diff shows the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (153)
  1. package/binding.cc +26 -0
  2. package/deps/rocksdb/iostats.patch +19 -0
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +15 -1
  4. package/deps/rocksdb/rocksdb/cache/cache_test.cc +93 -58
  5. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +88 -40
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.h +57 -32
  7. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +103 -28
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +33 -1
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +177 -38
  10. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +3 -1
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +2 -2
  12. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +125 -71
  13. package/deps/rocksdb/rocksdb/crash_test.mk +15 -1
  14. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +2 -2
  15. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +1 -1
  16. package/deps/rocksdb/rocksdb/db/blob/blob_log_format.cc +3 -5
  17. package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.cc +25 -19
  18. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
  19. package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +36 -0
  20. package/deps/rocksdb/rocksdb/db/column_family.cc +2 -15
  21. package/deps/rocksdb/rocksdb/db/column_family_test.cc +17 -4
  22. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +8 -8
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +0 -7
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +5 -0
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +50 -52
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +33 -11
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +41 -10
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +1 -2
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +143 -2
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +43 -18
  31. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +48 -65
  32. package/deps/rocksdb/rocksdb/db/corruption_test.cc +1 -0
  33. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +73 -4
  34. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +17 -8
  35. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +71 -2
  36. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +144 -33
  37. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +18 -35
  38. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +11 -5
  39. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +7 -7
  40. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +15 -8
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +2 -1
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +3 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +11 -0
  44. package/deps/rocksdb/rocksdb/db/db_iter.cc +69 -11
  45. package/deps/rocksdb/rocksdb/db/db_iter.h +16 -0
  46. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +2 -1
  47. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +42 -0
  48. package/deps/rocksdb/rocksdb/db/db_test.cc +61 -28
  49. package/deps/rocksdb/rocksdb/db/db_test2.cc +18 -7
  50. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +17 -0
  51. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +61 -0
  52. package/deps/rocksdb/rocksdb/db/db_write_test.cc +130 -0
  53. package/deps/rocksdb/rocksdb/db/experimental.cc +7 -8
  54. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +1 -2
  55. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -7
  56. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -1
  57. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +4 -2
  58. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +1 -1
  59. package/deps/rocksdb/rocksdb/db/log_reader.cc +48 -11
  60. package/deps/rocksdb/rocksdb/db/log_reader.h +8 -2
  61. package/deps/rocksdb/rocksdb/db/log_test.cc +10 -1
  62. package/deps/rocksdb/rocksdb/db/log_writer.cc +7 -1
  63. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +4 -4
  64. package/deps/rocksdb/rocksdb/db/memtable.cc +49 -14
  65. package/deps/rocksdb/rocksdb/db/memtable.h +60 -14
  66. package/deps/rocksdb/rocksdb/db/memtable_list.cc +14 -8
  67. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +30 -10
  68. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +5 -5
  69. package/deps/rocksdb/rocksdb/db/pinned_iterators_manager.h +5 -0
  70. package/deps/rocksdb/rocksdb/db/repair.cc +2 -3
  71. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +3 -7
  72. package/deps/rocksdb/rocksdb/db/table_cache.cc +72 -0
  73. package/deps/rocksdb/rocksdb/db/table_cache.h +19 -1
  74. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +8 -14
  75. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +2 -2
  76. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +35 -64
  77. package/deps/rocksdb/rocksdb/db/version_edit.cc +3 -32
  78. package/deps/rocksdb/rocksdb/db/version_edit.h +2 -12
  79. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +10 -23
  80. package/deps/rocksdb/rocksdb/db/version_set.cc +34 -10
  81. package/deps/rocksdb/rocksdb/db/version_set.h +3 -3
  82. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -6
  83. package/deps/rocksdb/rocksdb/db/version_set_test.cc +17 -15
  84. package/deps/rocksdb/rocksdb/db/wal_manager.cc +0 -4
  85. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +2 -1
  86. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +137 -42
  87. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +21 -0
  88. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +1 -0
  89. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
  90. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +4 -4
  91. package/deps/rocksdb/rocksdb/db/write_thread.cc +51 -46
  92. package/deps/rocksdb/rocksdb/db/write_thread.h +0 -4
  93. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  94. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +6 -0
  95. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +6 -0
  96. package/deps/rocksdb/rocksdb/env/env_posix.cc +1 -1
  97. package/deps/rocksdb/rocksdb/env/env_test.cc +38 -8
  98. package/deps/rocksdb/rocksdb/env/file_system.cc +20 -0
  99. package/deps/rocksdb/rocksdb/env/fs_posix.cc +2 -46
  100. package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -0
  101. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +110 -5
  102. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +7 -0
  103. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +14 -1
  104. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +4 -0
  105. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +1 -1
  106. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +7 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +10 -3
  108. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +3 -1
  109. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +1 -1
  110. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +2 -0
  111. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +12 -0
  112. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +9 -13
  113. package/deps/rocksdb/rocksdb/logging/env_logger.h +39 -13
  114. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +1 -1
  115. package/deps/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc +1 -1
  116. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +6 -0
  117. package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +4 -1
  118. package/deps/rocksdb/rocksdb/options/cf_options.cc +6 -3
  119. package/deps/rocksdb/rocksdb/options/cf_options.h +6 -5
  120. package/deps/rocksdb/rocksdb/options/options_helper.cc +2 -1
  121. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +1 -0
  122. package/deps/rocksdb/rocksdb/options/options_test.cc +4 -2
  123. package/deps/rocksdb/rocksdb/port/util_logger.h +1 -3
  124. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +50 -8
  125. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -0
  126. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +7 -0
  127. package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +28 -10
  128. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +1 -1
  129. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +5 -2
  130. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +1 -0
  131. package/deps/rocksdb/rocksdb/table/get_context.cc +16 -6
  132. package/deps/rocksdb/rocksdb/table/table_reader.h +9 -0
  133. package/deps/rocksdb/rocksdb/table/table_test.cc +2 -1
  134. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +14 -1
  135. package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +5 -2
  136. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +7 -8
  137. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +6 -6
  138. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +1 -1
  139. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +2 -0
  140. package/deps/rocksdb/rocksdb/util/stderr_logger.h +13 -0
  141. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +55 -46
  142. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +2 -1
  143. package/deps/rocksdb/rocksdb/utilities/counted_fs.cc +10 -0
  144. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +2 -2
  145. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +2 -2
  146. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
  147. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +2 -2
  148. package/index.js +7 -1
  149. package/package.json +1 -1
  150. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  151. package/prebuilds/darwin-x64/node.napi.node +0 -0
  152. package/prebuilds/linux-x64/node.napi.node +0 -0
  153. package/deps/rocksdb/rocksdb/logging/posix_logger.h +0 -179
package/deps/rocksdb/rocksdb/db/log_reader.h

@@ -151,8 +151,10 @@ class Reader {
   std::unique_ptr<char[]> uncompressed_buffer_;
   // Reusable uncompressed record
   std::string uncompressed_record_;
-  // Used for stream hashing log record
+  // Used for stream hashing fragment content in ReadRecord()
   XXH3_state_t* hash_state_;
+  // Used for stream hashing uncompressed buffer in ReadPhysicalRecord()
+  XXH3_state_t* uncompress_hash_state_;
 
   // Extend record types with the following special values
   enum {
@@ -173,7 +175,11 @@ class Reader {
   };
 
   // Return type, or one of the preceding special values
-  unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size);
+  // If WAL compression is enabled, fragment_checksum is the checksum of the
+  // fragment computed from the original buffer containing the uncompressed
+  // fragment.
+  unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size,
+                                  uint64_t* fragment_checksum = nullptr);
 
   // Read some more
   bool ReadMore(size_t* drop_size, int *error);
package/deps/rocksdb/rocksdb/db/log_test.cc

@@ -194,8 +194,17 @@ class LogTest
     std::string scratch;
     Slice record;
     bool ret = false;
-    ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode);
+    uint64_t record_checksum;
+    ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode,
+                              &record_checksum);
     if (ret) {
+      if (!allow_retry_read_) {
+        // allow_retry_read_ means using FragmentBufferedReader which does not
+        // support record checksum yet.
+        uint64_t actual_record_checksum =
+            XXH3_64bits(record.data(), record.size());
+        assert(actual_record_checksum == record_checksum);
+      }
       return record.ToString();
     } else {
       return "EOF";
package/deps/rocksdb/rocksdb/db/log_writer.cc

@@ -13,6 +13,7 @@
 
 #include "file/writable_file_writer.h"
 #include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
 

@@ -44,7 +45,12 @@ Writer::~Writer() {
   }
 }
 
-IOStatus Writer::WriteBuffer() { return dest_->Flush(); }
+IOStatus Writer::WriteBuffer() {
+  if (dest_->seen_error()) {
+    return IOStatus::IOError("Seen error. Skip writing buffer.");
+  }
+  return dest_->Flush();
+}
 
 IOStatus Writer::Close() {
   IOStatus s;
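The WriteBuffer() change is an error-latching guard: once the underlying WritableFileWriter has observed an I/O error, later flushes fail fast instead of writing more data to a file in an unknown state. A simplified sketch of the pattern with stand-in types (BufferedWriter and its members are illustrative, not RocksDB's classes):

    #include <string>
    #include <utility>

    struct IOStatus {
      bool ok = true;
      std::string msg;
      static IOStatus OK() { return {}; }
      static IOStatus IOError(std::string m) { return {false, std::move(m)}; }
    };

    class BufferedWriter {
     public:
      IOStatus Flush() {
        if (seen_error_) {
          // Mirrors Writer::WriteBuffer() in the hunk above.
          return IOStatus::IOError("Seen error. Skip writing buffer.");
        }
        IOStatus s = DoWrite();
        if (!s.ok) seen_error_ = true;  // latch the first failure
        return s;
      }

     private:
      IOStatus DoWrite() { return IOStatus::OK(); }  // real I/O elided
      bool seen_error_ = false;
    };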
package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc

@@ -52,7 +52,7 @@ class ManualCompactionTest : public testing::Test {
     // Get rid of any state from an old run.
     dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath(
         "rocksdb_manual_compaction_test");
-    DestroyDB(dbname_, Options());
+    EXPECT_OK(DestroyDB(dbname_, Options()));
   }
 
   std::string dbname_;

@@ -130,7 +130,7 @@ TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
 
     delete options.compaction_filter;
    delete db;
-    DestroyDB(dbname_, options);
+    ASSERT_OK(DestroyDB(dbname_, options));
   }
 }
 

@@ -186,7 +186,7 @@ TEST_F(ManualCompactionTest, Test) {
 
   // close database
   delete db;
-  DestroyDB(dbname_, Options());
+  ASSERT_OK(DestroyDB(dbname_, Options()));
 }
 
 TEST_F(ManualCompactionTest, SkipLevel) {

@@ -298,7 +298,7 @@ TEST_F(ManualCompactionTest, SkipLevel) {
 
   delete filter;
   delete db;
-  DestroyDB(dbname_, options);
+  ASSERT_OK(DestroyDB(dbname_, options));
 }
 
 }  // anonymous namespace
package/deps/rocksdb/rocksdb/db/memtable.cc

@@ -21,6 +21,7 @@
 #include "db/pinned_iterators_manager.h"
 #include "db/range_tombstone_fragmenter.h"
 #include "db/read_callback.h"
+#include "db/wide/wide_column_serialization.h"
 #include "logging/logging.h"
 #include "memory/arena.h"
 #include "memory/memory_usage.h"

@@ -445,16 +446,28 @@ InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
 }
 
 FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
-    const ReadOptions& read_options, SequenceNumber read_seq) {
+    const ReadOptions& read_options, SequenceNumber read_seq,
+    bool immutable_memtable) {
   if (read_options.ignore_range_deletions ||
       is_range_del_table_empty_.load(std::memory_order_relaxed)) {
     return nullptr;
   }
-  return NewRangeTombstoneIteratorInternal(read_options, read_seq);
+  return NewRangeTombstoneIteratorInternal(read_options, read_seq,
+                                           immutable_memtable);
 }
 
 FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
-    const ReadOptions& read_options, SequenceNumber read_seq) {
+    const ReadOptions& read_options, SequenceNumber read_seq,
+    bool immutable_memtable) {
+  if (immutable_memtable) {
+    // Note that caller should already have verified that
+    // !is_range_del_table_empty_
+    assert(IsFragmentedRangeTombstonesConstructed());
+    return new FragmentedRangeTombstoneIterator(
+        fragmented_range_tombstone_list_.get(), comparator_.comparator,
+        read_seq);
+  }
+
   auto* unfragmented_iter = new MemTableIterator(
       *this, read_options, nullptr /* arena */, true /* use_range_del_table */);
   auto fragmented_tombstone_list =

@@ -467,6 +480,21 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
   return fragmented_iter;
 }
 
+void MemTable::ConstructFragmentedRangeTombstones() {
+  assert(!IsFragmentedRangeTombstonesConstructed(false));
+  // There should be no concurrent Construction
+  if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+    auto* unfragmented_iter =
+        new MemTableIterator(*this, ReadOptions(), nullptr /* arena */,
+                             true /* use_range_del_table */);
+
+    fragmented_range_tombstone_list_ =
+        std::make_unique<FragmentedRangeTombstoneList>(
+            std::unique_ptr<InternalIterator>(unfragmented_iter),
+            comparator_.comparator);
+  }
+}
+
 port::RWMutex* MemTable::GetLock(const Slice& key) {
   return &locks_[GetSliceRangedNPHash(key, locks_.size())];
 }
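Taken together, these hunks cache the fragmentation work: the fragmented list is built at most once, right before a memtable becomes immutable, and readers of immutable memtables then share it. A hedged sketch of the intended call order (MakeImmutable is illustrative; only the MemTable and MemTableList methods shown come from this diff):

    // Illustrative caller, not from the diff: fragment once, then hand the
    // memtable to the immutable list, whose readers pass immutable_memtable=true.
    void MakeImmutable(MemTable* mem, MemTableList* list,
                       autovector<MemTable*>* to_delete) {
      // One-time fragmentation cost; skipped internally when the memtable
      // holds no range tombstones.
      mem->ConstructFragmentedRangeTombstones();
      // MemTableListVersion::GetFromList() asserts this has happened.
      list->Add(mem, to_delete);
    }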
@@ -759,12 +787,6 @@ static bool SaveValue(void* arg, const char* entry) {
              "Encounter unsupported blob value. Please open DB with "
              "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
        }
-      } else {
-        assert(type == kTypeWideColumnEntity);
-
-        // TODO: support wide-column entities
-        *(s->status) =
-            Status::NotSupported("Encountered unexpected wide-column entity");
      }
 
      if (!s->status->ok()) {

@@ -798,7 +820,17 @@ static bool SaveValue(void* arg, const char* entry) {
          merge_context->PushOperand(
              v, s->inplace_update_support == false /* operand_pinned */);
        } else if (s->value != nullptr) {
-          s->value->assign(v.data(), v.size());
+          if (type != kTypeWideColumnEntity) {
+            assert(type == kTypeValue || type == kTypeBlobIndex);
+            s->value->assign(v.data(), v.size());
+          } else {
+            Slice value;
+            *(s->status) =
+                WideColumnSerialization::GetValueOfDefaultColumn(v, value);
+            if (s->status->ok()) {
+              s->value->assign(value.data(), value.size());
+            }
+          }
        }
        if (s->inplace_update_support) {
          s->mem->GetLock(s->key->user_key())->ReadUnlock();
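With GetValueOfDefaultColumn wired into SaveValue(), a plain Get() on a key written as a wide-column entity now returns the value of its default (anonymous) column instead of failing with NotSupported. A hedged sketch against the public API, assuming the PutEntity/WideColumns interfaces present in this RocksDB snapshot (see include/rocksdb/wide_columns.h in the file list above):

    #include <cassert>
    #include <string>

    #include "rocksdb/db.h"
    #include "rocksdb/wide_columns.h"

    void Example(rocksdb::DB* db) {
      // An entity with a default (anonymous) column and one named column.
      rocksdb::WideColumns columns{
          {rocksdb::kDefaultWideColumnName, "default_value"},
          {"attr", "other_value"}};
      assert(db->PutEntity(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                           "key1", columns)
                 .ok());

      std::string value;
      assert(db->Get(rocksdb::ReadOptions(), db->DefaultColumnFamily(), "key1",
                     &value)
                 .ok());
      assert(value == "default_value");  // only the default column's value
    }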
@@ -885,7 +917,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value,
                    MergeContext* merge_context,
                    SequenceNumber* max_covering_tombstone_seq,
                    SequenceNumber* seq, const ReadOptions& read_opts,
-                   ReadCallback* callback, bool* is_blob_index, bool do_merge) {
+                   bool immutable_memtable, ReadCallback* callback,
+                   bool* is_blob_index, bool do_merge) {
   // The sequence number is updated synchronously in version_set.h
   if (IsEmpty()) {
     // Avoiding recording stats for speed.

@@ -895,7 +928,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value,
 
   std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
       NewRangeTombstoneIterator(read_opts,
-                                GetInternalKeySeqno(key.internal_key())));
+                                GetInternalKeySeqno(key.internal_key()),
+                                immutable_memtable));
   if (range_del_iter != nullptr) {
     *max_covering_tombstone_seq =
         std::max(*max_covering_tombstone_seq,

@@ -977,7 +1011,7 @@ void MemTable::GetFromTable(const LookupKey& key,
 }
 
 void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
-                        ReadCallback* callback) {
+                        ReadCallback* callback, bool immutable_memtable) {
   // The sequence number is updated synchronously in version_set.h
   if (IsEmpty()) {
     // Avoiding recording stats for speed.

@@ -1024,7 +1058,8 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
     if (!no_range_del) {
       std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
           NewRangeTombstoneIteratorInternal(
-              read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
+              read_options, GetInternalKeySeqno(iter->lkey->internal_key()),
+              immutable_memtable));
       iter->max_covering_tombstone_seq = std::max(
           iter->max_covering_tombstone_seq,
           range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
package/deps/rocksdb/rocksdb/db/memtable.h

@@ -202,8 +202,19 @@ class MemTable {
   // those allocated in arena.
   InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
 
+  // Returns an iterator that yields the range tombstones of the memtable.
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.
+  // @param immutable_memtable Whether this memtable is an immutable memtable.
+  // This information is not stored in memtable itself, so it needs to be
+  // specified by the caller. This flag is used internally to decide whether a
+  // cached fragmented range tombstone list can be returned. This cached version
+  // is constructed when a memtable becomes immutable. Setting the flag to false
+  // will always yield correct result, but may incur performance penalty as it
+  // always creates a new fragmented range tombstone list.
   FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
-      const ReadOptions& read_options, SequenceNumber read_seq);
+      const ReadOptions& read_options, SequenceNumber read_seq,
+      bool immutable_memtable);
 
   Status VerifyEncodedEntry(Slice encoded,
                             const ProtectionInfoKVOS64& kv_prot_info);
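A hedged usage sketch of the new parameter (the caller below is illustrative, not from the diff): false is always safe but re-fragments tombstones on every call, while true is only valid once the memtable is immutable and its cached list has been constructed.

    // Illustrative internal caller.
    void CollectRangeTombstones(MemTable* mem, const ReadOptions& read_opts,
                                SequenceNumber snapshot_seq) {
      std::unique_ptr<FragmentedRangeTombstoneIterator> iter(
          mem->NewRangeTombstoneIterator(read_opts, snapshot_seq,
                                         false /* immutable_memtable */));
      if (iter == nullptr) {
        return;  // no range tombstones, or reads ignore them
      }
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        // Each position covers [start_key(), end_key()) at seq().
      }
    }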
@@ -244,35 +255,44 @@ class MemTable {
   // If do_merge = false then any Merge Operands encountered for key are simply
   // stored in merge_context.operands_list and never actually merged to get a
   // final value. The raw Merge Operands are eventually returned to the user.
+  // @param immutable_memtable Whether this memtable is immutable. Used
+  // internally by NewRangeTombstoneIterator(). See comment above
+  // NewRangeTombstoneIterator() for more detail.
   bool Get(const LookupKey& key, std::string* value, Status* s,
            MergeContext* merge_context,
            SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
-           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
-           bool* is_blob_index = nullptr, bool do_merge = true) {
+           const ReadOptions& read_opts, bool immutable_memtable,
+           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+           bool do_merge = true) {
     return Get(key, value, /*timestamp=*/nullptr, s, merge_context,
-               max_covering_tombstone_seq, seq, read_opts, callback,
-               is_blob_index, do_merge);
+               max_covering_tombstone_seq, seq, read_opts, immutable_memtable,
+               callback, is_blob_index, do_merge);
   }
 
   bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
            Status* s, MergeContext* merge_context,
            SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
-           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
-           bool* is_blob_index = nullptr, bool do_merge = true);
+           const ReadOptions& read_opts, bool immutable_memtable,
+           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+           bool do_merge = true);
 
   bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
            Status* s, MergeContext* merge_context,
            SequenceNumber* max_covering_tombstone_seq,
-           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
-           bool* is_blob_index = nullptr, bool do_merge = true) {
+           const ReadOptions& read_opts, bool immutable_memtable,
+           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+           bool do_merge = true) {
     SequenceNumber seq;
     return Get(key, value, timestamp, s, merge_context,
-               max_covering_tombstone_seq, &seq, read_opts, callback,
-               is_blob_index, do_merge);
+               max_covering_tombstone_seq, &seq, read_opts, immutable_memtable,
+               callback, is_blob_index, do_merge);
   }
 
+  // @param immutable_memtable Whether this memtable is immutable. Used
+  // internally by NewRangeTombstoneIterator(). See comment above
+  // NewRangeTombstoneIterator() for more detail.
   void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
-                ReadCallback* callback);
+                ReadCallback* callback, bool immutable_memtable);
 
   // If `key` exists in current memtable with type value_type and the existing
   // value is at least as large as the new value, updates it in-place. Otherwise

@@ -502,6 +522,23 @@ class MemTable {
   // Returns a heuristic flush decision
   bool ShouldFlushNow();
 
+  void ConstructFragmentedRangeTombstones();
+
+  // Returns whether a fragmented range tombstone list is already constructed
+  // for this memtable. It should be constructed right before a memtable is
+  // added to an immutable memtable list. Note that if a memtable does not have
+  // any range tombstone, then no range tombstone list will ever be constructed.
+  // @param allow_empty Specifies whether a memtable with no range tombstone is
+  // considered to have its fragmented range tombstone list constructed.
+  bool IsFragmentedRangeTombstonesConstructed(bool allow_empty = true) const {
+    if (allow_empty) {
+      return fragmented_range_tombstone_list_.get() != nullptr ||
+             is_range_del_table_empty_;
+    } else {
+      return fragmented_range_tombstone_list_.get() != nullptr;
+    }
+  }
+
  private:
  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
 

@@ -601,9 +638,18 @@ class MemTable {
                     MergeContext* merge_context, SequenceNumber* seq,
                     bool* found_final_value, bool* merge_in_progress);
 
-  // Always returns non-null and assumes certain pre-checks are done
+  // Always returns non-null and assumes certain pre-checks (e.g.,
+  // is_range_del_table_empty_) are done. This is only valid during the lifetime
+  // of the underlying memtable.
   FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
-      const ReadOptions& read_options, SequenceNumber read_seq);
+      const ReadOptions& read_options, SequenceNumber read_seq,
+      bool immutable_memtable);
+
+  // The fragmented range tombstones of this memtable.
+  // This is constructed when this memtable becomes immutable
+  // if !is_range_del_table_empty_.
+  std::unique_ptr<FragmentedRangeTombstoneList>
+      fragmented_range_tombstone_list_;
 };
 
 extern const char* EncodeKey(std::string* scratch, const Slice& target);
package/deps/rocksdb/rocksdb/db/memtable_list.cc

@@ -119,7 +119,8 @@ void MemTableListVersion::MultiGet(const ReadOptions& read_options,
                                    MultiGetRange* range,
                                    ReadCallback* callback) {
   for (auto memtable : memlist_) {
-    memtable->MultiGet(read_options, range, callback);
+    memtable->MultiGet(read_options, range, callback,
+                       true /* immutable_memtable */);
     if (range->empty()) {
       return;
     }

@@ -130,9 +131,10 @@ bool MemTableListVersion::GetMergeOperands(
     const LookupKey& key, Status* s, MergeContext* merge_context,
     SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) {
   for (MemTable* memtable : memlist_) {
-    bool done = memtable->Get(key, /*value*/ nullptr, /*timestamp*/ nullptr, s,
-                              merge_context, max_covering_tombstone_seq,
-                              read_opts, nullptr, nullptr, false);
+    bool done =
+        memtable->Get(key, /*value*/ nullptr, /*timestamp*/ nullptr, s,
+                      merge_context, max_covering_tombstone_seq, read_opts,
+                      true /* immutable_memtable */, nullptr, nullptr, false);
     if (done) {
       return true;
     }

@@ -157,11 +159,13 @@ bool MemTableListVersion::GetFromList(
   *seq = kMaxSequenceNumber;
 
   for (auto& memtable : *list) {
+    assert(memtable->IsFragmentedRangeTombstonesConstructed());
     SequenceNumber current_seq = kMaxSequenceNumber;
 
-    bool done = memtable->Get(key, value, timestamp, s, merge_context,
-                              max_covering_tombstone_seq, &current_seq,
-                              read_opts, callback, is_blob_index);
+    bool done =
+        memtable->Get(key, value, timestamp, s, merge_context,
+                      max_covering_tombstone_seq, &current_seq, read_opts,
+                      true /* immutable_memtable */, callback, is_blob_index);
     if (*seq == kMaxSequenceNumber) {
       // Store the most recent sequence number of any operation on this key.
       // Since we only care about the most recent change, we only need to

@@ -194,8 +198,10 @@ Status MemTableListVersion::AddRangeTombstoneIterators(
           ? read_opts.snapshot->GetSequenceNumber()
           : kMaxSequenceNumber;
   for (auto& m : memlist_) {
+    assert(m->IsFragmentedRangeTombstonesConstructed());
     std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
-        m->NewRangeTombstoneIterator(read_opts, read_seq));
+        m->NewRangeTombstoneIterator(read_opts, read_seq,
+                                     true /* immutable_memtable */));
     range_del_agg->AddTombstones(std::move(range_del_iter));
   }
   return Status::OK();
package/deps/rocksdb/rocksdb/db/memtable_list_test.cc

@@ -267,22 +267,25 @@ TEST_F(MemTableListTest, GetTest) {
   // Fetch the newly written keys
   merge_context.Clear();
   found = mem->Get(LookupKey("key1", seq), &value,
-                   /*timestamp*/nullptr, &s, &merge_context,
-                   &max_covering_tombstone_seq, ReadOptions());
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
   ASSERT_TRUE(s.ok() && found);
   ASSERT_EQ(value, "value1");
 
   merge_context.Clear();
   found = mem->Get(LookupKey("key1", 2), &value,
-                   /*timestamp*/nullptr, &s, &merge_context,
-                   &max_covering_tombstone_seq, ReadOptions());
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
   // MemTable found out that this key is *not* found (at this sequence#)
   ASSERT_TRUE(found && s.IsNotFound());
 
   merge_context.Clear();
   found = mem->Get(LookupKey("key2", seq), &value,
-                   /*timestamp*/nullptr, &s, &merge_context,
-                   &max_covering_tombstone_seq, ReadOptions());
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
   ASSERT_TRUE(s.ok() && found);
   ASSERT_EQ(value, "value2.2");
 

@@ -290,6 +293,9 @@ TEST_F(MemTableListTest, GetTest) {
   ASSERT_EQ(1, mem->num_deletes());
 
   // Add memtable to list
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem->ConstructFragmentedRangeTombstones();
   list.Add(mem, &to_delete);
 
   SequenceNumber saved_seq = seq;

@@ -306,6 +312,9 @@ TEST_F(MemTableListTest, GetTest) {
                           nullptr /* kv_prot_info */));
 
   // Add second memtable to list
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem2->ConstructFragmentedRangeTombstones();
   list.Add(mem2, &to_delete);
 
   // Fetch keys via MemTableList

@@ -388,19 +397,24 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
   // Fetch the newly written keys
   merge_context.Clear();
   found = mem->Get(LookupKey("key1", seq), &value,
-                   /*timestamp*/nullptr, &s, &merge_context,
-                   &max_covering_tombstone_seq, ReadOptions());
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
   // MemTable found out that this key is *not* found (at this sequence#)
   ASSERT_TRUE(found && s.IsNotFound());
 
   merge_context.Clear();
   found = mem->Get(LookupKey("key2", seq), &value,
-                   /*timestamp*/nullptr, &s, &merge_context,
-                   &max_covering_tombstone_seq, ReadOptions());
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
   ASSERT_TRUE(s.ok() && found);
   ASSERT_EQ(value, "value2.2");
 
   // Add memtable to list
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem->ConstructFragmentedRangeTombstones();
   list.Add(mem, &to_delete);
   ASSERT_EQ(0, to_delete.size());
 

@@ -472,6 +486,9 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
                            nullptr /* kv_prot_info */));
 
   // Add second memtable to list
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem2->ConstructFragmentedRangeTombstones();
   list.Add(mem2, &to_delete);
   ASSERT_EQ(0, to_delete.size());
 

@@ -493,6 +510,9 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
   MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb3,
                                 kMaxSequenceNumber, 0 /* column_family_id */);
   mem3->Ref();
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem3->ConstructFragmentedRangeTombstones();
   list.Add(mem3, &to_delete);
   ASSERT_EQ(1, list.NumNotFlushed());
   ASSERT_EQ(1, list.NumFlushed());
package/deps/rocksdb/rocksdb/db/perf_context_test.cc

@@ -69,7 +69,7 @@ std::shared_ptr<DB> OpenDb(bool read_only = false) {
 class PerfContextTest : public testing::Test {};
 
 TEST_F(PerfContextTest, SeekIntoDeletion) {
-  DestroyDB(kDbName, Options());
+  ASSERT_OK(DestroyDB(kDbName, Options()));
   auto db = OpenDb();
   WriteOptions write_options;
   ReadOptions read_options;

@@ -205,7 +205,7 @@ TEST_F(PerfContextTest, StopWatchOverhead) {
 }
 
 void ProfileQueries(bool enabled_time = false) {
-  DestroyDB(kDbName, Options());  // Start this test with a fresh DB
+  ASSERT_OK(DestroyDB(kDbName, Options()));  // Start this test with a fresh DB
 
   auto db = OpenDb();
 

@@ -518,7 +518,7 @@ TEST_F(PerfContextTest, KeyComparisonCount) {
 // starts to become linear to the input size.
 
 TEST_F(PerfContextTest, SeekKeyComparison) {
-  DestroyDB(kDbName, Options());
+  ASSERT_OK(DestroyDB(kDbName, Options()));
   auto db = OpenDb();
   WriteOptions write_options;
   ReadOptions read_options;

@@ -652,7 +652,7 @@ TEST_F(PerfContextTest, ToString) {
 }
 
 TEST_F(PerfContextTest, MergeOperatorTime) {
-  DestroyDB(kDbName, Options());
+  ASSERT_OK(DestroyDB(kDbName, Options()));
   DB* db;
   Options options;
   options.create_if_missing = true;

@@ -833,7 +833,7 @@ TEST_F(PerfContextTest, CPUTimer) {
     return;
   }
 
-  DestroyDB(kDbName, Options());
+  ASSERT_OK(DestroyDB(kDbName, Options()));
   auto db = OpenDb();
   WriteOptions write_options;
   ReadOptions read_options;
package/deps/rocksdb/rocksdb/db/pinned_iterators_manager.h

@@ -25,6 +25,11 @@ class PinnedIteratorsManager : public Cleanable {
     }
   }
 
+  // Move constructor and move assignment is allowed.
+  PinnedIteratorsManager(PinnedIteratorsManager&& other) noexcept = default;
+  PinnedIteratorsManager& operator=(PinnedIteratorsManager&& other) noexcept =
+      default;
+
   // Enable Iterators pinning
   void StartPinning() {
     assert(pinning_enabled == false);
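The defaulted moves matter for composition: any type that holds a PinnedIteratorsManager by value becomes movable without copying or re-pinning state. A small hedged sketch (Owner is illustrative, not from the diff):

    #include <utility>
    #include <vector>

    struct Owner {
      PinnedIteratorsManager pinned_iters_mgr;
    };

    void Example() {
      std::vector<Owner> owners;
      Owner o;
      owners.push_back(std::move(o));  // uses the defaulted move constructor
    }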
package/deps/rocksdb/rocksdb/db/repair.cc

@@ -431,8 +431,8 @@ class Repairer {
     auto write_hint = cfd->CalculateSSTWriteHint(0);
     std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
         range_del_iters;
-    auto range_del_iter =
-        mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+    auto range_del_iter = mem->NewRangeTombstoneIterator(
+        ro, kMaxSequenceNumber, false /* immutable_memtable */);
     if (range_del_iter != nullptr) {
       range_del_iters.emplace_back(range_del_iter);
     }

@@ -651,7 +651,6 @@ class Repairer {
         table->meta.temperature, table->meta.oldest_blob_file_number,
         table->meta.oldest_ancester_time, table->meta.file_creation_time,
         table->meta.file_checksum, table->meta.file_checksum_func_name,
-        table->meta.min_timestamp, table->meta.max_timestamp,
         table->meta.unique_id);
   }
   assert(next_file_number_ > 0);
package/deps/rocksdb/rocksdb/db/seqno_time_test.cc

@@ -232,13 +232,9 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
     }
     ASSERT_OK(Flush());
   }
-  // TODO(zjay): all data become cold because of level 5 (penultimate level) is
-  // the bottommost level, which converts the data to cold. PerKeyPlacement is
-  // for the last level (level 6). Will be fixed by change the
-  // bottommost_temperature to the last_level_temperature
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
-  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
-  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
   // Compact the files to the last level which should split the hot/cold data
   MoveFilesToLevel(6);

@@ -249,7 +245,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
   // the first a few key should be cold
   AssertKetTemperature(20, Temperature::kCold);
 
-  // Wait some time, each it wait, the cold data is increasing and hot data is
+  // Wait some time, with each wait, the cold data is increasing and hot data is
   // decreasing
   for (int i = 0; i < 30; i++) {
     dbfull()->TEST_WaitForPeridicWorkerRun(