@nxtedition/rocksdb 11.0.3 → 11.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/binding.cc +147 -125
  2. package/deps/rocksdb/rocksdb/db/column_family_test.cc +15 -7
  3. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +4 -2
  4. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -4
  5. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +11 -7
  6. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +17 -11
  7. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +15 -0
  8. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +155 -0
  9. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +564 -461
  10. package/deps/rocksdb/rocksdb/db/db_follower_test.cc +8 -4
  11. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +40 -24
  12. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +8 -1
  13. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +7 -4
  14. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  15. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -1
  16. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +19 -1
  17. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +20 -16
  18. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +27 -0
  19. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +10 -2
  20. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +85 -0
  21. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +55 -2
  22. package/deps/rocksdb/rocksdb/db/db_test2.cc +231 -0
  23. package/deps/rocksdb/rocksdb/db/db_test_util.cc +5 -0
  24. package/deps/rocksdb/rocksdb/db/db_test_util.h +10 -1
  25. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +0 -1
  26. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +175 -1
  27. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +64 -0
  28. package/deps/rocksdb/rocksdb/db/dbformat.h +5 -6
  29. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +8 -8
  30. package/deps/rocksdb/rocksdb/db/experimental.cc +3 -2
  31. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +2 -4
  32. package/deps/rocksdb/rocksdb/db/flush_job.cc +7 -2
  33. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +4 -2
  34. package/deps/rocksdb/rocksdb/db/listener_test.cc +5 -5
  35. package/deps/rocksdb/rocksdb/db/log_writer.cc +12 -3
  36. package/deps/rocksdb/rocksdb/db/memtable.cc +83 -23
  37. package/deps/rocksdb/rocksdb/db/memtable.h +11 -3
  38. package/deps/rocksdb/rocksdb/db/memtable_list.cc +7 -5
  39. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +21 -0
  40. package/deps/rocksdb/rocksdb/db/version_builder.cc +462 -33
  41. package/deps/rocksdb/rocksdb/db/version_builder.h +70 -23
  42. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +95 -207
  43. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +54 -35
  44. package/deps/rocksdb/rocksdb/db/version_set.cc +13 -11
  45. package/deps/rocksdb/rocksdb/db/version_set_test.cc +313 -59
  46. package/deps/rocksdb/rocksdb/db/write_batch.cc +124 -64
  47. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +2 -3
  48. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h +1 -1
  49. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +4 -1
  50. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +9 -0
  51. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +4 -32
  52. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +7 -3
  53. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +60 -172
  54. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +57 -2
  55. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +23 -15
  56. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +2 -3
  57. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.cc +1 -1
  58. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +4 -1
  59. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +200 -92
  60. package/deps/rocksdb/rocksdb/env/file_system.cc +3 -3
  61. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +124 -23
  62. package/deps/rocksdb/rocksdb/file/delete_scheduler.h +61 -8
  63. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +141 -2
  64. package/deps/rocksdb/rocksdb/file/file_util.cc +17 -2
  65. package/deps/rocksdb/rocksdb/file/file_util.h +10 -0
  66. package/deps/rocksdb/rocksdb/file/filename.cc +11 -3
  67. package/deps/rocksdb/rocksdb/file/filename.h +2 -1
  68. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +18 -0
  69. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +27 -4
  70. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +8 -1
  71. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +8 -13
  72. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +4 -0
  73. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +5 -0
  74. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -2
  75. package/deps/rocksdb/rocksdb/include/rocksdb/filter_policy.h +2 -1
  76. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +34 -0
  77. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +25 -1
  78. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  79. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +27 -9
  80. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +2 -0
  81. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +12 -0
  82. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +21 -0
  83. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  84. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +29 -1
  85. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +102 -33
  86. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +46 -3
  87. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +4 -0
  88. package/deps/rocksdb/rocksdb/options/cf_options.cc +6 -0
  89. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  90. package/deps/rocksdb/rocksdb/options/db_options.cc +15 -1
  91. package/deps/rocksdb/rocksdb/options/db_options.h +2 -0
  92. package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -0
  93. package/deps/rocksdb/rocksdb/options/options_parser.cc +3 -2
  94. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +9 -2
  95. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +75 -35
  96. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +6 -0
  97. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +4 -0
  98. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +8 -1
  99. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +40 -15
  100. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +98 -17
  101. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +14 -2
  102. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +21 -91
  103. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +13 -21
  104. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +14 -5
  105. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +62 -53
  106. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +60 -38
  107. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +175 -78
  108. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +65 -36
  109. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +25 -15
  110. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +13 -1
  111. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +18 -4
  112. package/deps/rocksdb/rocksdb/table/meta_blocks.h +4 -0
  113. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +11 -0
  114. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +2 -2
  115. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +47 -18
  116. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.h +1 -2
  117. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +95 -0
  118. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +26 -15
  119. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +62 -19
  120. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +73 -34
  121. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -0
  122. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +10 -3
  123. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +2 -1
  124. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +8 -5
  125. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +7 -4
  126. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +225 -0
  127. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +2 -1
  128. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +17 -0
  129. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +5 -2
  130. package/index.js +5 -17
  131. package/iterator.js +1 -1
  132. package/package.json +1 -1
  133. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  134. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -23,17 +23,17 @@ static std::string IKey(const std::string& user_key, uint64_t seq,
23
23
  }
24
24
 
25
25
  static std::string Shorten(const std::string& s, const std::string& l) {
26
- std::string result = s;
27
- ShortenedIndexBuilder::FindShortestInternalKeySeparator(*BytewiseComparator(),
28
- &result, l);
29
- return result;
26
+ std::string scratch;
27
+ return ShortenedIndexBuilder::FindShortestInternalKeySeparator(
28
+ *BytewiseComparator(), s, l, &scratch)
29
+ .ToString();
30
30
  }
31
31
 
32
32
  static std::string ShortSuccessor(const std::string& s) {
33
- std::string result = s;
34
- ShortenedIndexBuilder::FindShortInternalKeySuccessor(*BytewiseComparator(),
35
- &result);
36
- return result;
33
+ std::string scratch;
34
+ return ShortenedIndexBuilder::FindShortInternalKeySuccessor(
35
+ *BytewiseComparator(), s, &scratch)
36
+ .ToString();
37
37
  }
38
38
 
39
39
  static void TestKey(const std::string& key, uint64_t seq, ValueType vt) {
@@ -711,7 +711,7 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager {
711
711
  uint64_t /*file_size*/) override {
712
712
  // FIXME later: `key` might contain user timestamp. That should be
713
713
  // exposed properly in a future update to TablePropertiesCollector
714
- KeySegmentsExtractor::Result extracted;
714
+ extracted.Reset();
715
715
  if (extractor) {
716
716
  extractor->Extract(key, KeySegmentsExtractor::kFullUserKey, &extracted);
717
717
  if (UNLIKELY(extracted.category >=
@@ -750,7 +750,7 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager {
750
750
  }
751
751
  }
752
752
  prev_key.assign(key.data(), key.size());
753
- prev_extracted = std::move(extracted);
753
+ std::swap(prev_extracted, extracted);
754
754
  first_key = false;
755
755
  return Status::OK();
756
756
  }
@@ -859,6 +859,7 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager {
859
859
  std::vector<std::shared_ptr<SstQueryFilterBuilder>> builders;
860
860
  bool first_key = true;
861
861
  std::string prev_key;
862
+ KeySegmentsExtractor::Result extracted;
862
863
  KeySegmentsExtractor::Result prev_extracted;
863
864
  KeySegmentsExtractor::KeyCategorySet categories_seen;
864
865
  };
@@ -674,10 +674,8 @@ class SstFileWriterCollector : public TablePropertiesCollector {
674
674
 
675
675
  Status Finish(UserCollectedProperties* properties) override {
676
676
  std::string count = std::to_string(count_);
677
- *properties = UserCollectedProperties{
678
- {prefix_ + "_SstFileWriterCollector", "YES"},
679
- {prefix_ + "_Count", count},
680
- };
677
+ properties->insert({prefix_ + "_SstFileWriterCollector", "YES"});
678
+ properties->insert({prefix_ + "_Count", count});
681
679
  return Status::OK();
682
680
  }
683
681
 
@@ -235,7 +235,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
235
235
 
236
236
  AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN);
237
237
  if (mems_.empty()) {
238
- ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
238
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] No memtable to flush",
239
239
  cfd_->GetName().c_str());
240
240
  return Status::OK();
241
241
  }
@@ -1017,10 +1017,15 @@ Status FlushJob::WriteLevel0Table() {
1017
1017
  ROCKS_LOG_BUFFER(log_buffer_,
1018
1018
  "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
1019
1019
  " bytes %s"
1020
- "%s",
1020
+ " %s"
1021
+ " %s",
1021
1022
  cfd_->GetName().c_str(), job_context_->job_id,
1022
1023
  meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
1023
1024
  s.ToString().c_str(),
1025
+ s.ok() && meta_.fd.GetFileSize() == 0
1026
+ ? "It's an empty SST file from a successful flush so "
1027
+ "won't be kept in the DB"
1028
+ : "",
1024
1029
  meta_.marked_for_compaction ? " (needs compaction)" : "");
1025
1030
 
1026
1031
  if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
@@ -68,7 +68,8 @@ class FlushJobTestBase : public testing::Test {
68
68
  }
69
69
 
70
70
  void NewDB() {
71
- ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
71
+ ASSERT_OK(
72
+ SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
72
73
  VersionEdit new_db;
73
74
 
74
75
  new_db.SetLogNumber(0);
@@ -114,7 +115,8 @@ class FlushJobTestBase : public testing::Test {
114
115
  }
115
116
  ASSERT_OK(s);
116
117
  // Make "CURRENT" file that points to the new manifest file.
117
- s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
118
+ s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
119
+ Temperature::kUnknown, nullptr);
118
120
  ASSERT_OK(s);
119
121
  }
120
122
 
@@ -354,13 +354,13 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
354
354
  }
355
355
 
356
356
  TEST_F(EventListenerTest, MultiCF) {
357
- Options options;
358
- options.env = CurrentOptions().env;
359
- options.write_buffer_size = k110KB;
357
+ for (auto atomic_flush : {false, true}) {
358
+ Options options;
359
+ options.env = CurrentOptions().env;
360
+ options.write_buffer_size = k110KB;
360
361
  #ifdef ROCKSDB_USING_THREAD_STATUS
361
- options.enable_thread_tracking = true;
362
+ options.enable_thread_tracking = true;
362
363
  #endif // ROCKSDB_USING_THREAD_STATUS
363
- for (auto atomic_flush : {false, true}) {
364
364
  options.atomic_flush = atomic_flush;
365
365
  options.create_if_missing = true;
366
366
  DestroyAndReopen(options);
@@ -55,7 +55,10 @@ IOStatus Writer::WriteBuffer(const WriteOptions& write_options) {
55
55
  if (dest_->seen_error()) {
56
56
  #ifndef NDEBUG
57
57
  if (dest_->seen_injected_error()) {
58
- return IOStatus::IOError("Seen injected error. Skip writing buffer.");
58
+ std::stringstream msg;
59
+ msg << "Seen " << FaultInjectionTestFS::kInjected
60
+ << " error. Skip writing buffer.";
61
+ return IOStatus::IOError(msg.str());
59
62
  }
60
63
  #endif // NDEBUG
61
64
  return IOStatus::IOError("Seen error. Skip writing buffer.");
@@ -93,7 +96,10 @@ IOStatus Writer::AddRecord(const WriteOptions& write_options,
93
96
  if (dest_->seen_error()) {
94
97
  #ifndef NDEBUG
95
98
  if (dest_->seen_injected_error()) {
96
- return IOStatus::IOError("Seen injected error. Skip writing buffer.");
99
+ std::stringstream msg;
100
+ msg << "Seen " << FaultInjectionTestFS::kInjected
101
+ << " error. Skip writing buffer.";
102
+ return IOStatus::IOError(msg.str());
97
103
  }
98
104
  #endif // NDEBUG
99
105
  return IOStatus::IOError("Seen error. Skip writing buffer.");
@@ -205,7 +211,10 @@ IOStatus Writer::AddCompressionTypeRecord(const WriteOptions& write_options) {
205
211
  if (dest_->seen_error()) {
206
212
  #ifndef NDEBUG
207
213
  if (dest_->seen_injected_error()) {
208
- return IOStatus::IOError("Seen injected error. Skip writing buffer.");
214
+ std::stringstream msg;
215
+ msg << "Seen " << FaultInjectionTestFS::kInjected
216
+ << " error. Skip writing buffer.";
217
+ return IOStatus::IOError(msg.str());
209
218
  }
210
219
  #endif // NDEBUG
211
220
  return IOStatus::IOError("Seen error. Skip writing buffer.");
@@ -67,9 +67,10 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
67
67
  statistics(ioptions.stats),
68
68
  merge_operator(ioptions.merge_operator.get()),
69
69
  info_log(ioptions.logger),
70
- allow_data_in_errors(ioptions.allow_data_in_errors),
71
70
  protection_bytes_per_key(
72
- mutable_cf_options.memtable_protection_bytes_per_key) {}
71
+ mutable_cf_options.memtable_protection_bytes_per_key),
72
+ allow_data_in_errors(ioptions.allow_data_in_errors),
73
+ paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}
73
74
 
74
75
  MemTable::MemTable(const InternalKeyComparator& cmp,
75
76
  const ImmutableOptions& ioptions,
@@ -370,15 +371,17 @@ class MemTableIterator : public InternalIterator {
370
371
  : bloom_(nullptr),
371
372
  prefix_extractor_(mem.prefix_extractor_),
372
373
  comparator_(mem.comparator_),
373
- valid_(false),
374
374
  seqno_to_time_mapping_(seqno_to_time_mapping),
375
- arena_mode_(arena != nullptr),
376
- value_pinned_(
377
- !mem.GetImmutableMemTableOptions()->inplace_update_support),
378
- protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
379
375
  status_(Status::OK()),
380
376
  logger_(mem.moptions_.info_log),
381
- ts_sz_(mem.ts_sz_) {
377
+ ts_sz_(mem.ts_sz_),
378
+ protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
379
+ valid_(false),
380
+ value_pinned_(
381
+ !mem.GetImmutableMemTableOptions()->inplace_update_support),
382
+ arena_mode_(arena != nullptr),
383
+ paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
384
+ allow_data_in_error(mem.moptions_.allow_data_in_errors) {
382
385
  if (use_range_del_table) {
383
386
  iter_ = mem.range_del_table_->GetIterator(arena);
384
387
  } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
@@ -406,6 +409,7 @@ class MemTableIterator : public InternalIterator {
406
409
  } else {
407
410
  delete iter_;
408
411
  }
412
+ status_.PermitUncheckedError();
409
413
  }
410
414
 
411
415
  #ifndef NDEBUG
@@ -415,10 +419,16 @@ class MemTableIterator : public InternalIterator {
415
419
  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
416
420
  #endif
417
421
 
418
- bool Valid() const override { return valid_ && status_.ok(); }
422
+ bool Valid() const override {
423
+ // If inner iter_ is not valid, then this iter should also not be valid.
424
+ assert(iter_->Valid() || !(valid_ && status_.ok()));
425
+ return valid_ && status_.ok();
426
+ }
427
+
419
428
  void Seek(const Slice& k) override {
420
429
  PERF_TIMER_GUARD(seek_on_memtable_time);
421
430
  PERF_COUNTER_ADD(seek_on_memtable_count, 1);
431
+ status_ = Status::OK();
422
432
  if (bloom_) {
423
433
  // iterator should only use prefix bloom filter
424
434
  Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
@@ -433,13 +443,18 @@ class MemTableIterator : public InternalIterator {
433
443
  }
434
444
  }
435
445
  }
436
- iter_->Seek(k, nullptr);
446
+ if (paranoid_memory_checks_) {
447
+ status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
448
+ } else {
449
+ iter_->Seek(k, nullptr);
450
+ }
437
451
  valid_ = iter_->Valid();
438
452
  VerifyEntryChecksum();
439
453
  }
440
454
  void SeekForPrev(const Slice& k) override {
441
455
  PERF_TIMER_GUARD(seek_on_memtable_time);
442
456
  PERF_COUNTER_ADD(seek_on_memtable_count, 1);
457
+ status_ = Status::OK();
443
458
  if (bloom_) {
444
459
  Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
445
460
  if (prefix_extractor_->InDomain(user_k_without_ts)) {
@@ -453,7 +468,11 @@ class MemTableIterator : public InternalIterator {
453
468
  }
454
469
  }
455
470
  }
456
- iter_->Seek(k, nullptr);
471
+ if (paranoid_memory_checks_) {
472
+ status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
473
+ } else {
474
+ iter_->Seek(k, nullptr);
475
+ }
457
476
  valid_ = iter_->Valid();
458
477
  VerifyEntryChecksum();
459
478
  if (!Valid() && status().ok()) {
@@ -464,11 +483,13 @@ class MemTableIterator : public InternalIterator {
464
483
  }
465
484
  }
466
485
  void SeekToFirst() override {
486
+ status_ = Status::OK();
467
487
  iter_->SeekToFirst();
468
488
  valid_ = iter_->Valid();
469
489
  VerifyEntryChecksum();
470
490
  }
471
491
  void SeekToLast() override {
492
+ status_ = Status::OK();
472
493
  iter_->SeekToLast();
473
494
  valid_ = iter_->Valid();
474
495
  VerifyEntryChecksum();
@@ -476,8 +497,12 @@ class MemTableIterator : public InternalIterator {
476
497
  void Next() override {
477
498
  PERF_COUNTER_ADD(next_on_memtable_count, 1);
478
499
  assert(Valid());
479
- iter_->Next();
480
- TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
500
+ if (paranoid_memory_checks_) {
501
+ status_ = iter_->NextAndValidate(allow_data_in_error);
502
+ } else {
503
+ iter_->Next();
504
+ TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
505
+ }
481
506
  valid_ = iter_->Valid();
482
507
  VerifyEntryChecksum();
483
508
  }
@@ -494,7 +519,11 @@ class MemTableIterator : public InternalIterator {
494
519
  void Prev() override {
495
520
  PERF_COUNTER_ADD(prev_on_memtable_count, 1);
496
521
  assert(Valid());
497
- iter_->Prev();
522
+ if (paranoid_memory_checks_) {
523
+ status_ = iter_->PrevAndValidate(allow_data_in_error);
524
+ } else {
525
+ iter_->Prev();
526
+ }
498
527
  valid_ = iter_->Valid();
499
528
  VerifyEntryChecksum();
500
529
  }
@@ -540,15 +569,17 @@ class MemTableIterator : public InternalIterator {
540
569
  const SliceTransform* const prefix_extractor_;
541
570
  const MemTable::KeyComparator comparator_;
542
571
  MemTableRep::Iterator* iter_;
543
- bool valid_;
544
572
  // The seqno to time mapping is owned by the SuperVersion.
545
573
  UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_;
546
- bool arena_mode_;
547
- bool value_pinned_;
548
- uint32_t protection_bytes_per_key_;
549
574
  Status status_;
550
575
  Logger* logger_;
551
576
  size_t ts_sz_;
577
+ uint32_t protection_bytes_per_key_;
578
+ bool valid_;
579
+ bool value_pinned_;
580
+ bool arena_mode_;
581
+ const bool paranoid_memory_checks_;
582
+ const bool allow_data_in_error;
552
583
 
553
584
  void VerifyEntryChecksum() {
554
585
  if (protection_bytes_per_key_ > 0 && Valid()) {
@@ -933,6 +964,8 @@ static bool SaveValue(void* arg, const char* entry) {
933
964
  Saver* s = static_cast<Saver*>(arg);
934
965
  assert(s != nullptr);
935
966
  assert(!s->value || !s->columns);
967
+ assert(!*(s->found_final_value));
968
+ assert(s->status->ok() || s->status->IsMergeInProgress());
936
969
 
937
970
  MergeContext* merge_context = s->merge_context;
938
971
  SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
@@ -966,6 +999,7 @@ static bool SaveValue(void* arg, const char* entry) {
966
999
  *(s->status) = MemTable::VerifyEntryChecksum(
967
1000
  entry, s->protection_bytes_per_key, s->allow_data_in_errors);
968
1001
  if (!s->status->ok()) {
1002
+ *(s->found_final_value) = true;
969
1003
  ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState());
970
1004
  // Memtable entry corrupted
971
1005
  return false;
@@ -1231,6 +1265,7 @@ static bool SaveValue(void* arg, const char* entry) {
1231
1265
  ". ");
1232
1266
  msg.append("seq: " + std::to_string(seq) + ".");
1233
1267
  }
1268
+ *(s->found_final_value) = true;
1234
1269
  *(s->status) = Status::Corruption(msg.c_str());
1235
1270
  return false;
1236
1271
  }
@@ -1310,8 +1345,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value,
1310
1345
 
1311
1346
  // No change to value, since we have not yet found a Put/Delete
1312
1347
  // Propagate corruption error
1313
- if (!found_final_value && merge_in_progress && !s->IsCorruption()) {
1314
- *s = Status::MergeInProgress();
1348
+ if (!found_final_value && merge_in_progress) {
1349
+ if (s->ok()) {
1350
+ *s = Status::MergeInProgress();
1351
+ } else {
1352
+ assert(s->IsMergeInProgress());
1353
+ }
1315
1354
  }
1316
1355
  PERF_COUNTER_ADD(get_from_memtable_count, 1);
1317
1356
  return found_final_value;
@@ -1347,7 +1386,19 @@ void MemTable::GetFromTable(const LookupKey& key,
1347
1386
  saver.do_merge = do_merge;
1348
1387
  saver.allow_data_in_errors = moptions_.allow_data_in_errors;
1349
1388
  saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
1350
- table_->Get(key, &saver, SaveValue);
1389
+
1390
+ if (!moptions_.paranoid_memory_checks) {
1391
+ table_->Get(key, &saver, SaveValue);
1392
+ } else {
1393
+ Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
1394
+ moptions_.allow_data_in_errors);
1395
+ if (check_s.IsCorruption()) {
1396
+ *(saver.status) = check_s;
1397
+ // Should stop searching the LSM.
1398
+ *(saver.found_final_value) = true;
1399
+ }
1400
+ }
1401
+ assert(s->ok() || s->IsMergeInProgress() || *found_final_value);
1351
1402
  *seq = saver.seq;
1352
1403
  }
1353
1404
 
@@ -1421,10 +1472,19 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
1421
1472
  &found_final_value, &merge_in_progress);
1422
1473
 
1423
1474
  if (!found_final_value && merge_in_progress) {
1424
- *(iter->s) = Status::MergeInProgress();
1475
+ if (iter->s->ok()) {
1476
+ *(iter->s) = Status::MergeInProgress();
1477
+ } else {
1478
+ assert(iter->s->IsMergeInProgress());
1479
+ }
1425
1480
  }
1426
1481
 
1427
- if (found_final_value) {
1482
+ if (found_final_value ||
1483
+ (!iter->s->ok() && !iter->s->IsMergeInProgress())) {
1484
+ // `found_final_value` should be set if an error/corruption occurs.
1485
+ // The check on iter->s is just there in case GetFromTable() did not
1486
+ // set `found_final_value` properly.
1487
+ assert(found_final_value);
1428
1488
  if (iter->value) {
1429
1489
  iter->value->PinSelf();
1430
1490
  range->AddValueSize(iter->value->size());
@@ -60,8 +60,9 @@ struct ImmutableMemTableOptions {
60
60
  Statistics* statistics;
61
61
  MergeOperator* merge_operator;
62
62
  Logger* info_log;
63
- bool allow_data_in_errors;
64
63
  uint32_t protection_bytes_per_key;
64
+ bool allow_data_in_errors;
65
+ bool paranoid_memory_checks;
65
66
  };
66
67
 
67
68
  // Batched counters to updated when inserting keys in one write batch.
@@ -249,12 +250,14 @@ class MemTable {
249
250
  // If do_merge = true the default behavior which is Get value for key is
250
251
  // executed. Expected behavior is described right below.
251
252
  // If memtable contains a value for key, store it in *value and return true.
252
- // If memtable contains a deletion for key, store a NotFound() error
253
- // in *status and return true.
253
+ // If memtable contains a deletion for key, store NotFound() in *status and
254
+ // return true.
254
255
  // If memtable contains Merge operation as the most recent entry for a key,
255
256
  // and the merge process does not stop (not reaching a value or delete),
256
257
  // prepend the current merge operand to *operands.
257
258
  // store MergeInProgress in s, and return false.
259
+ // If an unexpected error or corruption occurs, store Corruption() or other
260
+ // error in *status and return true.
258
261
  // Else, return false.
259
262
  // If any operation was found, its most recent sequence number
260
263
  // will be stored in *seq on success (regardless of whether true/false is
@@ -264,6 +267,11 @@ class MemTable {
264
267
  // If do_merge = false then any Merge Operands encountered for key are simply
265
268
  // stored in merge_context.operands_list and never actually merged to get a
266
269
  // final value. The raw Merge Operands are eventually returned to the user.
270
+ // @param value If not null and memtable contains a value for key, `value`
271
+ // will be set to the result value.
272
+ // @param column If not null and memtable contains a value/WideColumn for key,
273
+ // `column` will be set to the result value/WideColumn.
274
+ // Note: only one of `value` and `column` can be non-nullptr.
267
275
  // @param immutable_memtable Whether this memtable is immutable. Used
268
276
  // internally by NewRangeTombstoneIterator(). See comment above
269
277
  // NewRangeTombstoneIterator() for more detail.
@@ -181,7 +181,8 @@ bool MemTableListVersion::GetFromList(
181
181
  }
182
182
 
183
183
  if (done) {
184
- assert(*seq != kMaxSequenceNumber || s->IsNotFound());
184
+ assert(*seq != kMaxSequenceNumber ||
185
+ (!s->ok() && !s->IsMergeInProgress()));
185
186
  return true;
186
187
  }
187
188
  if (!s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
@@ -558,11 +559,12 @@ Status MemTableList::TryInstallMemtableFlushResults(
558
559
  batch_file_number = m->file_number_;
559
560
  if (m->edit_.GetBlobFileAdditions().empty()) {
560
561
  ROCKS_LOG_BUFFER(log_buffer,
561
- "[%s] Level-0 commit table #%" PRIu64 " started",
562
+ "[%s] Level-0 commit flush result of table #%" PRIu64
563
+ " started",
562
564
  cfd->GetName().c_str(), m->file_number_);
563
565
  } else {
564
566
  ROCKS_LOG_BUFFER(log_buffer,
565
- "[%s] Level-0 commit table #%" PRIu64
567
+ "[%s] Level-0 commit flush result of table #%" PRIu64
566
568
  " (+%zu blob files) started",
567
569
  cfd->GetName().c_str(), m->file_number_,
568
570
  m->edit_.GetBlobFileAdditions().size());
@@ -757,12 +759,12 @@ void MemTableList::RemoveMemTablesOrRestoreFlags(
757
759
  MemTable* m = current_->memlist_.back();
758
760
  if (m->edit_.GetBlobFileAdditions().empty()) {
759
761
  ROCKS_LOG_BUFFER(log_buffer,
760
- "[%s] Level-0 commit table #%" PRIu64
762
+ "[%s] Level-0 commit flush result of table #%" PRIu64
761
763
  ": memtable #%" PRIu64 " done",
762
764
  cfd->GetName().c_str(), m->file_number_, mem_id);
763
765
  } else {
764
766
  ROCKS_LOG_BUFFER(log_buffer,
765
- "[%s] Level-0 commit table #%" PRIu64
767
+ "[%s] Level-0 commit flush result of table #%" PRIu64
766
768
  " (+%zu blob files)"
767
769
  ": memtable #%" PRIu64 " done",
768
770
  cfd->GetName().c_str(), m->file_number_,
@@ -287,6 +287,7 @@ TEST_F(MemTableListTest, GetTest) {
287
287
 
288
288
  // Fetch the newly written keys
289
289
  merge_context.Clear();
290
+ s = Status::OK();
290
291
  found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
291
292
  /*timestamp*/ nullptr, &s, &merge_context,
292
293
  &max_covering_tombstone_seq, ReadOptions(),
@@ -295,6 +296,7 @@ TEST_F(MemTableListTest, GetTest) {
295
296
  ASSERT_EQ(value, "value1");
296
297
 
297
298
  merge_context.Clear();
299
+ s = Status::OK();
298
300
  found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr,
299
301
  /*timestamp*/ nullptr, &s, &merge_context,
300
302
  &max_covering_tombstone_seq, ReadOptions(),
@@ -303,6 +305,7 @@ TEST_F(MemTableListTest, GetTest) {
303
305
  ASSERT_TRUE(found && s.IsNotFound());
304
306
 
305
307
  merge_context.Clear();
308
+ s = Status::OK();
306
309
  found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
307
310
  /*timestamp*/ nullptr, &s, &merge_context,
308
311
  &max_covering_tombstone_seq, ReadOptions(),
@@ -311,6 +314,7 @@ TEST_F(MemTableListTest, GetTest) {
311
314
  ASSERT_EQ(value, "value2.2");
312
315
 
313
316
  merge_context.Clear();
317
+ s = Status::OK();
314
318
  found = mem->Get(LookupKey("key3", seq), &value, /*columns*/ nullptr,
315
319
  /*timestamp*/ nullptr, &s, &merge_context,
316
320
  &max_covering_tombstone_seq, ReadOptions(),
@@ -350,6 +354,7 @@ TEST_F(MemTableListTest, GetTest) {
350
354
 
351
355
  // Fetch keys via MemTableList
352
356
  merge_context.Clear();
357
+ s = Status::OK();
353
358
  found =
354
359
  list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
355
360
  /*timestamp=*/nullptr, &s, &merge_context,
@@ -357,6 +362,7 @@ TEST_F(MemTableListTest, GetTest) {
357
362
  ASSERT_TRUE(found && s.IsNotFound());
358
363
 
359
364
  merge_context.Clear();
365
+ s = Status::OK();
360
366
  found = list.current()->Get(LookupKey("key1", saved_seq), &value,
361
367
  /*columns=*/nullptr, /*timestamp=*/nullptr, &s,
362
368
  &merge_context, &max_covering_tombstone_seq,
@@ -365,6 +371,7 @@ TEST_F(MemTableListTest, GetTest) {
365
371
  ASSERT_EQ("value1", value);
366
372
 
367
373
  merge_context.Clear();
374
+ s = Status::OK();
368
375
  found =
369
376
  list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
370
377
  /*timestamp=*/nullptr, &s, &merge_context,
@@ -373,12 +380,14 @@ TEST_F(MemTableListTest, GetTest) {
373
380
  ASSERT_EQ(value, "value2.3");
374
381
 
375
382
  merge_context.Clear();
383
+ s = Status::OK();
376
384
  found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr,
377
385
  /*timestamp=*/nullptr, &s, &merge_context,
378
386
  &max_covering_tombstone_seq, ReadOptions());
379
387
  ASSERT_FALSE(found);
380
388
 
381
389
  merge_context.Clear();
390
+ s = Status::OK();
382
391
  found =
383
392
  list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
384
393
  /*timestamp=*/nullptr, &s, &merge_context,
@@ -438,6 +447,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
438
447
 
439
448
  // Fetch the newly written keys
440
449
  merge_context.Clear();
450
+ s = Status::OK();
441
451
  found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
442
452
  /*timestamp*/ nullptr, &s, &merge_context,
443
453
  &max_covering_tombstone_seq, ReadOptions(),
@@ -446,6 +456,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
446
456
  ASSERT_TRUE(found && s.IsNotFound());
447
457
 
448
458
  merge_context.Clear();
459
+ s = Status::OK();
449
460
  found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
450
461
  /*timestamp*/ nullptr, &s, &merge_context,
451
462
  &max_covering_tombstone_seq, ReadOptions(),
@@ -462,6 +473,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
462
473
 
463
474
  // Fetch keys via MemTableList
464
475
  merge_context.Clear();
476
+ s = Status::OK();
465
477
  found =
466
478
  list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
467
479
  /*timestamp=*/nullptr, &s, &merge_context,
@@ -469,6 +481,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
469
481
  ASSERT_TRUE(found && s.IsNotFound());
470
482
 
471
483
  merge_context.Clear();
484
+ s = Status::OK();
472
485
  found =
473
486
  list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
474
487
  /*timestamp=*/nullptr, &s, &merge_context,
@@ -508,6 +521,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
508
521
 
509
522
  // Verify keys are present in history
510
523
  merge_context.Clear();
524
+ s = Status::OK();
511
525
  found = list.current()->GetFromHistory(
512
526
  LookupKey("key1", seq), &value, /*columns=*/nullptr,
513
527
  /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@@ -515,6 +529,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
515
529
  ASSERT_TRUE(found && s.IsNotFound());
516
530
 
517
531
  merge_context.Clear();
532
+ s = Status::OK();
518
533
  found = list.current()->GetFromHistory(
519
534
  LookupKey("key2", seq), &value, /*columns=*/nullptr,
520
535
  /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@@ -568,6 +583,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
568
583
 
569
584
  // Verify keys are no longer in MemTableList
570
585
  merge_context.Clear();
586
+ s = Status::OK();
571
587
  found =
572
588
  list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
573
589
  /*timestamp=*/nullptr, &s, &merge_context,
@@ -575,6 +591,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
575
591
  ASSERT_FALSE(found);
576
592
 
577
593
  merge_context.Clear();
594
+ s = Status::OK();
578
595
  found =
579
596
  list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
580
597
  /*timestamp=*/nullptr, &s, &merge_context,
@@ -582,6 +599,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
582
599
  ASSERT_FALSE(found);
583
600
 
584
601
  merge_context.Clear();
602
+ s = Status::OK();
585
603
  found =
586
604
  list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
587
605
  /*timestamp=*/nullptr, &s, &merge_context,
@@ -590,6 +608,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
590
608
 
591
609
  // Verify that the second memtable's keys are in the history
592
610
  merge_context.Clear();
611
+ s = Status::OK();
593
612
  found = list.current()->GetFromHistory(
594
613
  LookupKey("key1", seq), &value, /*columns=*/nullptr,
595
614
  /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@@ -597,6 +616,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
597
616
  ASSERT_TRUE(found && s.IsNotFound());
598
617
 
599
618
  merge_context.Clear();
619
+ s = Status::OK();
600
620
  found = list.current()->GetFromHistory(
601
621
  LookupKey("key3", seq), &value, /*columns=*/nullptr,
602
622
  /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
@@ -606,6 +626,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
606
626
 
607
627
  // Verify that key2 from the first memtable is no longer in the history
608
628
  merge_context.Clear();
629
+ s = Status::OK();
609
630
  found =
610
631
  list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
611
632
  /*timestamp=*/nullptr, &s, &merge_context,