@nxtedition/rocksdb 8.1.17 → 8.2.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/binding.cc +32 -2
  2. package/binding.gyp +8 -0
  3. package/deps/liburing/liburing.gyp +20 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
  5. package/deps/rocksdb/rocksdb/TARGETS +7 -0
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
  8. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
  9. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
  10. package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
  11. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
  12. package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
  18. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
  19. package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
  20. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
  21. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
  22. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
  23. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
  24. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
  25. package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
  26. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
  29. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
  30. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
  31. package/deps/rocksdb/rocksdb/db/c.cc +90 -1
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
  33. package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
  34. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
  42. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
  43. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
  44. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
  45. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
  52. package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
  53. package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
  54. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
  55. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
  56. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
  57. package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
  58. package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
  59. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
  60. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
  61. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
  62. package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
  63. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  64. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
  65. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
  66. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
  67. package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
  68. package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
  69. package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
  70. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  71. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
  72. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
  73. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
  74. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
  75. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
  76. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
  78. package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
  79. package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
  80. package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
  81. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
  82. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
  89. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
  91. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
  92. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
  93. package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
  94. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
  95. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
  96. package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
  97. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
  98. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
  99. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
  100. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
  101. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
  102. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
  103. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
  104. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
  105. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
  106. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
  108. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
  109. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
  110. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  113. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
  114. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
  115. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
  116. package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
  117. package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
  118. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
  119. package/deps/rocksdb/rocksdb/src.mk +4 -0
  120. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
  121. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
  122. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
  123. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
  124. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
  125. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
  126. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
  127. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
  128. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
  129. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
  131. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
  132. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
  133. package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
  134. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
  135. package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
  136. package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
  137. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
  138. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
  139. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
  140. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
  141. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
  142. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
  143. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
  144. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
  145. package/deps/rocksdb/rocksdb.gyp +7 -1
  146. package/package.json +1 -1
  147. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -104,15 +104,20 @@ class InternalStats {
104
104
  static const std::map<LevelStatType, LevelStat> compaction_level_stats;
105
105
 
106
106
  enum InternalCFStatsType {
107
- L0_FILE_COUNT_LIMIT_SLOWDOWNS,
108
- LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
107
+ MEMTABLE_LIMIT_DELAYS,
109
108
  MEMTABLE_LIMIT_STOPS,
110
- MEMTABLE_LIMIT_SLOWDOWNS,
109
+ L0_FILE_COUNT_LIMIT_DELAYS,
111
110
  L0_FILE_COUNT_LIMIT_STOPS,
112
- LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
113
- PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
111
+ PENDING_COMPACTION_BYTES_LIMIT_DELAYS,
114
112
  PENDING_COMPACTION_BYTES_LIMIT_STOPS,
113
+ // Write slowdown caused by l0 file count limit while there is ongoing L0
114
+ // compaction
115
+ L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION,
116
+ // Write stop caused by l0 file count limit while there is ongoing L0
117
+ // compaction
118
+ L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION,
115
119
  WRITE_STALLS_ENUM_MAX,
120
+ // End of all write stall stats
116
121
  BYTES_FLUSHED,
117
122
  BYTES_INGESTED_ADD_FILE,
118
123
  INGESTED_NUM_FILES_TOTAL,
@@ -129,7 +134,18 @@ class InternalStats {
129
134
  kIntStatsWriteDoneByOther,
130
135
  kIntStatsWriteDoneBySelf,
131
136
  kIntStatsWriteWithWal,
137
+ // TODO(hx235): Currently `kIntStatsWriteStallMicros` only measures
138
+ // "delayed" time of CF-scope write stalls, not including the "stopped" time
139
+ // nor any DB-scope write stalls (e.g., ones triggered by
140
+ // `WriteBufferManager`).
141
+ //
142
+ // However, the word "write stall" includes both "delayed" and "stopped"
143
+ // (see `WriteStallCondition`) and DB-scope write stalls (see
144
+ // `WriteStallCause`).
145
+ //
146
+ // So we should improve, rename or clarify it
132
147
  kIntStatsWriteStallMicros,
148
+ kIntStatsWriteBufferManagerLimitStopsCounts,
133
149
  kIntStatsNumMax,
134
150
  };
135
151
 
@@ -599,6 +615,10 @@ class InternalStats {
599
615
  private:
600
616
  void DumpDBMapStats(std::map<std::string, std::string>* db_stats);
601
617
  void DumpDBStats(std::string* value);
618
+
619
+ void DumpDBMapStatsWriteStall(std::map<std::string, std::string>* value);
620
+ void DumpDBStatsWriteStall(std::string* value);
621
+
602
622
  void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
603
623
  void DumpCFMapStats(
604
624
  const VersionStorageInfo* vstorage,
@@ -606,7 +626,6 @@ class InternalStats {
606
626
  CompactionStats* compaction_stats_sum);
607
627
  void DumpCFMapStatsByPriority(
608
628
  std::map<int, std::map<LevelStatType, double>>* priorities_stats);
609
- void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
610
629
  void DumpCFStats(std::string* value);
611
630
  // if is_periodic = true, it is an internal call by RocksDB periodically to
612
631
  // dump the status.
@@ -615,6 +634,10 @@ class InternalStats {
615
634
  // dump the status.
616
635
  void DumpCFFileHistogram(std::string* value);
617
636
 
637
+ void DumpCFMapStatsWriteStall(std::map<std::string, std::string>* value);
638
+ void DumpCFStatsWriteStall(std::string* value,
639
+ uint64_t* total_stall_count = nullptr);
640
+
618
641
  Cache* GetBlockCacheForStats();
619
642
  Cache* GetBlobCacheForStats();
620
643
 
@@ -648,7 +671,7 @@ class InternalStats {
648
671
  // ColumnFamily-level stats
649
672
  CompactionStats comp_stats;
650
673
  uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush)
651
- uint64_t stall_count; // Stall count
674
+ uint64_t stall_count; // Total counts of CF-scope write stalls
652
675
  // Stats from compaction jobs - bytes written, bytes read, duration.
653
676
  uint64_t compact_bytes_write;
654
677
  uint64_t compact_bytes_read;
@@ -743,9 +766,15 @@ class InternalStats {
743
766
  bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
744
767
  bool HandleCFFileHistogram(std::string* value, Slice suffix);
745
768
  bool HandleCFStatsPeriodic(std::string* value, Slice suffix);
769
+ bool HandleCFWriteStallStats(std::string* value, Slice suffix);
770
+ bool HandleCFWriteStallStatsMap(std::map<std::string, std::string>* values,
771
+ Slice suffix);
746
772
  bool HandleDBMapStats(std::map<std::string, std::string>* compaction_stats,
747
773
  Slice suffix);
748
774
  bool HandleDBStats(std::string* value, Slice suffix);
775
+ bool HandleDBWriteStallStats(std::string* value, Slice suffix);
776
+ bool HandleDBWriteStallStatsMap(std::map<std::string, std::string>* values,
777
+ Slice suffix);
749
778
  bool HandleSsTables(std::string* value, Slice suffix);
750
779
  bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
751
780
  bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix);
@@ -1078,7 +1078,7 @@ static bool SaveValue(void* arg, const char* entry) {
1078
1078
  *(s->value) = std::move(result);
1079
1079
  } else {
1080
1080
  assert(s->columns);
1081
- s->columns->SetPlainValue(result);
1081
+ s->columns->SetPlainValue(std::move(result));
1082
1082
  }
1083
1083
  }
1084
1084
  }
@@ -1152,7 +1152,7 @@ static bool SaveValue(void* arg, const char* entry) {
1152
1152
  /* op_failure_scope */ nullptr);
1153
1153
 
1154
1154
  if (s->status->ok()) {
1155
- *(s->status) = s->columns->SetWideColumnValue(result);
1155
+ *(s->status) = s->columns->SetWideColumnValue(std::move(result));
1156
1156
  }
1157
1157
  }
1158
1158
  } else if (s->value) {
@@ -1200,7 +1200,7 @@ static bool SaveValue(void* arg, const char* entry) {
1200
1200
  *(s->value) = std::move(result);
1201
1201
  } else {
1202
1202
  assert(s->columns);
1203
- s->columns->SetPlainValue(result);
1203
+ s->columns->SetPlainValue(std::move(result));
1204
1204
  }
1205
1205
  }
1206
1206
  } else {
@@ -1230,6 +1230,8 @@ static bool SaveValue(void* arg, const char* entry) {
1230
1230
  *(s->merge_in_progress) = true;
1231
1231
  merge_context->PushOperand(
1232
1232
  v, s->inplace_update_support == false /* operand_pinned */);
1233
+ PERF_COUNTER_ADD(internal_merge_point_lookup_count, 1);
1234
+
1233
1235
  if (s->do_merge && merge_operator->ShouldMerge(
1234
1236
  merge_context->GetOperandsDirectionBackward())) {
1235
1237
  if (s->value || s->columns) {
@@ -1249,7 +1251,7 @@ static bool SaveValue(void* arg, const char* entry) {
1249
1251
  *(s->value) = std::move(result);
1250
1252
  } else {
1251
1253
  assert(s->columns);
1252
- s->columns->SetPlainValue(result);
1254
+ s->columns->SetPlainValue(std::move(result));
1253
1255
  }
1254
1256
  }
1255
1257
  }
@@ -231,6 +231,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter,
231
231
  s = Status::ShutdownInProgress();
232
232
  return s;
233
233
  }
234
+ // Skip range tombstones emitted by the compaction iterator.
235
+ if (iter->IsDeleteRangeSentinelKey()) {
236
+ continue;
237
+ }
234
238
 
235
239
  ParsedInternalKey ikey;
236
240
  assert(keys_.size() == merge_context_.GetNumOperands());
@@ -964,6 +964,157 @@ TEST_F(PerfContextTest, CPUTimer) {
964
964
  ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
965
965
  }
966
966
  }
967
+
968
+ TEST_F(PerfContextTest, MergeOperandCount) {
969
+ ASSERT_OK(DestroyDB(kDbName, Options()));
970
+
971
+ DB* db = nullptr;
972
+ Options options;
973
+ options.create_if_missing = true;
974
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
975
+
976
+ ASSERT_OK(DB::Open(options, kDbName, &db));
977
+ std::unique_ptr<DB> db_guard(db);
978
+
979
+ constexpr size_t num_keys = 3;
980
+ const std::string key_prefix("key");
981
+ const std::string value_prefix("value");
982
+
983
+ std::vector<std::string> keys;
984
+ keys.reserve(num_keys);
985
+
986
+ for (size_t i = 0; i < num_keys; ++i) {
987
+ keys.emplace_back(key_prefix + std::to_string(i));
988
+ }
989
+
990
+ // Write three keys with one Put each followed by 1, 2, and 3
991
+ // Merge operations respectively.
992
+ constexpr size_t total_merges = num_keys * (num_keys + 1) / 2;
993
+
994
+ std::vector<ManagedSnapshot> snapshots;
995
+ snapshots.reserve(total_merges);
996
+
997
+ for (size_t i = 0; i < num_keys; ++i) {
998
+ const std::string suffix = std::to_string(i);
999
+ const std::string value = value_prefix + suffix;
1000
+
1001
+ ASSERT_OK(db->Put(WriteOptions(), keys[i], value));
1002
+
1003
+ for (size_t j = 0; j <= i; ++j) {
1004
+ // Take a snapshot before each Merge so they are preserved and not
1005
+ // collapsed during flush.
1006
+ snapshots.emplace_back(db);
1007
+
1008
+ ASSERT_OK(db->Merge(WriteOptions(), keys[i], value + std::to_string(j)));
1009
+ }
1010
+ }
1011
+
1012
+ auto verify = [&]() {
1013
+ get_perf_context()->Reset();
1014
+
1015
+ for (size_t i = 0; i < num_keys; ++i) {
1016
+ // Get
1017
+ {
1018
+ PinnableSlice result;
1019
+ ASSERT_OK(db->Get(ReadOptions(), db->DefaultColumnFamily(), keys[i],
1020
+ &result));
1021
+ ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count, i + 1);
1022
+
1023
+ get_perf_context()->Reset();
1024
+ }
1025
+
1026
+ // GetEntity
1027
+ {
1028
+ PinnableWideColumns result;
1029
+ ASSERT_OK(db->GetEntity(ReadOptions(), db->DefaultColumnFamily(),
1030
+ keys[i], &result));
1031
+ ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count, i + 1);
1032
+
1033
+ get_perf_context()->Reset();
1034
+ }
1035
+ }
1036
+
1037
+ {
1038
+ std::vector<Slice> key_slices;
1039
+ key_slices.reserve(num_keys);
1040
+
1041
+ for (size_t i = 0; i < num_keys; ++i) {
1042
+ key_slices.emplace_back(keys[i]);
1043
+ }
1044
+
1045
+ // MultiGet
1046
+ {
1047
+ std::vector<PinnableSlice> results(num_keys);
1048
+ std::vector<Status> statuses(num_keys);
1049
+
1050
+ db->MultiGet(ReadOptions(), db->DefaultColumnFamily(), num_keys,
1051
+ &key_slices[0], &results[0], &statuses[0]);
1052
+
1053
+ for (size_t i = 0; i < num_keys; ++i) {
1054
+ ASSERT_OK(statuses[i]);
1055
+ }
1056
+
1057
+ ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count,
1058
+ total_merges);
1059
+
1060
+ get_perf_context()->Reset();
1061
+ }
1062
+
1063
+ // MultiGetEntity
1064
+ {
1065
+ std::vector<PinnableWideColumns> results(num_keys);
1066
+ std::vector<Status> statuses(num_keys);
1067
+
1068
+ db->MultiGetEntity(ReadOptions(), db->DefaultColumnFamily(), num_keys,
1069
+ &key_slices[0], &results[0], &statuses[0]);
1070
+
1071
+ for (size_t i = 0; i < num_keys; ++i) {
1072
+ ASSERT_OK(statuses[i]);
1073
+ }
1074
+
1075
+ ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count,
1076
+ total_merges);
1077
+
1078
+ get_perf_context()->Reset();
1079
+ }
1080
+ }
1081
+
1082
+ std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
1083
+
1084
+ // Forward iteration
1085
+ {
1086
+ size_t i = 0;
1087
+
1088
+ for (it->SeekToFirst(); it->Valid(); it->Next(), ++i) {
1089
+ ASSERT_EQ(it->key(), keys[i]);
1090
+ ASSERT_EQ(get_perf_context()->internal_merge_count, i + 1);
1091
+
1092
+ get_perf_context()->Reset();
1093
+ }
1094
+ }
1095
+
1096
+ // Backward iteration
1097
+ {
1098
+ size_t i = num_keys - 1;
1099
+
1100
+ for (it->SeekToLast(); it->Valid(); it->Prev(), --i) {
1101
+ ASSERT_EQ(it->key(), keys[i]);
1102
+ ASSERT_EQ(get_perf_context()->internal_merge_count, i + 1);
1103
+
1104
+ get_perf_context()->Reset();
1105
+ }
1106
+ }
1107
+ };
1108
+
1109
+ // Verify counters when reading from memtable
1110
+ verify();
1111
+
1112
+ // Verify counters when reading from table files
1113
+ db->Flush(FlushOptions());
1114
+
1115
+ verify();
1116
+ }
1117
+
967
1118
  } // namespace ROCKSDB_NAMESPACE
968
1119
 
969
1120
  int main(int argc, char** argv) {
@@ -30,12 +30,15 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
30
30
  icmp_(icmp),
31
31
  smallest_ikey_(smallest),
32
32
  largest_ikey_(largest) {
33
+ // Set up bounds such that range tombstones from this iterator are
34
+ // truncated to range [smallest_, largest_).
33
35
  if (smallest != nullptr) {
34
36
  pinned_bounds_.emplace_back();
35
37
  auto& parsed_smallest = pinned_bounds_.back();
36
38
  Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
37
39
  false /* log_err_key */); // TODO
38
40
  pik_status.PermitUncheckedError();
41
+ parsed_smallest.type = kTypeMaxValid;
39
42
  assert(pik_status.ok());
40
43
  smallest_ = &parsed_smallest;
41
44
  }
@@ -63,6 +66,8 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
63
66
  //
64
67
  // Therefore, we will never truncate a range tombstone at largest, so we
65
68
  // can leave it unchanged.
69
+ // TODO: maybe use kTypeMaxValid here to ensure range tombstones have
70
+ // distinct key from point keys.
66
71
  } else {
67
72
  // The same user key may straddle two sstable boundaries. To ensure that
68
73
  // the truncated end key can cover the largest key in this sstable, reduce
@@ -70,7 +75,7 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
70
75
  parsed_largest.sequence -= 1;
71
76
  // This line is not needed for correctness, but it ensures that the
72
77
  // truncated end key is not covering keys from the next SST file.
73
- parsed_largest.type = kValueTypeForSeek;
78
+ parsed_largest.type = kTypeMaxValid;
74
79
  }
75
80
  largest_ = &parsed_largest;
76
81
  }
@@ -101,6 +106,24 @@ void TruncatedRangeDelIterator::Seek(const Slice& target) {
101
106
  iter_->Seek(target);
102
107
  }
103
108
 
109
+ void TruncatedRangeDelIterator::SeekInternalKey(const Slice& target) {
110
+ if (largest_ && icmp_->Compare(*largest_, target) <= 0) {
111
+ iter_->Invalidate();
112
+ return;
113
+ }
114
+ if (smallest_ && icmp_->Compare(target, *smallest_) < 0) {
115
+ // Since target < smallest, target < largest_.
116
+ // This seek must land on a range tombstone where end_key() > target,
117
+ // so there is no need to check again.
118
+ iter_->Seek(smallest_->user_key);
119
+ } else {
120
+ iter_->Seek(ExtractUserKey(target));
121
+ while (Valid() && icmp_->Compare(end_key(), target) <= 0) {
122
+ Next();
123
+ }
124
+ }
125
+ }
126
+
104
127
  // NOTE: target is a user key, with timestamp if enabled.
105
128
  void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
106
129
  if (smallest_ != nullptr &&
@@ -393,21 +416,20 @@ bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
393
416
  namespace {
394
417
 
395
418
  // Produce a sorted (by start internal key) stream of range tombstones from
396
- // `children`. lower_bound and upper_bound on user key can be
419
+ // `children`. lower_bound and upper_bound on internal key can be
397
420
  // optionally specified. Range tombstones that end before lower_bound or start
398
421
  // after upper_bound are excluded.
399
422
  // If user-defined timestamp is enabled, lower_bound and upper_bound should
400
- // contain timestamp, but comparison is done ignoring timestamps.
423
+ // contain timestamp.
401
424
  class TruncatedRangeDelMergingIter : public InternalIterator {
402
425
  public:
403
426
  TruncatedRangeDelMergingIter(
404
427
  const InternalKeyComparator* icmp, const Slice* lower_bound,
405
- const Slice* upper_bound, bool upper_bound_inclusive,
428
+ const Slice* upper_bound,
406
429
  const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
407
430
  : icmp_(icmp),
408
431
  lower_bound_(lower_bound),
409
432
  upper_bound_(upper_bound),
410
- upper_bound_inclusive_(upper_bound_inclusive),
411
433
  heap_(StartKeyMinComparator(icmp)),
412
434
  ts_sz_(icmp_->user_comparator()->timestamp_size()) {
413
435
  for (auto& child : children) {
@@ -420,7 +442,7 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
420
442
  }
421
443
 
422
444
  bool Valid() const override {
423
- return !heap_.empty() && BeforeEndKey(heap_.top());
445
+ return !heap_.empty() && !AfterEndKey(heap_.top());
424
446
  }
425
447
  Status status() const override { return Status::OK(); }
426
448
 
@@ -428,7 +450,13 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
428
450
  heap_.clear();
429
451
  for (auto& child : children_) {
430
452
  if (lower_bound_ != nullptr) {
431
- child->Seek(*lower_bound_);
453
+ child->Seek(ExtractUserKey(*lower_bound_));
454
+ // Since the above `Seek()` operates on a user key while `lower_bound_`
455
+ // is an internal key, we may need to advance `child` farther for it to
456
+ // be in bounds.
457
+ while (child->Valid() && BeforeStartKey(child)) {
458
+ child->InternalNext();
459
+ }
432
460
  } else {
433
461
  child->SeekToFirst();
434
462
  }
@@ -481,19 +509,23 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
481
509
  void SeekToLast() override { assert(false); }
482
510
 
483
511
  private:
484
- bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
512
+ bool BeforeStartKey(const TruncatedRangeDelIterator* iter) const {
513
+ if (lower_bound_ == nullptr) {
514
+ return false;
515
+ }
516
+ return icmp_->Compare(iter->end_key(), *lower_bound_) <= 0;
517
+ }
518
+
519
+ bool AfterEndKey(const TruncatedRangeDelIterator* iter) const {
485
520
  if (upper_bound_ == nullptr) {
486
- return true;
521
+ return false;
487
522
  }
488
- int cmp = icmp_->user_comparator()->CompareWithoutTimestamp(
489
- iter->start_key().user_key, *upper_bound_);
490
- return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
523
+ return icmp_->Compare(iter->start_key(), *upper_bound_) > 0;
491
524
  }
492
525
 
493
526
  const InternalKeyComparator* icmp_;
494
527
  const Slice* lower_bound_;
495
528
  const Slice* upper_bound_;
496
- bool upper_bound_inclusive_;
497
529
  BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
498
530
  std::vector<TruncatedRangeDelIterator*> children_;
499
531
 
@@ -506,11 +538,10 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
506
538
 
507
539
  std::unique_ptr<FragmentedRangeTombstoneIterator>
508
540
  CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
509
- const Slice* upper_bound,
510
- bool upper_bound_inclusive) {
541
+ const Slice* upper_bound) {
511
542
  InvalidateRangeDelMapPositions();
512
543
  auto merging_iter = std::make_unique<TruncatedRangeDelMergingIter>(
513
- icmp_, lower_bound, upper_bound, upper_bound_inclusive, parent_iters_);
544
+ icmp_, lower_bound, upper_bound, parent_iters_);
514
545
 
515
546
  auto fragmented_tombstone_list =
516
547
  std::make_shared<FragmentedRangeTombstoneList>(
@@ -49,6 +49,9 @@ class TruncatedRangeDelIterator {
49
49
  // REQUIRES: target is a user key.
50
50
  void Seek(const Slice& target);
51
51
 
52
+ // Seeks to the first range tombstone with end_key() > target.
53
+ void SeekInternalKey(const Slice& target);
54
+
52
55
  // Seeks to the tombstone with the highest visible sequence number that covers
53
56
  // target (a user key). If no such tombstone exists, the position will be at
54
57
  // the latest tombstone that starts before target.
@@ -452,16 +455,15 @@ class CompactionRangeDelAggregator : public RangeDelAggregator {
452
455
  }
453
456
 
454
457
  // Creates an iterator over all the range tombstones in the aggregator, for
455
- // use in compaction. Nullptr arguments indicate that the iterator range is
456
- // unbounded.
457
- // NOTE: the boundaries are used for optimization purposes to reduce the
458
- // number of tombstones that are passed to the fragmenter; they do not
459
- // guarantee that the resulting iterator only contains range tombstones that
460
- // cover keys in the provided range. If required, these bounds must be
458
+ // use in compaction.
459
+ //
460
+ // NOTE: the internal key boundaries are used for optimization purposes to
461
+ // reduce the number of tombstones that are passed to the fragmenter; they do
462
+ // not guarantee that the resulting iterator only contains range tombstones
463
+ // that cover keys in the provided range. If required, these bounds must be
461
464
  // enforced during iteration.
462
465
  std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
463
- const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr,
464
- bool upper_bound_inclusive = false);
466
+ const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr);
465
467
 
466
468
  private:
467
469
  std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;