@nxtedition/rocksdb 8.1.17 → 8.2.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +32 -2
- package/binding.gyp +8 -0
- package/deps/liburing/liburing.gyp +20 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
- package/deps/rocksdb/rocksdb/TARGETS +7 -0
- package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
- package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
- package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
- package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
- package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
- package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
- package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
- package/deps/rocksdb/rocksdb/db/c.cc +90 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
- package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
- package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
- package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
- package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
- package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
- package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
- package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
- package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
- package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
- package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
- package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
- package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
- package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
- package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
- package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
- package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
- package/deps/rocksdb/rocksdb/src.mk +4 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
- package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
- package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
- package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
- package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
- package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
- package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
- package/deps/rocksdb/rocksdb.gyp +7 -1
- package/package.json +1 -1
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -104,15 +104,20 @@ class InternalStats {
|
|
|
104
104
|
static const std::map<LevelStatType, LevelStat> compaction_level_stats;
|
|
105
105
|
|
|
106
106
|
enum InternalCFStatsType {
|
|
107
|
-
|
|
108
|
-
LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
|
|
107
|
+
MEMTABLE_LIMIT_DELAYS,
|
|
109
108
|
MEMTABLE_LIMIT_STOPS,
|
|
110
|
-
|
|
109
|
+
L0_FILE_COUNT_LIMIT_DELAYS,
|
|
111
110
|
L0_FILE_COUNT_LIMIT_STOPS,
|
|
112
|
-
|
|
113
|
-
PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
|
|
111
|
+
PENDING_COMPACTION_BYTES_LIMIT_DELAYS,
|
|
114
112
|
PENDING_COMPACTION_BYTES_LIMIT_STOPS,
|
|
113
|
+
// Write slowdown caused by L0 file count limit while there is ongoing L0
|
|
114
|
+
// compaction
|
|
115
|
+
L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION,
|
|
116
|
+
// Write stop caused by L0 file count limit while there is ongoing L0
|
|
117
|
+
// compaction
|
|
118
|
+
L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION,
|
|
115
119
|
WRITE_STALLS_ENUM_MAX,
|
|
120
|
+
// End of all write stall stats
|
|
116
121
|
BYTES_FLUSHED,
|
|
117
122
|
BYTES_INGESTED_ADD_FILE,
|
|
118
123
|
INGESTED_NUM_FILES_TOTAL,
|
|
@@ -129,7 +134,18 @@ class InternalStats {
|
|
|
129
134
|
kIntStatsWriteDoneByOther,
|
|
130
135
|
kIntStatsWriteDoneBySelf,
|
|
131
136
|
kIntStatsWriteWithWal,
|
|
137
|
+
// TODO(hx235): Currently `kIntStatsWriteStallMicros` only measures
|
|
138
|
+
// "delayed" time of CF-scope write stalls, not including the "stopped" time
|
|
139
|
+
// nor any DB-scope write stalls (e.g., ones triggered by
|
|
140
|
+
// `WriteBufferManager`).
|
|
141
|
+
//
|
|
142
|
+
// However, the word "write stall" includes both "delayed" and "stopped"
|
|
143
|
+
// (see `WriteStallCondition`) and DB-scope write stalls (see
|
|
144
|
+
// `WriteStallCause`).
|
|
145
|
+
//
|
|
146
|
+
// So we should improve, rename or clarify it
|
|
132
147
|
kIntStatsWriteStallMicros,
|
|
148
|
+
kIntStatsWriteBufferManagerLimitStopsCounts,
|
|
133
149
|
kIntStatsNumMax,
|
|
134
150
|
};
|
|
135
151
|
|
|
@@ -599,6 +615,10 @@ class InternalStats {
|
|
|
599
615
|
private:
|
|
600
616
|
void DumpDBMapStats(std::map<std::string, std::string>* db_stats);
|
|
601
617
|
void DumpDBStats(std::string* value);
|
|
618
|
+
|
|
619
|
+
void DumpDBMapStatsWriteStall(std::map<std::string, std::string>* value);
|
|
620
|
+
void DumpDBStatsWriteStall(std::string* value);
|
|
621
|
+
|
|
602
622
|
void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
|
|
603
623
|
void DumpCFMapStats(
|
|
604
624
|
const VersionStorageInfo* vstorage,
|
|
@@ -606,7 +626,6 @@ class InternalStats {
|
|
|
606
626
|
CompactionStats* compaction_stats_sum);
|
|
607
627
|
void DumpCFMapStatsByPriority(
|
|
608
628
|
std::map<int, std::map<LevelStatType, double>>* priorities_stats);
|
|
609
|
-
void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
|
|
610
629
|
void DumpCFStats(std::string* value);
|
|
611
630
|
// if is_periodic = true, it is an internal call by RocksDB periodically to
|
|
612
631
|
// dump the status.
|
|
@@ -615,6 +634,10 @@ class InternalStats {
|
|
|
615
634
|
// dump the status.
|
|
616
635
|
void DumpCFFileHistogram(std::string* value);
|
|
617
636
|
|
|
637
|
+
void DumpCFMapStatsWriteStall(std::map<std::string, std::string>* value);
|
|
638
|
+
void DumpCFStatsWriteStall(std::string* value,
|
|
639
|
+
uint64_t* total_stall_count = nullptr);
|
|
640
|
+
|
|
618
641
|
Cache* GetBlockCacheForStats();
|
|
619
642
|
Cache* GetBlobCacheForStats();
|
|
620
643
|
|
|
@@ -648,7 +671,7 @@ class InternalStats {
|
|
|
648
671
|
// ColumnFamily-level stats
|
|
649
672
|
CompactionStats comp_stats;
|
|
650
673
|
uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush)
|
|
651
|
-
uint64_t stall_count; //
|
|
674
|
+
uint64_t stall_count; // Total counts of CF-scope write stalls
|
|
652
675
|
// Stats from compaction jobs - bytes written, bytes read, duration.
|
|
653
676
|
uint64_t compact_bytes_write;
|
|
654
677
|
uint64_t compact_bytes_read;
|
|
@@ -743,9 +766,15 @@ class InternalStats {
|
|
|
743
766
|
bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
|
|
744
767
|
bool HandleCFFileHistogram(std::string* value, Slice suffix);
|
|
745
768
|
bool HandleCFStatsPeriodic(std::string* value, Slice suffix);
|
|
769
|
+
bool HandleCFWriteStallStats(std::string* value, Slice suffix);
|
|
770
|
+
bool HandleCFWriteStallStatsMap(std::map<std::string, std::string>* values,
|
|
771
|
+
Slice suffix);
|
|
746
772
|
bool HandleDBMapStats(std::map<std::string, std::string>* compaction_stats,
|
|
747
773
|
Slice suffix);
|
|
748
774
|
bool HandleDBStats(std::string* value, Slice suffix);
|
|
775
|
+
bool HandleDBWriteStallStats(std::string* value, Slice suffix);
|
|
776
|
+
bool HandleDBWriteStallStatsMap(std::map<std::string, std::string>* values,
|
|
777
|
+
Slice suffix);
|
|
749
778
|
bool HandleSsTables(std::string* value, Slice suffix);
|
|
750
779
|
bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
|
|
751
780
|
bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix);
|
|
@@ -1078,7 +1078,7 @@ static bool SaveValue(void* arg, const char* entry) {
|
|
|
1078
1078
|
*(s->value) = std::move(result);
|
|
1079
1079
|
} else {
|
|
1080
1080
|
assert(s->columns);
|
|
1081
|
-
s->columns->SetPlainValue(result);
|
|
1081
|
+
s->columns->SetPlainValue(std::move(result));
|
|
1082
1082
|
}
|
|
1083
1083
|
}
|
|
1084
1084
|
}
|
|
@@ -1152,7 +1152,7 @@ static bool SaveValue(void* arg, const char* entry) {
|
|
|
1152
1152
|
/* op_failure_scope */ nullptr);
|
|
1153
1153
|
|
|
1154
1154
|
if (s->status->ok()) {
|
|
1155
|
-
*(s->status) = s->columns->SetWideColumnValue(result);
|
|
1155
|
+
*(s->status) = s->columns->SetWideColumnValue(std::move(result));
|
|
1156
1156
|
}
|
|
1157
1157
|
}
|
|
1158
1158
|
} else if (s->value) {
|
|
@@ -1200,7 +1200,7 @@ static bool SaveValue(void* arg, const char* entry) {
|
|
|
1200
1200
|
*(s->value) = std::move(result);
|
|
1201
1201
|
} else {
|
|
1202
1202
|
assert(s->columns);
|
|
1203
|
-
s->columns->SetPlainValue(result);
|
|
1203
|
+
s->columns->SetPlainValue(std::move(result));
|
|
1204
1204
|
}
|
|
1205
1205
|
}
|
|
1206
1206
|
} else {
|
|
@@ -1230,6 +1230,8 @@ static bool SaveValue(void* arg, const char* entry) {
|
|
|
1230
1230
|
*(s->merge_in_progress) = true;
|
|
1231
1231
|
merge_context->PushOperand(
|
|
1232
1232
|
v, s->inplace_update_support == false /* operand_pinned */);
|
|
1233
|
+
PERF_COUNTER_ADD(internal_merge_point_lookup_count, 1);
|
|
1234
|
+
|
|
1233
1235
|
if (s->do_merge && merge_operator->ShouldMerge(
|
|
1234
1236
|
merge_context->GetOperandsDirectionBackward())) {
|
|
1235
1237
|
if (s->value || s->columns) {
|
|
@@ -1249,7 +1251,7 @@ static bool SaveValue(void* arg, const char* entry) {
|
|
|
1249
1251
|
*(s->value) = std::move(result);
|
|
1250
1252
|
} else {
|
|
1251
1253
|
assert(s->columns);
|
|
1252
|
-
s->columns->SetPlainValue(result);
|
|
1254
|
+
s->columns->SetPlainValue(std::move(result));
|
|
1253
1255
|
}
|
|
1254
1256
|
}
|
|
1255
1257
|
}
|
|
@@ -231,6 +231,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter,
|
|
|
231
231
|
s = Status::ShutdownInProgress();
|
|
232
232
|
return s;
|
|
233
233
|
}
|
|
234
|
+
// Skip range tombstones emitted by the compaction iterator.
|
|
235
|
+
if (iter->IsDeleteRangeSentinelKey()) {
|
|
236
|
+
continue;
|
|
237
|
+
}
|
|
234
238
|
|
|
235
239
|
ParsedInternalKey ikey;
|
|
236
240
|
assert(keys_.size() == merge_context_.GetNumOperands());
|
|
@@ -964,6 +964,157 @@ TEST_F(PerfContextTest, CPUTimer) {
|
|
|
964
964
|
ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
|
|
965
965
|
}
|
|
966
966
|
}
|
|
967
|
+
|
|
968
|
+
TEST_F(PerfContextTest, MergeOperandCount) {
|
|
969
|
+
ASSERT_OK(DestroyDB(kDbName, Options()));
|
|
970
|
+
|
|
971
|
+
DB* db = nullptr;
|
|
972
|
+
Options options;
|
|
973
|
+
options.create_if_missing = true;
|
|
974
|
+
options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
975
|
+
|
|
976
|
+
ASSERT_OK(DB::Open(options, kDbName, &db));
|
|
977
|
+
std::unique_ptr<DB> db_guard(db);
|
|
978
|
+
|
|
979
|
+
constexpr size_t num_keys = 3;
|
|
980
|
+
const std::string key_prefix("key");
|
|
981
|
+
const std::string value_prefix("value");
|
|
982
|
+
|
|
983
|
+
std::vector<std::string> keys;
|
|
984
|
+
keys.reserve(num_keys);
|
|
985
|
+
|
|
986
|
+
for (size_t i = 0; i < num_keys; ++i) {
|
|
987
|
+
keys.emplace_back(key_prefix + std::to_string(i));
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
// Write three keys with one Put each followed by 1, 2, and 3
|
|
991
|
+
// Merge operations respectively.
|
|
992
|
+
constexpr size_t total_merges = num_keys * (num_keys + 1) / 2;
|
|
993
|
+
|
|
994
|
+
std::vector<ManagedSnapshot> snapshots;
|
|
995
|
+
snapshots.reserve(total_merges);
|
|
996
|
+
|
|
997
|
+
for (size_t i = 0; i < num_keys; ++i) {
|
|
998
|
+
const std::string suffix = std::to_string(i);
|
|
999
|
+
const std::string value = value_prefix + suffix;
|
|
1000
|
+
|
|
1001
|
+
ASSERT_OK(db->Put(WriteOptions(), keys[i], value));
|
|
1002
|
+
|
|
1003
|
+
for (size_t j = 0; j <= i; ++j) {
|
|
1004
|
+
// Take a snapshot before each Merge so they are preserved and not
|
|
1005
|
+
// collapsed during flush.
|
|
1006
|
+
snapshots.emplace_back(db);
|
|
1007
|
+
|
|
1008
|
+
ASSERT_OK(db->Merge(WriteOptions(), keys[i], value + std::to_string(j)));
|
|
1009
|
+
}
|
|
1010
|
+
}
|
|
1011
|
+
|
|
1012
|
+
auto verify = [&]() {
|
|
1013
|
+
get_perf_context()->Reset();
|
|
1014
|
+
|
|
1015
|
+
for (size_t i = 0; i < num_keys; ++i) {
|
|
1016
|
+
// Get
|
|
1017
|
+
{
|
|
1018
|
+
PinnableSlice result;
|
|
1019
|
+
ASSERT_OK(db->Get(ReadOptions(), db->DefaultColumnFamily(), keys[i],
|
|
1020
|
+
&result));
|
|
1021
|
+
ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count, i + 1);
|
|
1022
|
+
|
|
1023
|
+
get_perf_context()->Reset();
|
|
1024
|
+
}
|
|
1025
|
+
|
|
1026
|
+
// GetEntity
|
|
1027
|
+
{
|
|
1028
|
+
PinnableWideColumns result;
|
|
1029
|
+
ASSERT_OK(db->GetEntity(ReadOptions(), db->DefaultColumnFamily(),
|
|
1030
|
+
keys[i], &result));
|
|
1031
|
+
ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count, i + 1);
|
|
1032
|
+
|
|
1033
|
+
get_perf_context()->Reset();
|
|
1034
|
+
}
|
|
1035
|
+
}
|
|
1036
|
+
|
|
1037
|
+
{
|
|
1038
|
+
std::vector<Slice> key_slices;
|
|
1039
|
+
key_slices.reserve(num_keys);
|
|
1040
|
+
|
|
1041
|
+
for (size_t i = 0; i < num_keys; ++i) {
|
|
1042
|
+
key_slices.emplace_back(keys[i]);
|
|
1043
|
+
}
|
|
1044
|
+
|
|
1045
|
+
// MultiGet
|
|
1046
|
+
{
|
|
1047
|
+
std::vector<PinnableSlice> results(num_keys);
|
|
1048
|
+
std::vector<Status> statuses(num_keys);
|
|
1049
|
+
|
|
1050
|
+
db->MultiGet(ReadOptions(), db->DefaultColumnFamily(), num_keys,
|
|
1051
|
+
&key_slices[0], &results[0], &statuses[0]);
|
|
1052
|
+
|
|
1053
|
+
for (size_t i = 0; i < num_keys; ++i) {
|
|
1054
|
+
ASSERT_OK(statuses[i]);
|
|
1055
|
+
}
|
|
1056
|
+
|
|
1057
|
+
ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count,
|
|
1058
|
+
total_merges);
|
|
1059
|
+
|
|
1060
|
+
get_perf_context()->Reset();
|
|
1061
|
+
}
|
|
1062
|
+
|
|
1063
|
+
// MultiGetEntity
|
|
1064
|
+
{
|
|
1065
|
+
std::vector<PinnableWideColumns> results(num_keys);
|
|
1066
|
+
std::vector<Status> statuses(num_keys);
|
|
1067
|
+
|
|
1068
|
+
db->MultiGetEntity(ReadOptions(), db->DefaultColumnFamily(), num_keys,
|
|
1069
|
+
&key_slices[0], &results[0], &statuses[0]);
|
|
1070
|
+
|
|
1071
|
+
for (size_t i = 0; i < num_keys; ++i) {
|
|
1072
|
+
ASSERT_OK(statuses[i]);
|
|
1073
|
+
}
|
|
1074
|
+
|
|
1075
|
+
ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count,
|
|
1076
|
+
total_merges);
|
|
1077
|
+
|
|
1078
|
+
get_perf_context()->Reset();
|
|
1079
|
+
}
|
|
1080
|
+
}
|
|
1081
|
+
|
|
1082
|
+
std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
|
|
1083
|
+
|
|
1084
|
+
// Forward iteration
|
|
1085
|
+
{
|
|
1086
|
+
size_t i = 0;
|
|
1087
|
+
|
|
1088
|
+
for (it->SeekToFirst(); it->Valid(); it->Next(), ++i) {
|
|
1089
|
+
ASSERT_EQ(it->key(), keys[i]);
|
|
1090
|
+
ASSERT_EQ(get_perf_context()->internal_merge_count, i + 1);
|
|
1091
|
+
|
|
1092
|
+
get_perf_context()->Reset();
|
|
1093
|
+
}
|
|
1094
|
+
}
|
|
1095
|
+
|
|
1096
|
+
// Backward iteration
|
|
1097
|
+
{
|
|
1098
|
+
size_t i = num_keys - 1;
|
|
1099
|
+
|
|
1100
|
+
for (it->SeekToLast(); it->Valid(); it->Prev(), --i) {
|
|
1101
|
+
ASSERT_EQ(it->key(), keys[i]);
|
|
1102
|
+
ASSERT_EQ(get_perf_context()->internal_merge_count, i + 1);
|
|
1103
|
+
|
|
1104
|
+
get_perf_context()->Reset();
|
|
1105
|
+
}
|
|
1106
|
+
}
|
|
1107
|
+
};
|
|
1108
|
+
|
|
1109
|
+
// Verify counters when reading from memtable
|
|
1110
|
+
verify();
|
|
1111
|
+
|
|
1112
|
+
// Verify counters when reading from table files
|
|
1113
|
+
db->Flush(FlushOptions());
|
|
1114
|
+
|
|
1115
|
+
verify();
|
|
1116
|
+
}
|
|
1117
|
+
|
|
967
1118
|
} // namespace ROCKSDB_NAMESPACE
|
|
968
1119
|
|
|
969
1120
|
int main(int argc, char** argv) {
|
|
@@ -30,12 +30,15 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
|
|
|
30
30
|
icmp_(icmp),
|
|
31
31
|
smallest_ikey_(smallest),
|
|
32
32
|
largest_ikey_(largest) {
|
|
33
|
+
// Set up bounds such that range tombstones from this iterator are
|
|
34
|
+
// truncated to range [smallest_, largest_).
|
|
33
35
|
if (smallest != nullptr) {
|
|
34
36
|
pinned_bounds_.emplace_back();
|
|
35
37
|
auto& parsed_smallest = pinned_bounds_.back();
|
|
36
38
|
Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
|
|
37
39
|
false /* log_err_key */); // TODO
|
|
38
40
|
pik_status.PermitUncheckedError();
|
|
41
|
+
parsed_smallest.type = kTypeMaxValid;
|
|
39
42
|
assert(pik_status.ok());
|
|
40
43
|
smallest_ = &parsed_smallest;
|
|
41
44
|
}
|
|
@@ -63,6 +66,8 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
|
|
|
63
66
|
//
|
|
64
67
|
// Therefore, we will never truncate a range tombstone at largest, so we
|
|
65
68
|
// can leave it unchanged.
|
|
69
|
+
// TODO: maybe use kTypeMaxValid here to ensure range tombstones have
|
|
70
|
+
// keys distinct from point keys.
|
|
66
71
|
} else {
|
|
67
72
|
// The same user key may straddle two sstable boundaries. To ensure that
|
|
68
73
|
// the truncated end key can cover the largest key in this sstable, reduce
|
|
@@ -70,7 +75,7 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
|
|
|
70
75
|
parsed_largest.sequence -= 1;
|
|
71
76
|
// This line is not needed for correctness, but it ensures that the
|
|
72
77
|
// truncated end key is not covering keys from the next SST file.
|
|
73
|
-
parsed_largest.type =
|
|
78
|
+
parsed_largest.type = kTypeMaxValid;
|
|
74
79
|
}
|
|
75
80
|
largest_ = &parsed_largest;
|
|
76
81
|
}
|
|
@@ -101,6 +106,24 @@ void TruncatedRangeDelIterator::Seek(const Slice& target) {
|
|
|
101
106
|
iter_->Seek(target);
|
|
102
107
|
}
|
|
103
108
|
|
|
109
|
+
void TruncatedRangeDelIterator::SeekInternalKey(const Slice& target) {
|
|
110
|
+
if (largest_ && icmp_->Compare(*largest_, target) <= 0) {
|
|
111
|
+
iter_->Invalidate();
|
|
112
|
+
return;
|
|
113
|
+
}
|
|
114
|
+
if (smallest_ && icmp_->Compare(target, *smallest_) < 0) {
|
|
115
|
+
// Since target < smallest_, target < largest_.
|
|
116
|
+
// This seek must land on a range tombstone where end_key() > target,
|
|
117
|
+
// so there is no need to check again.
|
|
118
|
+
iter_->Seek(smallest_->user_key);
|
|
119
|
+
} else {
|
|
120
|
+
iter_->Seek(ExtractUserKey(target));
|
|
121
|
+
while (Valid() && icmp_->Compare(end_key(), target) <= 0) {
|
|
122
|
+
Next();
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
104
127
|
// NOTE: target is a user key, with timestamp if enabled.
|
|
105
128
|
void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
|
|
106
129
|
if (smallest_ != nullptr &&
|
|
@@ -393,21 +416,20 @@ bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
|
|
|
393
416
|
namespace {
|
|
394
417
|
|
|
395
418
|
// Produce a sorted (by start internal key) stream of range tombstones from
|
|
396
|
-
// `children`. lower_bound and upper_bound on
|
|
419
|
+
// `children`. lower_bound and upper_bound on internal key can be
|
|
397
420
|
// optionally specified. Range tombstones that end before lower_bound or start
|
|
398
421
|
// after upper_bound are excluded.
|
|
399
422
|
// If user-defined timestamp is enabled, lower_bound and upper_bound should
|
|
400
|
-
// contain timestamp
|
|
423
|
+
// contain timestamp.
|
|
401
424
|
class TruncatedRangeDelMergingIter : public InternalIterator {
|
|
402
425
|
public:
|
|
403
426
|
TruncatedRangeDelMergingIter(
|
|
404
427
|
const InternalKeyComparator* icmp, const Slice* lower_bound,
|
|
405
|
-
const Slice* upper_bound,
|
|
428
|
+
const Slice* upper_bound,
|
|
406
429
|
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
|
|
407
430
|
: icmp_(icmp),
|
|
408
431
|
lower_bound_(lower_bound),
|
|
409
432
|
upper_bound_(upper_bound),
|
|
410
|
-
upper_bound_inclusive_(upper_bound_inclusive),
|
|
411
433
|
heap_(StartKeyMinComparator(icmp)),
|
|
412
434
|
ts_sz_(icmp_->user_comparator()->timestamp_size()) {
|
|
413
435
|
for (auto& child : children) {
|
|
@@ -420,7 +442,7 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
|
|
|
420
442
|
}
|
|
421
443
|
|
|
422
444
|
bool Valid() const override {
|
|
423
|
-
return !heap_.empty() &&
|
|
445
|
+
return !heap_.empty() && !AfterEndKey(heap_.top());
|
|
424
446
|
}
|
|
425
447
|
Status status() const override { return Status::OK(); }
|
|
426
448
|
|
|
@@ -428,7 +450,13 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
|
|
|
428
450
|
heap_.clear();
|
|
429
451
|
for (auto& child : children_) {
|
|
430
452
|
if (lower_bound_ != nullptr) {
|
|
431
|
-
child->Seek(*lower_bound_);
|
|
453
|
+
child->Seek(ExtractUserKey(*lower_bound_));
|
|
454
|
+
// Since the above `Seek()` operates on a user key while `lower_bound_`
|
|
455
|
+
// is an internal key, we may need to advance `child` farther for it to
|
|
456
|
+
// be in bounds.
|
|
457
|
+
while (child->Valid() && BeforeStartKey(child)) {
|
|
458
|
+
child->InternalNext();
|
|
459
|
+
}
|
|
432
460
|
} else {
|
|
433
461
|
child->SeekToFirst();
|
|
434
462
|
}
|
|
@@ -481,19 +509,23 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
|
|
|
481
509
|
void SeekToLast() override { assert(false); }
|
|
482
510
|
|
|
483
511
|
private:
|
|
484
|
-
bool
|
|
512
|
+
bool BeforeStartKey(const TruncatedRangeDelIterator* iter) const {
|
|
513
|
+
if (lower_bound_ == nullptr) {
|
|
514
|
+
return false;
|
|
515
|
+
}
|
|
516
|
+
return icmp_->Compare(iter->end_key(), *lower_bound_) <= 0;
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
bool AfterEndKey(const TruncatedRangeDelIterator* iter) const {
|
|
485
520
|
if (upper_bound_ == nullptr) {
|
|
486
|
-
return
|
|
521
|
+
return false;
|
|
487
522
|
}
|
|
488
|
-
|
|
489
|
-
iter->start_key().user_key, *upper_bound_);
|
|
490
|
-
return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
|
|
523
|
+
return icmp_->Compare(iter->start_key(), *upper_bound_) > 0;
|
|
491
524
|
}
|
|
492
525
|
|
|
493
526
|
const InternalKeyComparator* icmp_;
|
|
494
527
|
const Slice* lower_bound_;
|
|
495
528
|
const Slice* upper_bound_;
|
|
496
|
-
bool upper_bound_inclusive_;
|
|
497
529
|
BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
|
|
498
530
|
std::vector<TruncatedRangeDelIterator*> children_;
|
|
499
531
|
|
|
@@ -506,11 +538,10 @@ class TruncatedRangeDelMergingIter : public InternalIterator {
|
|
|
506
538
|
|
|
507
539
|
std::unique_ptr<FragmentedRangeTombstoneIterator>
|
|
508
540
|
CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
|
|
509
|
-
const Slice* upper_bound
|
|
510
|
-
bool upper_bound_inclusive) {
|
|
541
|
+
const Slice* upper_bound) {
|
|
511
542
|
InvalidateRangeDelMapPositions();
|
|
512
543
|
auto merging_iter = std::make_unique<TruncatedRangeDelMergingIter>(
|
|
513
|
-
icmp_, lower_bound, upper_bound,
|
|
544
|
+
icmp_, lower_bound, upper_bound, parent_iters_);
|
|
514
545
|
|
|
515
546
|
auto fragmented_tombstone_list =
|
|
516
547
|
std::make_shared<FragmentedRangeTombstoneList>(
|
|
@@ -49,6 +49,9 @@ class TruncatedRangeDelIterator {
|
|
|
49
49
|
// REQUIRES: target is a user key.
|
|
50
50
|
void Seek(const Slice& target);
|
|
51
51
|
|
|
52
|
+
// Seeks to the first range tombstone with end_key() > target.
|
|
53
|
+
void SeekInternalKey(const Slice& target);
|
|
54
|
+
|
|
52
55
|
// Seeks to the tombstone with the highest visible sequence number that covers
|
|
53
56
|
// target (a user key). If no such tombstone exists, the position will be at
|
|
54
57
|
// the latest tombstone that starts before target.
|
|
@@ -452,16 +455,15 @@ class CompactionRangeDelAggregator : public RangeDelAggregator {
|
|
|
452
455
|
}
|
|
453
456
|
|
|
454
457
|
// Creates an iterator over all the range tombstones in the aggregator, for
|
|
455
|
-
// use in compaction.
|
|
456
|
-
//
|
|
457
|
-
// NOTE: the boundaries are used for optimization purposes to
|
|
458
|
-
// number of tombstones that are passed to the fragmenter; they do
|
|
459
|
-
// guarantee that the resulting iterator only contains range tombstones
|
|
460
|
-
// cover keys in the provided range. If required, these bounds must be
|
|
458
|
+
// use in compaction.
|
|
459
|
+
//
|
|
460
|
+
// NOTE: the internal key boundaries are used for optimization purposes to
|
|
461
|
+
// reduce the number of tombstones that are passed to the fragmenter; they do
|
|
462
|
+
// not guarantee that the resulting iterator only contains range tombstones
|
|
463
|
+
// that cover keys in the provided range. If required, these bounds must be
|
|
461
464
|
// enforced during iteration.
|
|
462
465
|
std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
|
|
463
|
-
const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr
|
|
464
|
-
bool upper_bound_inclusive = false);
|
|
466
|
+
const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr);
|
|
465
467
|
|
|
466
468
|
private:
|
|
467
469
|
std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
|