@nxtedition/rocksdb 13.1.4 → 13.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +43 -16
- package/deps/rocksdb/rocksdb/{TARGETS → BUCK} +27 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -1
- package/deps/rocksdb/rocksdb/Makefile +2 -2
- package/deps/rocksdb/rocksdb/cache/cache.cc +3 -1
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
- package/deps/rocksdb/rocksdb/db/attribute_group_iterator_impl.h +34 -9
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +7 -6
- package/deps/rocksdb/rocksdb/db/blob/blob_source.h +5 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +22 -14
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
- package/deps/rocksdb/rocksdb/db/builder.cc +13 -24
- package/deps/rocksdb/rocksdb/db/coalescing_iterator.h +35 -10
- package/deps/rocksdb/rocksdb/db/column_family.cc +21 -10
- package/deps/rocksdb/rocksdb/db/column_family.h +15 -8
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +98 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +126 -16
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +51 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -8
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +24 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +52 -22
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +9 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +36 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +6 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +30 -17
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +26 -23
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +43 -33
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +6 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +19 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +6 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +632 -411
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +171 -51
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +7 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +37 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +51 -11
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +10 -3
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +350 -154
- package/deps/rocksdb/rocksdb/db/convenience.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +62 -27
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +68 -1
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +91 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +134 -70
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +71 -23
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -16
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +47 -33
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +27 -19
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +38 -25
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +7 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +258 -42
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +161 -9
- package/deps/rocksdb/rocksdb/db/db_iter.cc +118 -86
- package/deps/rocksdb/rocksdb/db/db_iter.h +44 -17
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +27 -6
- package/deps/rocksdb/rocksdb/db/db_test.cc +48 -16
- package/deps/rocksdb/rocksdb/db/db_test2.cc +60 -15
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +97 -44
- package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -1
- package/deps/rocksdb/rocksdb/db/dbformat.cc +15 -5
- package/deps/rocksdb/rocksdb/db/dbformat.h +137 -55
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +54 -0
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +663 -8
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +152 -91
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +134 -11
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +55 -9
- package/deps/rocksdb/rocksdb/db/flush_job.cc +52 -29
- package/deps/rocksdb/rocksdb/db/flush_job.h +5 -3
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +18 -12
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +23 -29
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +3 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +9 -6
- package/deps/rocksdb/rocksdb/db/internal_stats.h +54 -0
- package/deps/rocksdb/rocksdb/db/job_context.h +1 -1
- package/deps/rocksdb/rocksdb/db/log_reader.cc +6 -7
- package/deps/rocksdb/rocksdb/db/manifest_ops.cc +47 -0
- package/deps/rocksdb/rocksdb/db/manifest_ops.h +20 -0
- package/deps/rocksdb/rocksdb/db/memtable.cc +165 -64
- package/deps/rocksdb/rocksdb/db/memtable.h +422 -243
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +99 -68
- package/deps/rocksdb/rocksdb/db/memtable_list.h +63 -38
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +28 -25
- package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +118 -60
- package/deps/rocksdb/rocksdb/db/multi_cf_iterator_test.cc +344 -89
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +2 -3
- package/deps/rocksdb/rocksdb/db/repair.cc +15 -14
- package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -13
- package/deps/rocksdb/rocksdb/db/snapshot_checker.h +7 -0
- package/deps/rocksdb/rocksdb/db/table_cache.cc +62 -65
- package/deps/rocksdb/rocksdb/db/table_cache.h +70 -76
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +5 -6
- package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +8 -7
- package/deps/rocksdb/rocksdb/db/version_builder.cc +17 -19
- package/deps/rocksdb/rocksdb/db/version_builder.h +13 -12
- package/deps/rocksdb/rocksdb/db/version_edit.h +30 -0
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +3 -5
- package/deps/rocksdb/rocksdb/db/version_set.cc +89 -129
- package/deps/rocksdb/rocksdb/db/version_set.h +12 -4
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -2
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +12 -8
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +0 -15
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -2
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +9 -7
- package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +0 -8
- package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +28 -2
- package/deps/rocksdb/rocksdb/db/write_batch.cc +32 -10
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +9 -0
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/write_thread.cc +3 -1
- package/deps/rocksdb/rocksdb/db/write_thread.h +6 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +15 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +100 -22
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +34 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +223 -78
- package/deps/rocksdb/rocksdb/env/file_system.cc +6 -1
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +53 -0
- package/deps/rocksdb/rocksdb/env/io_posix.cc +63 -17
- package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +132 -48
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +92 -24
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +727 -109
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +3 -4
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +1 -1
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/attribute_groups.h +20 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +9 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +9 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +10 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +34 -37
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +21 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +56 -28
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +36 -28
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +11 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +84 -60
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +102 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +89 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +32 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +30 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +23 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -0
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +79 -21
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +41 -18
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +1 -5
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +169 -0
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +400 -0
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -0
- package/deps/rocksdb/rocksdb/options/cf_options.cc +137 -82
- package/deps/rocksdb/rocksdb/options/cf_options.h +18 -6
- package/deps/rocksdb/rocksdb/options/configurable.cc +31 -17
- package/deps/rocksdb/rocksdb/options/configurable_helper.h +7 -6
- package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -8
- package/deps/rocksdb/rocksdb/options/options_parser.cc +74 -54
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +89 -0
- package/deps/rocksdb/rocksdb/options/options_test.cc +112 -26
- package/deps/rocksdb/rocksdb/port/port.h +5 -9
- package/deps/rocksdb/rocksdb/src.mk +8 -0
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +4 -0
- package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -7
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +62 -80
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +13 -3
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +16 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +38 -7
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +204 -1
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +4 -0
- package/deps/rocksdb/rocksdb/table/format.cc +3 -3
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +4 -1
- package/deps/rocksdb/rocksdb/table/mock_table.cc +0 -50
- package/deps/rocksdb/rocksdb/table/mock_table.h +53 -0
- package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +4 -0
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -5
- package/deps/rocksdb/rocksdb/table/table_builder.h +3 -1
- package/deps/rocksdb/rocksdb/table/table_properties.cc +181 -0
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +5 -5
- package/deps/rocksdb/rocksdb/table/table_test.cc +71 -64
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +45 -45
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +35 -35
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +43 -43
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +41 -4
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +1 -0
- package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +1 -1
- package/deps/rocksdb/rocksdb/unreleased_history/add.sh +13 -0
- package/deps/rocksdb/rocksdb/util/aligned_buffer.h +24 -5
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +7 -0
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +0 -52
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +1 -10
- package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +92 -0
- package/deps/rocksdb/rocksdb/util/thread_operation.h +1 -0
- package/deps/rocksdb/rocksdb/util/udt_util.cc +50 -4
- package/deps/rocksdb/rocksdb/util/udt_util.h +24 -11
- package/deps/rocksdb/rocksdb/util/udt_util_test.cc +26 -13
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +1 -16
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +2 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +214 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.h +60 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +124 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +441 -0
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +34 -3
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +7 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +437 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +34 -11
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +14 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +7 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +17 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +69 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +20 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1290 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +324 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +8 -1
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +57 -12
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +32 -3
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +33 -2
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +721 -9
- package/deps/rocksdb/rocksdb.gyp +2 -0
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
|
|
19
19
|
#include "db/dbformat.h"
|
|
20
20
|
#include "db/kv_checksum.h"
|
|
21
|
+
#include "db/merge_helper.h"
|
|
21
22
|
#include "db/range_tombstone_fragmenter.h"
|
|
22
23
|
#include "db/read_callback.h"
|
|
23
24
|
#include "db/seqno_to_time_mapping.h"
|
|
@@ -76,88 +77,48 @@ struct MemTablePostProcessInfo {
|
|
|
76
77
|
};
|
|
77
78
|
|
|
78
79
|
using MultiGetRange = MultiGetContext::Range;
|
|
79
|
-
|
|
80
|
+
|
|
81
|
+
// For each CF, rocksdb maintains an active memtable that accepts writes,
|
|
82
|
+
// and zero or more sealed memtables that we call immutable memtables.
|
|
83
|
+
// This interface contains all methods required for immutable memtables.
|
|
84
|
+
// MemTable class inherits from `ReadOnlyMemTable` and implements additional
|
|
85
|
+
// methods required for active memtables.
|
|
86
|
+
// Immutable memtable list (MemTableList) maintains a list of ReadOnlyMemTable
|
|
87
|
+
// objects. This interface enables features like direct ingestion of an
|
|
88
|
+
// immutable memtable with custom implementation, bypassing memtable writes.
|
|
89
|
+
//
|
|
90
|
+
// Note: Many of the methods in this class have comments indicating that
|
|
80
91
|
// external synchronization is required as these methods are not thread-safe.
|
|
81
92
|
// It is up to higher layers of code to decide how to prevent concurrent
|
|
82
|
-
// invocation of these methods.
|
|
93
|
+
// invocation of these methods. This is usually done by acquiring either
|
|
83
94
|
// the db mutex or the single writer thread.
|
|
84
95
|
//
|
|
85
96
|
// Some of these methods are documented to only require external
|
|
86
|
-
// synchronization if this memtable is immutable.
|
|
97
|
+
// synchronization if this memtable is immutable. Calling MarkImmutable() is
|
|
87
98
|
// not sufficient to guarantee immutability. It is up to higher layers of
|
|
88
99
|
// code to determine if this MemTable can still be modified by other threads.
|
|
89
100
|
// Eg: The Superversion stores a pointer to the current MemTable (that can
|
|
90
101
|
// be modified) and a separate list of the MemTables that can no longer be
|
|
91
102
|
// written to (aka the 'immutable memtables').
|
|
92
|
-
|
|
103
|
+
//
|
|
104
|
+
// MemTables are reference counted. The initial reference count
|
|
105
|
+
// is zero and the caller must call Ref() at least once.
|
|
106
|
+
class ReadOnlyMemTable {
|
|
93
107
|
public:
|
|
94
|
-
struct KeyComparator : public MemTableRep::KeyComparator {
|
|
95
|
-
const InternalKeyComparator comparator;
|
|
96
|
-
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
|
|
97
|
-
int operator()(const char* prefix_len_key1,
|
|
98
|
-
const char* prefix_len_key2) const override;
|
|
99
|
-
int operator()(const char* prefix_len_key,
|
|
100
|
-
const DecodedType& key) const override;
|
|
101
|
-
};
|
|
102
|
-
|
|
103
|
-
// MemTables are reference counted. The initial reference count
|
|
104
|
-
// is zero and the caller must call Ref() at least once.
|
|
105
|
-
//
|
|
106
|
-
// earliest_seq should be the current SequenceNumber in the db such that any
|
|
107
|
-
// key inserted into this memtable will have an equal or larger seq number.
|
|
108
|
-
// (When a db is first created, the earliest sequence number will be 0).
|
|
109
|
-
// If the earliest sequence number is not known, kMaxSequenceNumber may be
|
|
110
|
-
// used, but this may prevent some transactions from succeeding until the
|
|
111
|
-
// first key is inserted into the memtable.
|
|
112
|
-
explicit MemTable(const InternalKeyComparator& comparator,
|
|
113
|
-
const ImmutableOptions& ioptions,
|
|
114
|
-
const MutableCFOptions& mutable_cf_options,
|
|
115
|
-
WriteBufferManager* write_buffer_manager,
|
|
116
|
-
SequenceNumber earliest_seq, uint32_t column_family_id);
|
|
117
|
-
// No copying allowed
|
|
118
|
-
MemTable(const MemTable&) = delete;
|
|
119
|
-
MemTable& operator=(const MemTable&) = delete;
|
|
120
|
-
|
|
121
108
|
// Do not delete this MemTable unless Unref() indicates it is not in use.
|
|
122
|
-
~
|
|
109
|
+
virtual ~ReadOnlyMemTable() = default;
|
|
123
110
|
|
|
124
|
-
|
|
125
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
126
|
-
// operations on the same MemTable.
|
|
127
|
-
void Ref() { ++refs_; }
|
|
128
|
-
|
|
129
|
-
// Drop reference count.
|
|
130
|
-
// If the refcount goes to zero return this memtable, otherwise return null.
|
|
131
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
132
|
-
// operations on the same MemTable.
|
|
133
|
-
MemTable* Unref() {
|
|
134
|
-
--refs_;
|
|
135
|
-
assert(refs_ >= 0);
|
|
136
|
-
if (refs_ <= 0) {
|
|
137
|
-
return this;
|
|
138
|
-
}
|
|
139
|
-
return nullptr;
|
|
140
|
-
}
|
|
111
|
+
virtual const char* Name() const = 0;
|
|
141
112
|
|
|
142
113
|
// Returns an estimate of the number of bytes of data in use by this
|
|
143
114
|
// data structure.
|
|
144
115
|
//
|
|
145
116
|
// REQUIRES: external synchronization to prevent simultaneous
|
|
146
117
|
// operations on the same MemTable (unless this Memtable is immutable).
|
|
147
|
-
size_t ApproximateMemoryUsage();
|
|
148
|
-
|
|
149
|
-
// As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
|
|
150
|
-
// require external synchronization. The value may be less accurate though
|
|
151
|
-
size_t ApproximateMemoryUsageFast() const {
|
|
152
|
-
return approximate_memory_usage_.load(std::memory_order_relaxed);
|
|
153
|
-
}
|
|
118
|
+
virtual size_t ApproximateMemoryUsage() = 0;
|
|
154
119
|
|
|
155
120
|
// used by MemTableListVersion::MemoryAllocatedBytesExcludingLast
|
|
156
|
-
size_t MemoryAllocatedBytes() const
|
|
157
|
-
return table_->ApproximateMemoryUsage() +
|
|
158
|
-
range_del_table_->ApproximateMemoryUsage() +
|
|
159
|
-
arena_.MemoryAllocatedBytes();
|
|
160
|
-
}
|
|
121
|
+
virtual size_t MemoryAllocatedBytes() const = 0;
|
|
161
122
|
|
|
162
123
|
// Returns a vector of unique random memtable entries of size 'sample_size'.
|
|
163
124
|
//
|
|
@@ -172,27 +133,8 @@ class MemTable {
|
|
|
172
133
|
// REQUIRES: SkipList memtable representation. This function is not
|
|
173
134
|
// implemented for any other type of memtable representation (vectorrep,
|
|
174
135
|
// hashskiplist,...).
|
|
175
|
-
void UniqueRandomSample(const uint64_t& target_sample_size,
|
|
176
|
-
|
|
177
|
-
// TODO(bjlemaire): at the moment, only supported by skiplistrep.
|
|
178
|
-
// Extend it to all other memtable representations.
|
|
179
|
-
table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
// This method heuristically determines if the memtable should continue to
|
|
183
|
-
// host more data.
|
|
184
|
-
bool ShouldScheduleFlush() const {
|
|
185
|
-
return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
// Returns true if a flush should be scheduled and the caller should
|
|
189
|
-
// be the one to schedule it
|
|
190
|
-
bool MarkFlushScheduled() {
|
|
191
|
-
auto before = FLUSH_REQUESTED;
|
|
192
|
-
return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
|
|
193
|
-
std::memory_order_relaxed,
|
|
194
|
-
std::memory_order_relaxed);
|
|
195
|
-
}
|
|
136
|
+
virtual void UniqueRandomSample(const uint64_t& target_sample_size,
|
|
137
|
+
std::unordered_set<const char*>* entries) = 0;
|
|
196
138
|
|
|
197
139
|
// Return an iterator that yields the contents of the memtable.
|
|
198
140
|
//
|
|
@@ -208,10 +150,18 @@ class MemTable {
|
|
|
208
150
|
// those allocated in arena.
|
|
209
151
|
// seqno_to_time_mapping: it's used to support return write unix time for the
|
|
210
152
|
// data, currently only needed for iterators serving user reads.
|
|
211
|
-
InternalIterator* NewIterator(
|
|
153
|
+
virtual InternalIterator* NewIterator(
|
|
212
154
|
const ReadOptions& read_options,
|
|
213
155
|
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
|
|
214
|
-
const SliceTransform* prefix_extractor);
|
|
156
|
+
const SliceTransform* prefix_extractor, bool for_flush) = 0;
|
|
157
|
+
|
|
158
|
+
// Returns an iterator that wraps a MemTableIterator and logically strips the
|
|
159
|
+
// user-defined timestamp of each key. This API is only used by flush when
|
|
160
|
+
// user-defined timestamps in MemTable only feature is enabled.
|
|
161
|
+
virtual InternalIterator* NewTimestampStrippingIterator(
|
|
162
|
+
const ReadOptions& read_options,
|
|
163
|
+
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
|
|
164
|
+
const SliceTransform* prefix_extractor, size_t ts_sz) = 0;
|
|
215
165
|
|
|
216
166
|
// Returns an iterator that yields the range tombstones of the memtable.
|
|
217
167
|
// The caller must ensure that the underlying MemTable remains live
|
|
@@ -223,31 +173,23 @@ class MemTable {
|
|
|
223
173
|
// is constructed when a memtable becomes immutable. Setting the flag to false
|
|
224
174
|
// will always yield correct result, but may incur performance penalty as it
|
|
225
175
|
// always creates a new fragmented range tombstone list.
|
|
226
|
-
FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
|
|
176
|
+
virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
|
|
227
177
|
const ReadOptions& read_options, SequenceNumber read_seq,
|
|
228
|
-
bool immutable_memtable);
|
|
178
|
+
bool immutable_memtable) = 0;
|
|
229
179
|
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
//
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
// simultaneous operations on the same MemTable.
|
|
239
|
-
//
|
|
240
|
-
// Returns `Status::TryAgain` if the `seq`, `key` combination already exists
|
|
241
|
-
// in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
|
|
242
|
-
// The next attempt should try a larger value for `seq`.
|
|
243
|
-
Status Add(SequenceNumber seq, ValueType type, const Slice& key,
|
|
244
|
-
const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
|
|
245
|
-
bool allow_concurrent = false,
|
|
246
|
-
MemTablePostProcessInfo* post_process_info = nullptr,
|
|
247
|
-
void** hint = nullptr);
|
|
180
|
+
// Returns an iterator that yields the range tombstones of the memtable and
|
|
181
|
+
// logically strips the user-defined timestamp of each key (including start
|
|
182
|
+
// key, and end key). This API is only used by flush when user-defined
|
|
183
|
+
// timestamps in MemTable only feature is enabled.
|
|
184
|
+
virtual FragmentedRangeTombstoneIterator*
|
|
185
|
+
NewTimestampStrippingRangeTombstoneIterator(const ReadOptions& read_options,
|
|
186
|
+
SequenceNumber read_seq,
|
|
187
|
+
size_t ts_sz) = 0;
|
|
248
188
|
|
|
249
189
|
// Used to Get value associated with key or Get Merge Operands associated
|
|
250
190
|
// with key.
|
|
191
|
+
// Keys are considered if they are no larger than the parameter `key` in
|
|
192
|
+
// the order defined by comparator and share the same user key with `key`.
|
|
251
193
|
// If do_merge = true the default behavior which is Get value for key is
|
|
252
194
|
// executed. Expected behavior is described right below.
|
|
253
195
|
// If memtable contains a value for key, store it in *value and return true.
|
|
@@ -276,14 +218,13 @@ class MemTable {
|
|
|
276
218
|
// @param immutable_memtable Whether this memtable is immutable. Used
|
|
277
219
|
// internally by NewRangeTombstoneIterator(). See comment above
|
|
278
220
|
// NewRangeTombstoneIterator() for more detail.
|
|
279
|
-
bool Get(const LookupKey& key, std::string* value,
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
221
|
+
virtual bool Get(const LookupKey& key, std::string* value,
|
|
222
|
+
PinnableWideColumns* columns, std::string* timestamp,
|
|
223
|
+
Status* s, MergeContext* merge_context,
|
|
224
|
+
SequenceNumber* max_covering_tombstone_seq,
|
|
225
|
+
SequenceNumber* seq, const ReadOptions& read_opts,
|
|
226
|
+
bool immutable_memtable, ReadCallback* callback = nullptr,
|
|
227
|
+
bool* is_blob_index = nullptr, bool do_merge = true) = 0;
|
|
287
228
|
bool Get(const LookupKey& key, std::string* value,
|
|
288
229
|
PinnableWideColumns* columns, std::string* timestamp, Status* s,
|
|
289
230
|
MergeContext* merge_context,
|
|
@@ -300,8 +241,351 @@ class MemTable {
|
|
|
300
241
|
// @param immutable_memtable Whether this memtable is immutable. Used
|
|
301
242
|
// internally by NewRangeTombstoneIterator(). See comment above
|
|
302
243
|
// NewRangeTombstoneIterator() for more detail.
|
|
244
|
+
virtual void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
245
|
+
ReadCallback* callback, bool immutable_memtable) = 0;
|
|
246
|
+
|
|
247
|
+
// Get total number of entries in the mem table.
|
|
248
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
249
|
+
// operations on the same MemTable (unless this Memtable is immutable).
|
|
250
|
+
virtual uint64_t NumEntries() const = 0;
|
|
251
|
+
|
|
252
|
+
// Get total number of point deletes in the mem table.
|
|
253
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
254
|
+
// operations on the same MemTable (unless this Memtable is immutable).
|
|
255
|
+
virtual uint64_t NumDeletion() const = 0;
|
|
256
|
+
|
|
257
|
+
// Get total number of range deletions in the mem table.
|
|
258
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
259
|
+
// operations on the same MemTable (unless this Memtable is immutable).
|
|
260
|
+
virtual uint64_t NumRangeDeletion() const = 0;
|
|
261
|
+
|
|
262
|
+
virtual uint64_t GetDataSize() const = 0;
|
|
263
|
+
|
|
264
|
+
// Returns the sequence number of the first element that was inserted
|
|
265
|
+
// into the memtable.
|
|
266
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
267
|
+
// operations on the same MemTable (unless this Memtable is immutable).
|
|
268
|
+
virtual SequenceNumber GetFirstSequenceNumber() = 0;
|
|
269
|
+
|
|
270
|
+
// Returns whether there is no entry inserted into the mem table.
|
|
271
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
272
|
+
// operations on the same MemTable (unless this Memtable is immutable).
|
|
273
|
+
virtual bool IsEmpty() const = 0;
|
|
274
|
+
|
|
275
|
+
// Returns the sequence number that is guaranteed to be smaller than or equal
|
|
276
|
+
// to the sequence number of any key that could be inserted into this
|
|
277
|
+
// memtable. It can then be assumed that any write with a larger(or equal)
|
|
278
|
+
// sequence number will be present in this memtable or a later memtable.
|
|
279
|
+
//
|
|
280
|
+
// If the earliest sequence number could not be determined,
|
|
281
|
+
// kMaxSequenceNumber will be returned.
|
|
282
|
+
virtual SequenceNumber GetEarliestSequenceNumber() = 0;
|
|
283
|
+
|
|
284
|
+
virtual uint64_t GetMinLogContainingPrepSection() = 0;
|
|
285
|
+
|
|
286
|
+
// Notify the underlying storage that no more items will be added.
|
|
287
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
288
|
+
// operations on the same MemTable.
|
|
289
|
+
// After MarkImmutable() is called, you should not attempt to
|
|
290
|
+
// write anything to this MemTable. (I.e. do not call Add() or Update()).
|
|
291
|
+
virtual void MarkImmutable() = 0;
|
|
292
|
+
|
|
293
|
+
// Notify the underlying storage that all data it contained has been
|
|
294
|
+
// persisted.
|
|
295
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
296
|
+
// operations on the same MemTable.
|
|
297
|
+
virtual void MarkFlushed() = 0;
|
|
298
|
+
|
|
299
|
+
struct MemTableStats {
|
|
300
|
+
uint64_t size;
|
|
301
|
+
uint64_t count;
|
|
302
|
+
};
|
|
303
|
+
virtual MemTableStats ApproximateStats(const Slice& start_ikey,
|
|
304
|
+
const Slice& end_ikey) = 0;
|
|
305
|
+
|
|
306
|
+
virtual const InternalKeyComparator& GetInternalKeyComparator() const = 0;
|
|
307
|
+
|
|
308
|
+
virtual uint64_t ApproximateOldestKeyTime() const = 0;
|
|
309
|
+
|
|
310
|
+
// Returns whether a fragmented range tombstone list is already constructed
|
|
311
|
+
// for this memtable. It should be constructed right before a memtable is
|
|
312
|
+
// added to an immutable memtable list. Note that if a memtable does not have
|
|
313
|
+
// any range tombstone, then no range tombstone list will ever be constructed
|
|
314
|
+
// and true is returned in that case.
|
|
315
|
+
virtual bool IsFragmentedRangeTombstonesConstructed() const = 0;
|
|
316
|
+
|
|
317
|
+
// Get the newest user-defined timestamp contained in this MemTable. Check
|
|
318
|
+
// `newest_udt_` for what newer means. This method should only be invoked for
|
|
319
|
+
// a MemTable that has enabled the user-defined timestamp feature and set
|
|
320
|
+
// `persist_user_defined_timestamps` to false. The tracked newest UDT will be
|
|
321
|
+
// used by flush job in the background to help check the MemTable's
|
|
322
|
+
// eligibility for Flush.
|
|
323
|
+
virtual const Slice& GetNewestUDT() const = 0;
|
|
324
|
+
|
|
325
|
+
// Increase reference count.
|
|
326
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
327
|
+
// operations on the same MemTable.
|
|
328
|
+
void Ref() { ++refs_; }
|
|
329
|
+
|
|
330
|
+
// Drop reference count.
|
|
331
|
+
// If the refcount goes to zero return this memtable, otherwise return null.
|
|
332
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
333
|
+
// operations on the same MemTable.
|
|
334
|
+
ReadOnlyMemTable* Unref() {
|
|
335
|
+
--refs_;
|
|
336
|
+
assert(refs_ >= 0);
|
|
337
|
+
if (refs_ <= 0) {
|
|
338
|
+
return this;
|
|
339
|
+
}
|
|
340
|
+
return nullptr;
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
// Returns the edits area that is needed for flushing the memtable
|
|
344
|
+
VersionEdit* GetEdits() { return &edit_; }
|
|
345
|
+
|
|
346
|
+
// Returns the next active logfile number when this memtable is about to
|
|
347
|
+
// be flushed to storage
|
|
348
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
349
|
+
// operations on the same MemTable.
|
|
350
|
+
uint64_t GetNextLogNumber() const { return mem_next_logfile_number_; }
|
|
351
|
+
|
|
352
|
+
// Sets the next active logfile number when this memtable is about to
|
|
353
|
+
// be flushed to storage
|
|
354
|
+
// REQUIRES: external synchronization to prevent simultaneous
|
|
355
|
+
// operations on the same MemTable.
|
|
356
|
+
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
|
|
357
|
+
|
|
358
|
+
// REQUIRES: db_mutex held.
|
|
359
|
+
void SetID(uint64_t id) { id_ = id; }
|
|
360
|
+
|
|
361
|
+
uint64_t GetID() const { return id_; }
|
|
362
|
+
|
|
363
|
+
void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
|
|
364
|
+
|
|
365
|
+
uint64_t GetFileNumber() const { return file_number_; }
|
|
366
|
+
|
|
367
|
+
void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
|
|
368
|
+
|
|
369
|
+
void SetFlushInProgress(bool in_progress) {
|
|
370
|
+
flush_in_progress_ = in_progress;
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
|
|
374
|
+
flush_job_info_ = std::move(info);
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
|
|
378
|
+
return std::move(flush_job_info_);
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
static void HandleTypeValue(
|
|
382
|
+
const Slice& lookup_user_key, const Slice& value, bool value_pinned,
|
|
383
|
+
bool do_merge, bool merge_in_progress, MergeContext* merge_context,
|
|
384
|
+
const MergeOperator* merge_operator, SystemClock* clock,
|
|
385
|
+
Statistics* statistics, Logger* info_log, Status* s,
|
|
386
|
+
std::string* out_value, PinnableWideColumns* out_columns,
|
|
387
|
+
bool* is_blob_index) {
|
|
388
|
+
*s = Status::OK();
|
|
389
|
+
|
|
390
|
+
if (!do_merge) {
|
|
391
|
+
// Preserve the value with the goal of returning it as part of
|
|
392
|
+
// raw merge operands to the user
|
|
393
|
+
// TODO(yanqin) update MergeContext so that timestamps information
|
|
394
|
+
// can also be retained.
|
|
395
|
+
merge_context->PushOperand(value, value_pinned);
|
|
396
|
+
} else if (merge_in_progress) {
|
|
397
|
+
assert(do_merge);
|
|
398
|
+
// `op_failure_scope` (an output parameter) is not provided (set to
|
|
399
|
+
// nullptr) since a failure must be propagated regardless of its
|
|
400
|
+
// value.
|
|
401
|
+
if (out_value || out_columns) {
|
|
402
|
+
*s = MergeHelper::TimedFullMerge(
|
|
403
|
+
merge_operator, lookup_user_key, MergeHelper::kPlainBaseValue,
|
|
404
|
+
value, merge_context->GetOperands(), info_log, statistics, clock,
|
|
405
|
+
/* update_num_ops_stats */ true,
|
|
406
|
+
/* op_failure_scope */ nullptr, out_value, out_columns);
|
|
407
|
+
}
|
|
408
|
+
} else if (out_value) {
|
|
409
|
+
out_value->assign(value.data(), value.size());
|
|
410
|
+
} else if (out_columns) {
|
|
411
|
+
out_columns->SetPlainValue(value);
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
if (is_blob_index) {
|
|
415
|
+
*is_blob_index = false;
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
static void HandleTypeDeletion(
|
|
420
|
+
const Slice& lookup_user_key, bool merge_in_progress,
|
|
421
|
+
MergeContext* merge_context, const MergeOperator* merge_operator,
|
|
422
|
+
SystemClock* clock, Statistics* statistics, Logger* logger, Status* s,
|
|
423
|
+
std::string* out_value, PinnableWideColumns* out_columns) {
|
|
424
|
+
if (merge_in_progress) {
|
|
425
|
+
if (out_value || out_columns) {
|
|
426
|
+
// `op_failure_scope` (an output parameter) is not provided (set to
|
|
427
|
+
// nullptr) since a failure must be propagated regardless of its
|
|
428
|
+
// value.
|
|
429
|
+
*s = MergeHelper::TimedFullMerge(
|
|
430
|
+
merge_operator, lookup_user_key, MergeHelper::kNoBaseValue,
|
|
431
|
+
merge_context->GetOperands(), logger, statistics, clock,
|
|
432
|
+
/* update_num_ops_stats */ true,
|
|
433
|
+
/* op_failure_scope */ nullptr, out_value, out_columns);
|
|
434
|
+
} else {
|
|
435
|
+
// We have found a final value (a base deletion) and have newer
|
|
436
|
+
// merge operands that we do not intend to merge. Nothing remains
|
|
437
|
+
// to be done so assign status to OK.
|
|
438
|
+
*s = Status::OK();
|
|
439
|
+
}
|
|
440
|
+
} else {
|
|
441
|
+
*s = Status::NotFound();
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
protected:
|
|
446
|
+
friend class MemTableList;
|
|
447
|
+
|
|
448
|
+
int refs_{0};
|
|
449
|
+
|
|
450
|
+
// These are used to manage memtable flushes to storage
|
|
451
|
+
bool flush_in_progress_{false}; // started the flush
|
|
452
|
+
bool flush_completed_{false}; // finished the flush
|
|
453
|
+
uint64_t file_number_{0};
|
|
454
|
+
|
|
455
|
+
// The updates to be applied to the transaction log when this
|
|
456
|
+
// memtable is flushed to storage.
|
|
457
|
+
VersionEdit edit_;
|
|
458
|
+
|
|
459
|
+
// The log files earlier than this number can be deleted.
|
|
460
|
+
uint64_t mem_next_logfile_number_{0};
|
|
461
|
+
|
|
462
|
+
// Memtable id to track flush.
|
|
463
|
+
uint64_t id_ = 0;
|
|
464
|
+
|
|
465
|
+
// Sequence number of the atomic flush that is responsible for this memtable.
|
|
466
|
+
// The sequence number of atomic flush is a seq, such that no writes with
|
|
467
|
+
// sequence numbers greater than or equal to seq are flushed, while all
|
|
468
|
+
// writes with sequence number smaller than seq are flushed.
|
|
469
|
+
SequenceNumber atomic_flush_seqno_{kMaxSequenceNumber};
|
|
470
|
+
|
|
471
|
+
// Flush job info of the current memtable.
|
|
472
|
+
std::unique_ptr<FlushJobInfo> flush_job_info_;
|
|
473
|
+
};
|
|
474
|
+
|
|
475
|
+
class MemTable final : public ReadOnlyMemTable {
|
|
476
|
+
public:
|
|
477
|
+
struct KeyComparator final : public MemTableRep::KeyComparator {
|
|
478
|
+
const InternalKeyComparator comparator;
|
|
479
|
+
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
|
|
480
|
+
int operator()(const char* prefix_len_key1,
|
|
481
|
+
const char* prefix_len_key2) const override;
|
|
482
|
+
int operator()(const char* prefix_len_key,
|
|
483
|
+
const DecodedType& key) const override;
|
|
484
|
+
};
|
|
485
|
+
|
|
486
|
+
// earliest_seq should be the current SequenceNumber in the db such that any
|
|
487
|
+
// key inserted into this memtable will have an equal or larger seq number.
|
|
488
|
+
// (When a db is first created, the earliest sequence number will be 0).
|
|
489
|
+
// If the earliest sequence number is not known, kMaxSequenceNumber may be
|
|
490
|
+
// used, but this may prevent some transactions from succeeding until the
|
|
491
|
+
// first key is inserted into the memtable.
|
|
492
|
+
explicit MemTable(const InternalKeyComparator& comparator,
|
|
493
|
+
const ImmutableOptions& ioptions,
|
|
494
|
+
const MutableCFOptions& mutable_cf_options,
|
|
495
|
+
WriteBufferManager* write_buffer_manager,
|
|
496
|
+
SequenceNumber earliest_seq, uint32_t column_family_id);
|
|
497
|
+
// No copying allowed
|
|
498
|
+
MemTable(const MemTable&) = delete;
|
|
499
|
+
MemTable& operator=(const MemTable&) = delete;
|
|
500
|
+
|
|
501
|
+
~MemTable() override;
|
|
502
|
+
|
|
503
|
+
const char* Name() const override { return "MemTable"; }
|
|
504
|
+
|
|
505
|
+
size_t ApproximateMemoryUsage() override;
|
|
506
|
+
|
|
507
|
+
// As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
|
|
508
|
+
// require external synchronization. The value may be less accurate though.
|
|
509
|
+
size_t ApproximateMemoryUsageFast() const {
|
|
510
|
+
return approximate_memory_usage_.load(std::memory_order_relaxed);
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
size_t MemoryAllocatedBytes() const override {
|
|
514
|
+
return table_->ApproximateMemoryUsage() +
|
|
515
|
+
range_del_table_->ApproximateMemoryUsage() +
|
|
516
|
+
arena_.MemoryAllocatedBytes();
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
void UniqueRandomSample(const uint64_t& target_sample_size,
|
|
520
|
+
std::unordered_set<const char*>* entries) override {
|
|
521
|
+
// TODO(bjlemaire): at the moment, only supported by skiplistrep.
|
|
522
|
+
// Extend it to all other memtable representations.
|
|
523
|
+
table_->UniqueRandomSample(NumEntries(), target_sample_size, entries);
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
// This method heuristically determines if the memtable should continue to
|
|
527
|
+
// host more data.
|
|
528
|
+
bool ShouldScheduleFlush() const {
|
|
529
|
+
return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
// Returns true if a flush should be scheduled and the caller should
|
|
533
|
+
// be the one to schedule it
|
|
534
|
+
bool MarkFlushScheduled() {
|
|
535
|
+
auto before = FLUSH_REQUESTED;
|
|
536
|
+
return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
|
|
537
|
+
std::memory_order_relaxed,
|
|
538
|
+
std::memory_order_relaxed);
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
InternalIterator* NewIterator(
|
|
542
|
+
const ReadOptions& read_options,
|
|
543
|
+
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
|
|
544
|
+
const SliceTransform* prefix_extractor, bool for_flush) override;
|
|
545
|
+
|
|
546
|
+
InternalIterator* NewTimestampStrippingIterator(
|
|
547
|
+
const ReadOptions& read_options,
|
|
548
|
+
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
|
|
549
|
+
const SliceTransform* prefix_extractor, size_t ts_sz) override;
|
|
550
|
+
|
|
551
|
+
FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
|
|
552
|
+
const ReadOptions& read_options, SequenceNumber read_seq,
|
|
553
|
+
bool immutable_memtable) override;
|
|
554
|
+
|
|
555
|
+
FragmentedRangeTombstoneIterator* NewTimestampStrippingRangeTombstoneIterator(
|
|
556
|
+
const ReadOptions& read_options, SequenceNumber read_seq,
|
|
557
|
+
size_t ts_sz) override;
|
|
558
|
+
|
|
559
|
+
Status VerifyEncodedEntry(Slice encoded,
|
|
560
|
+
const ProtectionInfoKVOS64& kv_prot_info);
|
|
561
|
+
|
|
562
|
+
// Add an entry into memtable that maps key to value at the
|
|
563
|
+
// specified sequence number and with the specified type.
|
|
564
|
+
// Typically, value will be empty if type==kTypeDeletion.
|
|
565
|
+
//
|
|
566
|
+
// REQUIRES: if allow_concurrent = false, external synchronization to prevent
|
|
567
|
+
// simultaneous operations on the same MemTable.
|
|
568
|
+
//
|
|
569
|
+
// Returns `Status::TryAgain` if the `seq`, `key` combination already exists
|
|
570
|
+
// in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
|
|
571
|
+
// The next attempt should try a larger value for `seq`.
|
|
572
|
+
Status Add(SequenceNumber seq, ValueType type, const Slice& key,
|
|
573
|
+
const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
|
|
574
|
+
bool allow_concurrent = false,
|
|
575
|
+
MemTablePostProcessInfo* post_process_info = nullptr,
|
|
576
|
+
void** hint = nullptr);
|
|
577
|
+
|
|
578
|
+
using ReadOnlyMemTable::Get;
|
|
579
|
+
bool Get(const LookupKey& key, std::string* value,
|
|
580
|
+
PinnableWideColumns* columns, std::string* timestamp, Status* s,
|
|
581
|
+
MergeContext* merge_context,
|
|
582
|
+
SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
|
|
583
|
+
const ReadOptions& read_opts, bool immutable_memtable,
|
|
584
|
+
ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
|
|
585
|
+
bool do_merge = true) override;
|
|
586
|
+
|
|
303
587
|
void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
304
|
-
ReadCallback* callback, bool immutable_memtable);
|
|
588
|
+
ReadCallback* callback, bool immutable_memtable) override;
|
|
305
589
|
|
|
306
590
|
// If `key` exists in current memtable with type value_type and the existing
|
|
307
591
|
// value is at least as large as the new value, updates it in-place. Otherwise
|
|
@@ -357,28 +641,19 @@ class MemTable {
|
|
|
357
641
|
UpdateFlushState();
|
|
358
642
|
}
|
|
359
643
|
|
|
360
|
-
|
|
361
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
362
|
-
// operations on the same MemTable (unless this Memtable is immutable).
|
|
363
|
-
uint64_t num_entries() const {
|
|
644
|
+
uint64_t NumEntries() const override {
|
|
364
645
|
return num_entries_.load(std::memory_order_relaxed);
|
|
365
646
|
}
|
|
366
647
|
|
|
367
|
-
|
|
368
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
369
|
-
// operations on the same MemTable (unless this Memtable is immutable).
|
|
370
|
-
uint64_t num_deletes() const {
|
|
648
|
+
uint64_t NumDeletion() const override {
|
|
371
649
|
return num_deletes_.load(std::memory_order_relaxed);
|
|
372
650
|
}
|
|
373
651
|
|
|
374
|
-
|
|
375
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
376
|
-
// operations on the same MemTable (unless this Memtable is immutable).
|
|
377
|
-
uint64_t num_range_deletes() const {
|
|
652
|
+
uint64_t NumRangeDeletion() const override {
|
|
378
653
|
return num_range_deletes_.load(std::memory_order_relaxed);
|
|
379
654
|
}
|
|
380
655
|
|
|
381
|
-
uint64_t
|
|
656
|
+
uint64_t GetDataSize() const override {
|
|
382
657
|
return data_size_.load(std::memory_order_relaxed);
|
|
383
658
|
}
|
|
384
659
|
|
|
@@ -398,19 +673,9 @@ class MemTable {
|
|
|
398
673
|
}
|
|
399
674
|
}
|
|
400
675
|
|
|
401
|
-
|
|
402
|
-
VersionEdit* GetEdits() { return &edit_; }
|
|
403
|
-
|
|
404
|
-
// Returns if there is no entry inserted to the mem table.
|
|
405
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
406
|
-
// operations on the same MemTable (unless this Memtable is immutable).
|
|
407
|
-
bool IsEmpty() const { return first_seqno_ == 0; }
|
|
676
|
+
bool IsEmpty() const override { return first_seqno_ == 0; }
|
|
408
677
|
|
|
409
|
-
|
|
410
|
-
// into the memtable.
|
|
411
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
412
|
-
// operations on the same MemTable (unless this Memtable is immutable).
|
|
413
|
-
SequenceNumber GetFirstSequenceNumber() {
|
|
678
|
+
SequenceNumber GetFirstSequenceNumber() override {
|
|
414
679
|
return first_seqno_.load(std::memory_order_relaxed);
|
|
415
680
|
}
|
|
416
681
|
|
|
@@ -422,14 +687,8 @@ class MemTable {
|
|
|
422
687
|
return first_seqno_.store(first_seqno, std::memory_order_relaxed);
|
|
423
688
|
}
|
|
424
689
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
// memtable. It can then be assumed that any write with a larger(or equal)
|
|
428
|
-
// sequence number will be present in this memtable or a later memtable.
|
|
429
|
-
//
|
|
430
|
-
// If the earliest sequence number could not be determined,
|
|
431
|
-
// kMaxSequenceNumber will be returned.
|
|
432
|
-
SequenceNumber GetEarliestSequenceNumber() {
|
|
690
|
+
SequenceNumber GetEarliestSequenceNumber() override {
|
|
691
|
+
// With file ingestion and empty memtable, this seqno needs to be fixed.
|
|
433
692
|
return earliest_seqno_.load(std::memory_order_relaxed);
|
|
434
693
|
}
|
|
435
694
|
|
|
@@ -448,40 +707,18 @@ class MemTable {
|
|
|
448
707
|
|
|
449
708
|
void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }
|
|
450
709
|
|
|
451
|
-
// Returns the next active logfile number when this memtable is about to
|
|
452
|
-
// be flushed to storage
|
|
453
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
454
|
-
// operations on the same MemTable.
|
|
455
|
-
uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
|
|
456
|
-
|
|
457
|
-
// Sets the next active logfile number when this memtable is about to
|
|
458
|
-
// be flushed to storage
|
|
459
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
460
|
-
// operations on the same MemTable.
|
|
461
|
-
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
|
|
462
|
-
|
|
463
|
-
// if this memtable contains data from a committed
|
|
464
|
-
// two phase transaction we must take note of the
|
|
465
|
-
// log which contains that data so we can know
|
|
466
|
-
// when to relese that log
|
|
710
|
+
// If this memtable contains data from a committed two phase transaction we
|
|
711
|
+
// must take note of the log which contains that data so we can know when
|
|
712
|
+
// to release that log.
|
|
467
713
|
void RefLogContainingPrepSection(uint64_t log);
|
|
468
|
-
uint64_t GetMinLogContainingPrepSection();
|
|
714
|
+
uint64_t GetMinLogContainingPrepSection() override;
|
|
469
715
|
|
|
470
|
-
|
|
471
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
472
|
-
// operations on the same MemTable.
|
|
473
|
-
// After MarkImmutable() is called, you should not attempt to
|
|
474
|
-
// write anything to this MemTable(). (Ie. do not call Add() or Update()).
|
|
475
|
-
void MarkImmutable() {
|
|
716
|
+
void MarkImmutable() override {
|
|
476
717
|
table_->MarkReadOnly();
|
|
477
718
|
mem_tracker_.DoneAllocating();
|
|
478
719
|
}
|
|
479
720
|
|
|
480
|
-
|
|
481
|
-
// persisted.
|
|
482
|
-
// REQUIRES: external synchronization to prevent simultaneous
|
|
483
|
-
// operations on the same MemTable.
|
|
484
|
-
void MarkFlushed() { table_->MarkFlushed(); }
|
|
721
|
+
void MarkFlushed() override { table_->MarkFlushed(); }
|
|
485
722
|
|
|
486
723
|
// Returns true if the current MemTableRep supports the merge operator.
|
|
487
724
|
bool IsMergeOperatorSupported() const {
|
|
@@ -494,18 +731,13 @@ class MemTable {
|
|
|
494
731
|
return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
|
|
495
732
|
}
|
|
496
733
|
|
|
497
|
-
struct MemTableStats {
|
|
498
|
-
uint64_t size;
|
|
499
|
-
uint64_t count;
|
|
500
|
-
};
|
|
501
|
-
|
|
502
734
|
MemTableStats ApproximateStats(const Slice& start_ikey,
|
|
503
|
-
const Slice& end_ikey);
|
|
735
|
+
const Slice& end_ikey) override;
|
|
504
736
|
|
|
505
737
|
// Get the lock associated with the key
|
|
506
738
|
port::RWMutex* GetLock(const Slice& key);
|
|
507
739
|
|
|
508
|
-
const InternalKeyComparator& GetInternalKeyComparator() const {
|
|
740
|
+
const InternalKeyComparator& GetInternalKeyComparator() const override {
|
|
509
741
|
return comparator_.comparator;
|
|
510
742
|
}
|
|
511
743
|
|
|
@@ -513,33 +745,10 @@ class MemTable {
|
|
|
513
745
|
return &moptions_;
|
|
514
746
|
}
|
|
515
747
|
|
|
516
|
-
uint64_t ApproximateOldestKeyTime() const {
|
|
748
|
+
uint64_t ApproximateOldestKeyTime() const override {
|
|
517
749
|
return oldest_key_time_.load(std::memory_order_relaxed);
|
|
518
750
|
}
|
|
519
751
|
|
|
520
|
-
// REQUIRES: db_mutex held.
|
|
521
|
-
void SetID(uint64_t id) { id_ = id; }
|
|
522
|
-
|
|
523
|
-
uint64_t GetID() const { return id_; }
|
|
524
|
-
|
|
525
|
-
void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
|
|
526
|
-
|
|
527
|
-
uint64_t GetFileNumber() const { return file_number_; }
|
|
528
|
-
|
|
529
|
-
void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
|
|
530
|
-
|
|
531
|
-
void SetFlushInProgress(bool in_progress) {
|
|
532
|
-
flush_in_progress_ = in_progress;
|
|
533
|
-
}
|
|
534
|
-
|
|
535
|
-
void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
|
|
536
|
-
flush_job_info_ = std::move(info);
|
|
537
|
-
}
|
|
538
|
-
|
|
539
|
-
std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
|
|
540
|
-
return std::move(flush_job_info_);
|
|
541
|
-
}
|
|
542
|
-
|
|
543
752
|
// Returns a heuristic flush decision
|
|
544
753
|
bool ShouldFlushNow();
|
|
545
754
|
|
|
@@ -550,23 +759,12 @@ class MemTable {
|
|
|
550
759
|
// SwitchMemtable() may fail.
|
|
551
760
|
void ConstructFragmentedRangeTombstones();
|
|
552
761
|
|
|
553
|
-
|
|
554
|
-
// for this memtable. It should be constructed right before a memtable is
|
|
555
|
-
// added to an immutable memtable list. Note that if a memtable does not have
|
|
556
|
-
// any range tombstone, then no range tombstone list will ever be constructed
|
|
557
|
-
// and true is returned in that case.
|
|
558
|
-
bool IsFragmentedRangeTombstonesConstructed() const {
|
|
762
|
+
bool IsFragmentedRangeTombstonesConstructed() const override {
|
|
559
763
|
return fragmented_range_tombstone_list_.get() != nullptr ||
|
|
560
764
|
is_range_del_table_empty_;
|
|
561
765
|
}
|
|
562
766
|
|
|
563
|
-
|
|
564
|
-
// `newest_udt_` for what newer means. This method should only be invoked for
|
|
565
|
-
// an MemTable that has enabled user-defined timestamp feature and set
|
|
566
|
-
// `persist_user_defined_timestamps` to false. The tracked newest UDT will be
|
|
567
|
-
// used by flush job in the background to help check the MemTable's
|
|
568
|
-
// eligibility for Flush.
|
|
569
|
-
const Slice& GetNewestUDT() const;
|
|
767
|
+
const Slice& GetNewestUDT() const override;
|
|
570
768
|
|
|
571
769
|
// Returns Corruption status if verification fails.
|
|
572
770
|
static Status VerifyEntryChecksum(const char* entry,
|
|
@@ -582,7 +780,6 @@ class MemTable {
|
|
|
582
780
|
|
|
583
781
|
KeyComparator comparator_;
|
|
584
782
|
const ImmutableMemTableOptions moptions_;
|
|
585
|
-
int refs_;
|
|
586
783
|
const size_t kArenaBlockSize;
|
|
587
784
|
AllocTracker mem_tracker_;
|
|
588
785
|
ConcurrentArena arena_;
|
|
@@ -599,15 +796,6 @@ class MemTable {
|
|
|
599
796
|
// Dynamically changeable memtable option
|
|
600
797
|
std::atomic<size_t> write_buffer_size_;
|
|
601
798
|
|
|
602
|
-
// These are used to manage memtable flushes to storage
|
|
603
|
-
bool flush_in_progress_; // started the flush
|
|
604
|
-
bool flush_completed_; // finished the flush
|
|
605
|
-
uint64_t file_number_; // filled up after flush is complete
|
|
606
|
-
|
|
607
|
-
// The updates to be applied to the transaction log when this
|
|
608
|
-
// memtable is flushed to storage.
|
|
609
|
-
VersionEdit edit_;
|
|
610
|
-
|
|
611
799
|
// The sequence number of the kv that was inserted first
|
|
612
800
|
std::atomic<SequenceNumber> first_seqno_;
|
|
613
801
|
|
|
@@ -617,9 +805,6 @@ class MemTable {
|
|
|
617
805
|
|
|
618
806
|
SequenceNumber creation_seq_;
|
|
619
807
|
|
|
620
|
-
// The log files earlier than this number can be deleted.
|
|
621
|
-
uint64_t mem_next_logfile_number_;
|
|
622
|
-
|
|
623
808
|
// the earliest log containing a prepared section
|
|
624
809
|
// which has been inserted into this memtable.
|
|
625
810
|
std::atomic<uint64_t> min_prep_log_referenced_;
|
|
@@ -643,15 +828,6 @@ class MemTable {
|
|
|
643
828
|
// Timestamp of oldest key
|
|
644
829
|
std::atomic<uint64_t> oldest_key_time_;
|
|
645
830
|
|
|
646
|
-
// Memtable id to track flush.
|
|
647
|
-
uint64_t id_ = 0;
|
|
648
|
-
|
|
649
|
-
// Sequence number of the atomic flush that is responsible for this memtable.
|
|
650
|
-
// The sequence number of atomic flush is a seq, such that no writes with
|
|
651
|
-
// sequence numbers greater than or equal to seq are flushed, while all
|
|
652
|
-
// writes with sequence number smaller than seq are flushed.
|
|
653
|
-
SequenceNumber atomic_flush_seqno_;
|
|
654
|
-
|
|
655
831
|
// keep track of memory usage in table_, arena_, and range_del_table_.
|
|
656
832
|
// Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
|
|
657
833
|
std::atomic<uint64_t> approximate_memory_usage_;
|
|
@@ -660,9 +836,6 @@ class MemTable {
|
|
|
660
836
|
// unlimited.
|
|
661
837
|
uint32_t memtable_max_range_deletions_ = 0;
|
|
662
838
|
|
|
663
|
-
// Flush job info of the current memtable.
|
|
664
|
-
std::unique_ptr<FlushJobInfo> flush_job_info_;
|
|
665
|
-
|
|
666
839
|
// Size in bytes for the user-defined timestamps.
|
|
667
840
|
size_t ts_sz_;
|
|
668
841
|
|
|
@@ -704,6 +877,12 @@ class MemTable {
|
|
|
704
877
|
std::unique_ptr<FragmentedRangeTombstoneList>
|
|
705
878
|
fragmented_range_tombstone_list_;
|
|
706
879
|
|
|
880
|
+
// The fragmented range tombstone list of this memtable with all keys' user-defined
|
|
881
|
+
// timestamps logically stripped. This is constructed and used by flush when
|
|
882
|
+
// the user-defined timestamps in memtable only feature is enabled.
|
|
883
|
+
std::unique_ptr<FragmentedRangeTombstoneList>
|
|
884
|
+
timestamp_stripping_fragmented_range_tombstone_list_;
|
|
885
|
+
|
|
707
886
|
// makes sure there is a single range tombstone writer to invalidate cache
|
|
708
887
|
std::mutex range_del_mutex_;
|
|
709
888
|
CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
|