@nxtedition/rocksdb 13.1.4 → 13.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +43 -16
- package/deps/rocksdb/rocksdb/{TARGETS → BUCK} +27 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -1
- package/deps/rocksdb/rocksdb/Makefile +2 -2
- package/deps/rocksdb/rocksdb/cache/cache.cc +3 -1
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +2 -0
- package/deps/rocksdb/rocksdb/db/attribute_group_iterator_impl.h +34 -9
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +7 -6
- package/deps/rocksdb/rocksdb/db/blob/blob_source.h +5 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +22 -14
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +149 -0
- package/deps/rocksdb/rocksdb/db/builder.cc +13 -24
- package/deps/rocksdb/rocksdb/db/coalescing_iterator.h +35 -10
- package/deps/rocksdb/rocksdb/db/column_family.cc +21 -10
- package/deps/rocksdb/rocksdb/db/column_family.h +15 -8
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +98 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +126 -16
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +51 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -8
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +24 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +52 -22
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +9 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +36 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +6 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +30 -17
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +26 -23
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +43 -33
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +6 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +19 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +6 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +632 -411
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +171 -51
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +7 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +37 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +51 -11
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +10 -3
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +350 -154
- package/deps/rocksdb/rocksdb/db/convenience.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +62 -27
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +68 -1
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +91 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +134 -70
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +71 -23
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -16
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +47 -33
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +27 -19
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +38 -25
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +7 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +258 -42
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +161 -9
- package/deps/rocksdb/rocksdb/db/db_iter.cc +118 -86
- package/deps/rocksdb/rocksdb/db/db_iter.h +44 -17
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +27 -6
- package/deps/rocksdb/rocksdb/db/db_test.cc +48 -16
- package/deps/rocksdb/rocksdb/db/db_test2.cc +60 -15
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +97 -44
- package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -1
- package/deps/rocksdb/rocksdb/db/dbformat.cc +15 -5
- package/deps/rocksdb/rocksdb/db/dbformat.h +137 -55
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +54 -0
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +663 -8
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +152 -91
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +134 -11
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +55 -9
- package/deps/rocksdb/rocksdb/db/flush_job.cc +52 -29
- package/deps/rocksdb/rocksdb/db/flush_job.h +5 -3
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +18 -12
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +23 -29
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +3 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +9 -6
- package/deps/rocksdb/rocksdb/db/internal_stats.h +54 -0
- package/deps/rocksdb/rocksdb/db/job_context.h +1 -1
- package/deps/rocksdb/rocksdb/db/log_reader.cc +6 -7
- package/deps/rocksdb/rocksdb/db/manifest_ops.cc +47 -0
- package/deps/rocksdb/rocksdb/db/manifest_ops.h +20 -0
- package/deps/rocksdb/rocksdb/db/memtable.cc +165 -64
- package/deps/rocksdb/rocksdb/db/memtable.h +422 -243
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +99 -68
- package/deps/rocksdb/rocksdb/db/memtable_list.h +63 -38
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +28 -25
- package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +118 -60
- package/deps/rocksdb/rocksdb/db/multi_cf_iterator_test.cc +344 -89
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +2 -3
- package/deps/rocksdb/rocksdb/db/repair.cc +15 -14
- package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -13
- package/deps/rocksdb/rocksdb/db/snapshot_checker.h +7 -0
- package/deps/rocksdb/rocksdb/db/table_cache.cc +62 -65
- package/deps/rocksdb/rocksdb/db/table_cache.h +70 -76
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +5 -6
- package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +8 -7
- package/deps/rocksdb/rocksdb/db/version_builder.cc +17 -19
- package/deps/rocksdb/rocksdb/db/version_builder.h +13 -12
- package/deps/rocksdb/rocksdb/db/version_edit.h +30 -0
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +3 -5
- package/deps/rocksdb/rocksdb/db/version_set.cc +89 -129
- package/deps/rocksdb/rocksdb/db/version_set.h +12 -4
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -2
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +12 -8
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +0 -15
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +0 -2
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +9 -7
- package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +0 -8
- package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.h +28 -2
- package/deps/rocksdb/rocksdb/db/write_batch.cc +32 -10
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +9 -0
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/write_thread.cc +3 -1
- package/deps/rocksdb/rocksdb/db/write_thread.h +6 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +15 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +100 -22
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +34 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +223 -78
- package/deps/rocksdb/rocksdb/env/file_system.cc +6 -1
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +53 -0
- package/deps/rocksdb/rocksdb/env/io_posix.cc +63 -17
- package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +132 -48
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +92 -24
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +727 -109
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +3 -4
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +1 -1
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/attribute_groups.h +20 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +9 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +9 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +10 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +34 -37
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +21 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +56 -28
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +36 -28
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +11 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +84 -60
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +102 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +89 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +32 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +30 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +23 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -0
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +79 -21
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +41 -18
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +1 -5
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +169 -0
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +400 -0
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -0
- package/deps/rocksdb/rocksdb/options/cf_options.cc +137 -82
- package/deps/rocksdb/rocksdb/options/cf_options.h +18 -6
- package/deps/rocksdb/rocksdb/options/configurable.cc +31 -17
- package/deps/rocksdb/rocksdb/options/configurable_helper.h +7 -6
- package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -8
- package/deps/rocksdb/rocksdb/options/options_parser.cc +74 -54
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +89 -0
- package/deps/rocksdb/rocksdb/options/options_test.cc +112 -26
- package/deps/rocksdb/rocksdb/port/port.h +5 -9
- package/deps/rocksdb/rocksdb/src.mk +8 -0
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +4 -0
- package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -7
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +62 -80
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +13 -3
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +16 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +38 -7
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +204 -1
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +4 -0
- package/deps/rocksdb/rocksdb/table/format.cc +3 -3
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +4 -1
- package/deps/rocksdb/rocksdb/table/mock_table.cc +0 -50
- package/deps/rocksdb/rocksdb/table/mock_table.h +53 -0
- package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +4 -0
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -5
- package/deps/rocksdb/rocksdb/table/table_builder.h +3 -1
- package/deps/rocksdb/rocksdb/table/table_properties.cc +181 -0
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +5 -5
- package/deps/rocksdb/rocksdb/table/table_test.cc +71 -64
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py +45 -45
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +35 -35
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +43 -43
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +41 -4
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +1 -0
- package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +1 -1
- package/deps/rocksdb/rocksdb/unreleased_history/add.sh +13 -0
- package/deps/rocksdb/rocksdb/util/aligned_buffer.h +24 -5
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +7 -0
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +0 -52
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +1 -10
- package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +92 -0
- package/deps/rocksdb/rocksdb/util/thread_operation.h +1 -0
- package/deps/rocksdb/rocksdb/util/udt_util.cc +50 -4
- package/deps/rocksdb/rocksdb/util/udt_util.h +24 -11
- package/deps/rocksdb/rocksdb/util/udt_util_test.cc +26 -13
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +1 -16
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +2 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +214 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.h +60 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +124 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +441 -0
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +34 -3
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +7 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +437 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +34 -11
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +14 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +7 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +17 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +69 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +20 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1290 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +324 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +8 -1
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +57 -12
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +32 -3
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +33 -2
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +721 -9
- package/deps/rocksdb/rocksdb.gyp +2 -0
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
|
@@ -51,6 +51,7 @@
|
|
|
51
51
|
#include "env/io_posix.h"
|
|
52
52
|
#include "monitoring/iostats_context_imp.h"
|
|
53
53
|
#include "monitoring/thread_status_updater.h"
|
|
54
|
+
#include "options/db_options.h"
|
|
54
55
|
#include "port/lang.h"
|
|
55
56
|
#include "port/port.h"
|
|
56
57
|
#include "rocksdb/options.h"
|
|
@@ -930,6 +931,28 @@ class PosixFileSystem : public FileSystem {
|
|
|
930
931
|
optimized.fallocate_with_keep_size = true;
|
|
931
932
|
return optimized;
|
|
932
933
|
}
|
|
934
|
+
|
|
935
|
+
FileOptions OptimizeForCompactionTableRead(
|
|
936
|
+
const FileOptions& file_options,
|
|
937
|
+
const ImmutableDBOptions& db_options) const override {
|
|
938
|
+
FileOptions fo = FileOptions(file_options);
|
|
939
|
+
#ifdef OS_LINUX
|
|
940
|
+
// To fix https://github.com/facebook/rocksdb/issues/12038
|
|
941
|
+
if (!file_options.use_direct_reads &&
|
|
942
|
+
file_options.compaction_readahead_size > 0) {
|
|
943
|
+
size_t system_limit =
|
|
944
|
+
GetCompactionReadaheadSizeSystemLimit(db_options.db_paths);
|
|
945
|
+
if (system_limit > 0 &&
|
|
946
|
+
file_options.compaction_readahead_size > system_limit) {
|
|
947
|
+
fo.compaction_readahead_size = system_limit;
|
|
948
|
+
}
|
|
949
|
+
}
|
|
950
|
+
#else
|
|
951
|
+
(void)db_options;
|
|
952
|
+
#endif
|
|
953
|
+
return fo;
|
|
954
|
+
}
|
|
955
|
+
|
|
933
956
|
#ifdef OS_LINUX
|
|
934
957
|
Status RegisterDbPaths(const std::vector<std::string>& paths) override {
|
|
935
958
|
return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths);
|
|
@@ -942,6 +965,36 @@ class PosixFileSystem : public FileSystem {
|
|
|
942
965
|
private:
|
|
943
966
|
bool forceMmapOff_ = false; // do we override Env options?
|
|
944
967
|
|
|
968
|
+
#ifdef OS_LINUX
|
|
969
|
+
// Get the minimum "linux system limit" (i.e, the largest I/O size that the OS
|
|
970
|
+
// can issue to block devices under a directory, also known as
|
|
971
|
+
// "max_sectors_kb" ) among `db_paths`.
|
|
972
|
+
// Return 0 if no limit can be found or there is an error in
|
|
973
|
+
// retrieving such limit.
|
|
974
|
+
static size_t GetCompactionReadaheadSizeSystemLimit(
|
|
975
|
+
const std::vector<DbPath>& db_paths) {
|
|
976
|
+
Status s;
|
|
977
|
+
size_t limit_kb = 0;
|
|
978
|
+
|
|
979
|
+
for (const auto& db_path : db_paths) {
|
|
980
|
+
size_t dir_max_sectors_kb = 0;
|
|
981
|
+
s = PosixHelper::GetMaxSectorsKBOfDirectory(db_path.path,
|
|
982
|
+
&dir_max_sectors_kb);
|
|
983
|
+
if (!s.ok()) {
|
|
984
|
+
break;
|
|
985
|
+
}
|
|
986
|
+
|
|
987
|
+
limit_kb = (limit_kb == 0) ? dir_max_sectors_kb
|
|
988
|
+
: std::min(limit_kb, dir_max_sectors_kb);
|
|
989
|
+
}
|
|
990
|
+
|
|
991
|
+
if (s.ok()) {
|
|
992
|
+
return limit_kb * 1024;
|
|
993
|
+
} else {
|
|
994
|
+
return 0;
|
|
995
|
+
}
|
|
996
|
+
}
|
|
997
|
+
#endif
|
|
945
998
|
// Returns true iff the named directory exists and is a directory.
|
|
946
999
|
virtual bool DirExists(const std::string& dname) {
|
|
947
1000
|
struct stat statbuf;
|
|
@@ -28,7 +28,7 @@
|
|
|
28
28
|
#include <cstdio>
|
|
29
29
|
#include <cstdlib>
|
|
30
30
|
#include <cstring>
|
|
31
|
-
#
|
|
31
|
+
#if defined(OS_LINUX) || defined(OS_ANDROID)
|
|
32
32
|
#include <sys/statfs.h>
|
|
33
33
|
#include <sys/sysmacros.h>
|
|
34
34
|
#endif
|
|
@@ -455,38 +455,71 @@ size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
|
|
|
455
455
|
|
|
456
456
|
Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
|
|
457
457
|
size_t* size) {
|
|
458
|
+
return GetQueueSysfsFileValueofDirectory(directory,
|
|
459
|
+
GetLogicalBlockSizeFileName(), size);
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
Status PosixHelper::GetMaxSectorsKBOfDirectory(const std::string& directory,
|
|
463
|
+
size_t* kb) {
|
|
464
|
+
return GetQueueSysfsFileValueofDirectory(directory, GetMaxSectorsKBFileName(),
|
|
465
|
+
kb);
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
Status PosixHelper::GetQueueSysfsFileValueofDirectory(
|
|
469
|
+
const std::string& directory, const std::string& file_name, size_t* value) {
|
|
458
470
|
int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
|
|
459
471
|
if (fd == -1) {
|
|
460
472
|
return Status::IOError("Cannot open directory " + directory);
|
|
461
473
|
}
|
|
462
|
-
|
|
474
|
+
if (file_name == PosixHelper::GetLogicalBlockSizeFileName()) {
|
|
475
|
+
*value = PosixHelper::GetLogicalBlockSizeOfFd(fd);
|
|
476
|
+
} else if (file_name == PosixHelper::GetMaxSectorsKBFileName()) {
|
|
477
|
+
*value = PosixHelper::GetMaxSectorsKBOfFd(fd);
|
|
478
|
+
} else {
|
|
479
|
+
assert(false);
|
|
480
|
+
}
|
|
463
481
|
close(fd);
|
|
464
482
|
return Status::OK();
|
|
465
483
|
}
|
|
466
484
|
|
|
467
485
|
size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
|
|
486
|
+
return GetQueueSysfsFileValueOfFd(fd, GetLogicalBlockSizeFileName(),
|
|
487
|
+
kDefaultPageSize);
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
size_t PosixHelper::GetMaxSectorsKBOfFd(int fd) {
|
|
491
|
+
return GetQueueSysfsFileValueOfFd(fd, GetMaxSectorsKBFileName(),
|
|
492
|
+
kDefaultMaxSectorsKB);
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
size_t PosixHelper::GetQueueSysfsFileValueOfFd(
|
|
496
|
+
int fd, const std::string& file_name, const size_t default_return_value) {
|
|
468
497
|
#ifdef OS_LINUX
|
|
469
498
|
struct stat buf;
|
|
470
499
|
int result = fstat(fd, &buf);
|
|
471
500
|
if (result == -1) {
|
|
472
|
-
return
|
|
501
|
+
return default_return_value;
|
|
473
502
|
}
|
|
503
|
+
|
|
504
|
+
// Get device number
|
|
474
505
|
if (major(buf.st_dev) == 0) {
|
|
475
506
|
// Unnamed devices (e.g. non-device mounts), reserved as null device number.
|
|
476
507
|
// These don't have an entry in /sys/dev/block/. Return a sensible default.
|
|
477
|
-
return
|
|
508
|
+
return default_return_value;
|
|
478
509
|
}
|
|
479
510
|
|
|
480
|
-
//
|
|
511
|
+
// Get device path
|
|
481
512
|
const int kBufferSize = 100;
|
|
482
513
|
char path[kBufferSize];
|
|
483
514
|
char real_path[PATH_MAX + 1];
|
|
484
515
|
snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
|
|
485
516
|
minor(buf.st_dev));
|
|
486
517
|
if (realpath(path, real_path) == nullptr) {
|
|
487
|
-
return
|
|
518
|
+
return default_return_value;
|
|
488
519
|
}
|
|
489
520
|
std::string device_dir(real_path);
|
|
521
|
+
|
|
522
|
+
// Get the queue sysfs file path
|
|
490
523
|
if (!device_dir.empty() && device_dir.back() == '/') {
|
|
491
524
|
device_dir.pop_back();
|
|
492
525
|
}
|
|
@@ -500,11 +533,11 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
|
|
|
500
533
|
// ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
|
|
501
534
|
size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
|
|
502
535
|
if (parent_end == std::string::npos) {
|
|
503
|
-
return
|
|
536
|
+
return default_return_value;
|
|
504
537
|
}
|
|
505
538
|
size_t parent_begin = device_dir.rfind('/', parent_end - 1);
|
|
506
539
|
if (parent_begin == std::string::npos) {
|
|
507
|
-
return
|
|
540
|
+
return default_return_value;
|
|
508
541
|
}
|
|
509
542
|
std::string parent =
|
|
510
543
|
device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
|
|
@@ -513,25 +546,37 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
|
|
|
513
546
|
(child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
|
|
514
547
|
device_dir = device_dir.substr(0, parent_end);
|
|
515
548
|
}
|
|
516
|
-
std::string fname = device_dir + "/queue/
|
|
549
|
+
std::string fname = device_dir + "/queue/" + file_name;
|
|
550
|
+
|
|
551
|
+
// Get value in the queue sysfs file
|
|
517
552
|
FILE* fp;
|
|
518
|
-
size_t
|
|
553
|
+
size_t value = 0;
|
|
519
554
|
fp = fopen(fname.c_str(), "r");
|
|
520
555
|
if (fp != nullptr) {
|
|
521
556
|
char* line = nullptr;
|
|
522
557
|
size_t len = 0;
|
|
523
558
|
if (getline(&line, &len, fp) != -1) {
|
|
524
|
-
sscanf(line, "%zu", &
|
|
559
|
+
sscanf(line, "%zu", &value);
|
|
525
560
|
}
|
|
526
561
|
free(line);
|
|
527
562
|
fclose(fp);
|
|
528
563
|
}
|
|
529
|
-
|
|
530
|
-
|
|
564
|
+
|
|
565
|
+
if (file_name == GetLogicalBlockSizeFileName()) {
|
|
566
|
+
if (value != 0 && (value & (value - 1)) == 0) {
|
|
567
|
+
return value;
|
|
568
|
+
}
|
|
569
|
+
} else if (file_name == GetMaxSectorsKBFileName()) {
|
|
570
|
+
if (value != 0) {
|
|
571
|
+
return value;
|
|
572
|
+
}
|
|
573
|
+
} else {
|
|
574
|
+
assert(false);
|
|
531
575
|
}
|
|
532
576
|
#endif
|
|
533
577
|
(void)fd;
|
|
534
|
-
|
|
578
|
+
(void)file_name;
|
|
579
|
+
return default_return_value;
|
|
535
580
|
}
|
|
536
581
|
|
|
537
582
|
/*
|
|
@@ -1376,9 +1421,10 @@ IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
|
|
|
1376
1421
|
// After ftruncate, we check whether ftruncate has the correct behavior.
|
|
1377
1422
|
// If not, we should hack it with FALLOC_FL_PUNCH_HOLE
|
|
1378
1423
|
if (result == 0 &&
|
|
1379
|
-
(file_stats.st_size + file_stats.st_blksize - 1) /
|
|
1380
|
-
|
|
1381
|
-
file_stats.st_blocks /
|
|
1424
|
+
static_cast<size_t>((file_stats.st_size + file_stats.st_blksize - 1) /
|
|
1425
|
+
file_stats.st_blksize) !=
|
|
1426
|
+
static_cast<size_t>(file_stats.st_blocks /
|
|
1427
|
+
(file_stats.st_blksize / 512))) {
|
|
1382
1428
|
IOSTATS_TIMER_GUARD(allocate_nanos);
|
|
1383
1429
|
if (allow_fallocate_) {
|
|
1384
1430
|
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
|
|
@@ -30,7 +30,7 @@
|
|
|
30
30
|
// For non linux platform, the following macros are used only as place
|
|
31
31
|
// holder.
|
|
32
32
|
#if !(defined OS_LINUX) && !(defined OS_FREEBSD) && !(defined CYGWIN) && \
|
|
33
|
-
!(defined OS_AIX)
|
|
33
|
+
!(defined OS_AIX) && !(defined OS_ANDROID)
|
|
34
34
|
#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
|
|
35
35
|
#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
|
|
36
36
|
#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
|
|
@@ -53,10 +53,39 @@ IOStatus IOError(const std::string& context, const std::string& file_name,
|
|
|
53
53
|
|
|
54
54
|
class PosixHelper {
|
|
55
55
|
public:
|
|
56
|
+
static const std::string& GetLogicalBlockSizeFileName() {
|
|
57
|
+
static const std::string kLogicalBlockSizeFileName = "logical_block_size";
|
|
58
|
+
return kLogicalBlockSizeFileName;
|
|
59
|
+
}
|
|
60
|
+
static const std::string& GetMaxSectorsKBFileName() {
|
|
61
|
+
static const std::string kMaxSectorsKBFileName = "max_sectors_kb";
|
|
62
|
+
return kMaxSectorsKBFileName;
|
|
63
|
+
}
|
|
56
64
|
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
|
|
57
65
|
static size_t GetLogicalBlockSizeOfFd(int fd);
|
|
58
66
|
static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
|
|
59
67
|
size_t* size);
|
|
68
|
+
|
|
69
|
+
static Status GetMaxSectorsKBOfDirectory(const std::string& directory,
|
|
70
|
+
size_t* kb);
|
|
71
|
+
|
|
72
|
+
private:
|
|
73
|
+
static const size_t kDefaultMaxSectorsKB = 2 * 1024;
|
|
74
|
+
|
|
75
|
+
static size_t GetMaxSectorsKBOfFd(int fd);
|
|
76
|
+
|
|
77
|
+
// Return the value in the specified `file_name` under
|
|
78
|
+
// `/sys/block/xxx/queue/` for the device where the file of `fd` is on.
|
|
79
|
+
// If not found, then return the specified `default_return_value`
|
|
80
|
+
static size_t GetQueueSysfsFileValueOfFd(int fd, const std::string& file_name,
|
|
81
|
+
size_t default_return_value);
|
|
82
|
+
|
|
83
|
+
/// Return the value in the specified `file_name` under
|
|
84
|
+
// `/sys/block/xxx/queue/` for the device where `directory` is on.
|
|
85
|
+
// If not found, then return the specified `default_return_value`
|
|
86
|
+
static Status GetQueueSysfsFileValueofDirectory(const std::string& directory,
|
|
87
|
+
const std::string& file_name,
|
|
88
|
+
size_t* value);
|
|
60
89
|
};
|
|
61
90
|
|
|
62
91
|
/*
|
|
@@ -22,11 +22,9 @@
|
|
|
22
22
|
|
|
23
23
|
namespace ROCKSDB_NAMESPACE {
|
|
24
24
|
|
|
25
|
-
void FilePrefetchBuffer::PrepareBufferForRead(
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
bool refit_tail,
|
|
29
|
-
uint64_t& aligned_useful_len) {
|
|
25
|
+
void FilePrefetchBuffer::PrepareBufferForRead(
|
|
26
|
+
BufferInfo* buf, size_t alignment, uint64_t offset, size_t roundup_len,
|
|
27
|
+
bool refit_tail, bool use_fs_buffer, uint64_t& aligned_useful_len) {
|
|
30
28
|
uint64_t aligned_useful_offset_in_buf = 0;
|
|
31
29
|
bool copy_data_to_new_buffer = false;
|
|
32
30
|
// Check if requested bytes are in the existing buffer_.
|
|
@@ -39,6 +37,9 @@ void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
|
|
|
39
37
|
// new buffer is created.
|
|
40
38
|
aligned_useful_offset_in_buf =
|
|
41
39
|
Rounddown(static_cast<size_t>(offset - buf->offset_), alignment);
|
|
40
|
+
// aligned_useful_len is passed by reference and used to calculate how much
|
|
41
|
+
// data needs to be read, so it is needed regardless of whether
|
|
42
|
+
// use_fs_buffer is true
|
|
42
43
|
aligned_useful_len = static_cast<uint64_t>(buf->CurrentSize()) -
|
|
43
44
|
aligned_useful_offset_in_buf;
|
|
44
45
|
assert(aligned_useful_offset_in_buf % alignment == 0);
|
|
@@ -53,6 +54,16 @@ void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
|
|
|
53
54
|
}
|
|
54
55
|
}
|
|
55
56
|
|
|
57
|
+
// The later buffer allocation / tail refitting does not apply when
|
|
58
|
+
// use_fs_buffer is true. If we allocate a new buffer, we end up throwing it
|
|
59
|
+
// away later when we reuse the file system allocated buffer. If we refit
|
|
60
|
+
// the tail in the main buffer, we don't have a place to put the next chunk of
|
|
61
|
+
// data provided by the file system (without performing another copy, which we
|
|
62
|
+
// are trying to avoid in the first place)
|
|
63
|
+
if (use_fs_buffer) {
|
|
64
|
+
return;
|
|
65
|
+
}
|
|
66
|
+
|
|
56
67
|
// Create a new buffer only if current capacity is not sufficient, and memcopy
|
|
57
68
|
// bytes from old buffer if needed (i.e., if aligned_useful_len is greater
|
|
58
69
|
// than 0).
|
|
@@ -62,8 +73,8 @@ void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
|
|
|
62
73
|
static_cast<size_t>(roundup_len), copy_data_to_new_buffer,
|
|
63
74
|
aligned_useful_offset_in_buf, static_cast<size_t>(aligned_useful_len));
|
|
64
75
|
} else if (aligned_useful_len > 0 && refit_tail) {
|
|
65
|
-
// New buffer not needed. But memmove bytes from tail to the beginning
|
|
66
|
-
// aligned_useful_len is greater than 0.
|
|
76
|
+
// New buffer not needed. But memmove bytes from tail to the beginning
|
|
77
|
+
// since aligned_useful_len is greater than 0.
|
|
67
78
|
buf->buffer_.RefitTail(static_cast<size_t>(aligned_useful_offset_in_buf),
|
|
68
79
|
static_cast<size_t>(aligned_useful_len));
|
|
69
80
|
} else if (aligned_useful_len > 0) {
|
|
@@ -82,11 +93,19 @@ void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
|
|
|
82
93
|
Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
|
|
83
94
|
RandomAccessFileReader* reader,
|
|
84
95
|
uint64_t read_len, uint64_t aligned_useful_len,
|
|
85
|
-
uint64_t start_offset) {
|
|
96
|
+
uint64_t start_offset, bool use_fs_buffer) {
|
|
86
97
|
Slice result;
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
98
|
+
Status s;
|
|
99
|
+
char* to_buf = nullptr;
|
|
100
|
+
if (use_fs_buffer) {
|
|
101
|
+
s = FSBufferDirectRead(reader, buf, opts, start_offset + aligned_useful_len,
|
|
102
|
+
read_len, result);
|
|
103
|
+
} else {
|
|
104
|
+
to_buf = buf->buffer_.BufferStart() + aligned_useful_len;
|
|
105
|
+
s = reader->Read(opts, start_offset + aligned_useful_len, read_len, &result,
|
|
106
|
+
to_buf, /*aligned_buf=*/nullptr);
|
|
107
|
+
}
|
|
108
|
+
|
|
90
109
|
#ifndef NDEBUG
|
|
91
110
|
if (result.size() < read_len) {
|
|
92
111
|
// Fake an IO error to force db_stress fault injection to ignore
|
|
@@ -97,7 +116,7 @@ Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
|
|
|
97
116
|
if (!s.ok()) {
|
|
98
117
|
return s;
|
|
99
118
|
}
|
|
100
|
-
if (result.data() != to_buf) {
|
|
119
|
+
if (!use_fs_buffer && result.data() != to_buf) {
|
|
101
120
|
// If the read is coming from some other buffer already in memory (such as
|
|
102
121
|
// mmap) then it would be inefficient to create another copy in this
|
|
103
122
|
// FilePrefetchBuffer. The caller is expected to exclude this case.
|
|
@@ -108,8 +127,11 @@ Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
|
|
|
108
127
|
if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
|
|
109
128
|
RecordTick(stats_, PREFETCH_BYTES, read_len);
|
|
110
129
|
}
|
|
111
|
-
|
|
112
|
-
|
|
130
|
+
if (!use_fs_buffer) {
|
|
131
|
+
// Update the buffer size.
|
|
132
|
+
// We already explicitly set the buffer size when we reuse the FS buffer
|
|
133
|
+
buf->buffer_.Size(static_cast<size_t>(aligned_useful_len) + result.size());
|
|
134
|
+
}
|
|
113
135
|
return s;
|
|
114
136
|
}
|
|
115
137
|
|
|
@@ -157,33 +179,42 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts,
|
|
|
157
179
|
return Status::OK();
|
|
158
180
|
}
|
|
159
181
|
|
|
160
|
-
size_t alignment =
|
|
182
|
+
size_t alignment = GetRequiredBufferAlignment(reader);
|
|
161
183
|
uint64_t rounddown_offset = offset, roundup_end = 0, aligned_useful_len = 0;
|
|
162
184
|
size_t read_len = 0;
|
|
185
|
+
// TODO: Enable file system buffer reuse optimization. Need to incorporate
|
|
186
|
+
// overlap buffer logic here (similar to what is done in PrefetchInternal).
|
|
187
|
+
// Currently, if we attempt to use the optimization, it results in an
|
|
188
|
+
// unsigned integer overflow because the returned buffer's offset ends up
|
|
189
|
+
// higher than the requested offset.
|
|
190
|
+
bool use_fs_buffer = false;
|
|
163
191
|
|
|
164
192
|
ReadAheadSizeTuning(buf, /*read_curr_block=*/true,
|
|
165
|
-
/*refit_tail=*/true,
|
|
166
|
-
rounddown_offset, roundup_end, read_len,
|
|
193
|
+
/*refit_tail=*/true, use_fs_buffer, rounddown_offset,
|
|
194
|
+
alignment, 0, n, rounddown_offset, roundup_end, read_len,
|
|
167
195
|
aligned_useful_len);
|
|
168
196
|
|
|
169
197
|
Status s;
|
|
170
198
|
if (read_len > 0) {
|
|
171
|
-
s = Read(buf, opts, reader, read_len, aligned_useful_len, rounddown_offset
|
|
199
|
+
s = Read(buf, opts, reader, read_len, aligned_useful_len, rounddown_offset,
|
|
200
|
+
use_fs_buffer);
|
|
172
201
|
}
|
|
173
202
|
|
|
174
203
|
if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) {
|
|
175
204
|
RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len);
|
|
176
205
|
}
|
|
206
|
+
assert(buf->offset_ <= offset);
|
|
177
207
|
return s;
|
|
178
208
|
}
|
|
179
209
|
|
|
180
210
|
// Copy data from src to overlap_buf_.
|
|
181
|
-
void FilePrefetchBuffer::
|
|
182
|
-
|
|
211
|
+
void FilePrefetchBuffer::CopyDataToOverlapBuffer(BufferInfo* src,
|
|
212
|
+
uint64_t& offset,
|
|
213
|
+
size_t& length) {
|
|
183
214
|
if (length == 0) {
|
|
184
215
|
return;
|
|
185
216
|
}
|
|
186
|
-
|
|
217
|
+
assert(src->IsOffsetInBuffer(offset));
|
|
187
218
|
uint64_t copy_offset = (offset - src->offset_);
|
|
188
219
|
size_t copy_len = 0;
|
|
189
220
|
if (src->IsDataBlockInBuffer(offset, length)) {
|
|
@@ -194,10 +225,8 @@ void FilePrefetchBuffer::CopyDataToBuffer(BufferInfo* src, uint64_t& offset,
|
|
|
194
225
|
}
|
|
195
226
|
|
|
196
227
|
BufferInfo* dst = overlap_buf_;
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
dst->buffer_.Size(dst->CurrentSize() + copy_len);
|
|
228
|
+
assert(copy_len <= dst->buffer_.Capacity() - dst->buffer_.CurrentSize());
|
|
229
|
+
dst->buffer_.Append(src->buffer_.BufferStart() + copy_offset, copy_len);
|
|
201
230
|
|
|
202
231
|
// Update offset and length.
|
|
203
232
|
offset += copy_len;
|
|
@@ -208,6 +237,7 @@ void FilePrefetchBuffer::CopyDataToBuffer(BufferInfo* src, uint64_t& offset,
|
|
|
208
237
|
if (length > 0) {
|
|
209
238
|
FreeFrontBuffer();
|
|
210
239
|
}
|
|
240
|
+
TEST_SYNC_POINT("FilePrefetchBuffer::CopyDataToOverlapBuffer:Complete");
|
|
211
241
|
}
|
|
212
242
|
|
|
213
243
|
// Clear the buffers if it contains outdated data. Outdated data can be because
|
|
@@ -355,7 +385,7 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
|
|
|
355
385
|
// of ReadAsync to make sure it doesn't read anything from
|
|
356
386
|
// previous buffer which is already prefetched.
|
|
357
387
|
void FilePrefetchBuffer::ReadAheadSizeTuning(
|
|
358
|
-
BufferInfo* buf, bool read_curr_block, bool refit_tail,
|
|
388
|
+
BufferInfo* buf, bool read_curr_block, bool refit_tail, bool use_fs_buffer,
|
|
359
389
|
uint64_t prev_buf_end_offset, size_t alignment, size_t length,
|
|
360
390
|
size_t readahead_size, uint64_t& start_offset, uint64_t& end_offset,
|
|
361
391
|
size_t& read_len, uint64_t& aligned_useful_len) {
|
|
@@ -408,7 +438,7 @@ void FilePrefetchBuffer::ReadAheadSizeTuning(
|
|
|
408
438
|
uint64_t roundup_len = end_offset - start_offset;
|
|
409
439
|
|
|
410
440
|
PrepareBufferForRead(buf, alignment, start_offset, roundup_len, refit_tail,
|
|
411
|
-
aligned_useful_len);
|
|
441
|
+
use_fs_buffer, aligned_useful_len);
|
|
412
442
|
assert(roundup_len >= aligned_useful_len);
|
|
413
443
|
|
|
414
444
|
// Update the buffer offset.
|
|
@@ -422,11 +452,43 @@ void FilePrefetchBuffer::ReadAheadSizeTuning(
|
|
|
422
452
|
(end_offset - start_offset));
|
|
423
453
|
}
|
|
424
454
|
|
|
455
|
+
// This is for when num_buffers_ = 1.
|
|
456
|
+
// If we are reusing the file system allocated buffer, and only some of the
|
|
457
|
+
// requested data is in the buffer, we copy the relevant data to overlap_buf_
|
|
458
|
+
void FilePrefetchBuffer::HandleOverlappingSyncData(uint64_t offset,
|
|
459
|
+
size_t length,
|
|
460
|
+
uint64_t& tmp_offset,
|
|
461
|
+
size_t& tmp_length,
|
|
462
|
+
bool& use_overlap_buffer) {
|
|
463
|
+
if (IsBufferQueueEmpty()) {
|
|
464
|
+
return;
|
|
465
|
+
}
|
|
466
|
+
BufferInfo* buf = GetFirstBuffer();
|
|
467
|
+
// We should only be calling this when num_buffers_ = 1, so there should
|
|
468
|
+
// not be any async reads.
|
|
469
|
+
assert(!buf->async_read_in_progress_);
|
|
470
|
+
|
|
471
|
+
if (!buf->async_read_in_progress_ && buf->DoesBufferContainData() &&
|
|
472
|
+
buf->IsOffsetInBuffer(offset) &&
|
|
473
|
+
buf->offset_ + buf->CurrentSize() < offset + length) {
|
|
474
|
+
// Allocated overlap_buf_ is just enough to hold the result for the user
|
|
475
|
+
// Alignment does not matter here
|
|
476
|
+
use_overlap_buffer = true;
|
|
477
|
+
overlap_buf_->ClearBuffer();
|
|
478
|
+
overlap_buf_->buffer_.Alignment(1);
|
|
479
|
+
overlap_buf_->buffer_.AllocateNewBuffer(length);
|
|
480
|
+
overlap_buf_->offset_ = offset;
|
|
481
|
+
CopyDataToOverlapBuffer(buf, tmp_offset, tmp_length);
|
|
482
|
+
UpdateStats(/*found_in_buffer=*/false, overlap_buf_->CurrentSize());
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
// This is for when num_buffers_ > 1.
|
|
425
487
|
// If data is overlapping between two buffers then during this call:
|
|
426
488
|
// - data from first buffer is copied into overlapping buffer,
|
|
427
489
|
// - first is removed from bufs_ and freed so that it can be used for async
|
|
428
490
|
// prefetching of further data.
|
|
429
|
-
Status FilePrefetchBuffer::
|
|
491
|
+
Status FilePrefetchBuffer::HandleOverlappingAsyncData(
|
|
430
492
|
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
|
|
431
493
|
size_t length, size_t readahead_size, bool& copy_to_overlap_buffer,
|
|
432
494
|
uint64_t& tmp_offset, size_t& tmp_length) {
|
|
@@ -436,7 +498,7 @@ Status FilePrefetchBuffer::HandleOverlappingData(
|
|
|
436
498
|
}
|
|
437
499
|
|
|
438
500
|
Status s;
|
|
439
|
-
size_t alignment =
|
|
501
|
+
size_t alignment = GetRequiredBufferAlignment(reader);
|
|
440
502
|
|
|
441
503
|
BufferInfo* buf = GetFirstBuffer();
|
|
442
504
|
|
|
@@ -470,7 +532,7 @@ Status FilePrefetchBuffer::HandleOverlappingData(
|
|
|
470
532
|
overlap_buf_->offset_ = offset;
|
|
471
533
|
copy_to_overlap_buffer = true;
|
|
472
534
|
|
|
473
|
-
|
|
535
|
+
CopyDataToOverlapBuffer(buf, tmp_offset, tmp_length);
|
|
474
536
|
UpdateStats(/*found_in_buffer=*/false, overlap_buf_->CurrentSize());
|
|
475
537
|
|
|
476
538
|
// Call async prefetching on freed buffer since data has been consumed
|
|
@@ -495,8 +557,8 @@ Status FilePrefetchBuffer::HandleOverlappingData(
|
|
|
495
557
|
uint64_t end_offset = start_offset, aligned_useful_len = 0;
|
|
496
558
|
|
|
497
559
|
ReadAheadSizeTuning(new_buf, /*read_curr_block=*/false,
|
|
498
|
-
/*refit_tail=*/false,
|
|
499
|
-
alignment,
|
|
560
|
+
/*refit_tail=*/false, /*use_fs_buffer=*/false,
|
|
561
|
+
next_buf->offset_ + second_size, alignment,
|
|
500
562
|
/*length=*/0, readahead_size, start_offset,
|
|
501
563
|
end_offset, read_len, aligned_useful_len);
|
|
502
564
|
if (read_len > 0) {
|
|
@@ -537,7 +599,7 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
|
|
|
537
599
|
|
|
538
600
|
TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start");
|
|
539
601
|
|
|
540
|
-
size_t alignment =
|
|
602
|
+
size_t alignment = GetRequiredBufferAlignment(reader);
|
|
541
603
|
Status s;
|
|
542
604
|
uint64_t tmp_offset = offset;
|
|
543
605
|
size_t tmp_length = length;
|
|
@@ -550,12 +612,20 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
|
|
|
550
612
|
}
|
|
551
613
|
ClearOutdatedData(offset, length);
|
|
552
614
|
|
|
553
|
-
// Handle overlapping data over two buffers.
|
|
554
|
-
s =
|
|
555
|
-
|
|
615
|
+
// Handle overlapping data over two buffers (async prefetching case).
|
|
616
|
+
s = HandleOverlappingAsyncData(opts, reader, offset, length, readahead_size,
|
|
617
|
+
copy_to_overlap_buffer, tmp_offset,
|
|
618
|
+
tmp_length);
|
|
556
619
|
if (!s.ok()) {
|
|
557
620
|
return s;
|
|
558
621
|
}
|
|
622
|
+
// Handle partially available data when reusing the file system buffer
|
|
623
|
+
// and num_buffers_ = 1 (sync prefetching case)
|
|
624
|
+
bool use_fs_buffer = UseFSBuffer(reader);
|
|
625
|
+
if (!copy_to_overlap_buffer && use_fs_buffer) {
|
|
626
|
+
HandleOverlappingSyncData(offset, length, tmp_offset, tmp_length,
|
|
627
|
+
copy_to_overlap_buffer);
|
|
628
|
+
}
|
|
559
629
|
|
|
560
630
|
AllocateBufferIfEmpty();
|
|
561
631
|
BufferInfo* buf = GetFirstBuffer();
|
|
@@ -586,8 +656,18 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
|
|
|
586
656
|
if (copy_to_overlap_buffer) {
|
|
587
657
|
// Data is overlapping i.e. some of the data has been copied to overlap
|
|
588
658
|
// buffer and remaining will be updated below.
|
|
659
|
+
// Note: why do we not end up performing a duplicate copy when we already
|
|
660
|
+
// copy to the overlap buffer in HandleOverlappingAsyncData /
|
|
661
|
+
// HandleOverlappingSyncData? The reason is that when we call
|
|
662
|
+
// CopyDataToOverlapBuffer, if the buffer is only a "partial hit", then we
|
|
663
|
+
// clear it out since it does not have any more useful data once we copy
|
|
664
|
+
// to the overlap buffer. Once we reallocate a fresh buffer, that buffer
|
|
665
|
+
// will have no data, and it will be the "first" buffer when num_buffers_
|
|
666
|
+
// = 1. When num_buffers_ > 1, we call ClearOutdatedData() so we know
|
|
667
|
+
// that, if we get to this point in the control flow, the "front" buffer
|
|
668
|
+
// has to have the data we need.
|
|
589
669
|
size_t initial_buf_size = overlap_buf_->CurrentSize();
|
|
590
|
-
|
|
670
|
+
CopyDataToOverlapBuffer(buf, offset, length);
|
|
591
671
|
UpdateStats(
|
|
592
672
|
/*found_in_buffer=*/false,
|
|
593
673
|
overlap_buf_->CurrentSize() - initial_buf_size);
|
|
@@ -636,10 +716,10 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
|
|
|
636
716
|
UpdateStats(/*found_in_buffer=*/false,
|
|
637
717
|
(buf->offset_ + buf->CurrentSize() - offset));
|
|
638
718
|
}
|
|
639
|
-
ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail
|
|
640
|
-
true,
|
|
641
|
-
|
|
642
|
-
aligned_useful_len1);
|
|
719
|
+
ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail=*/
|
|
720
|
+
true, /*use_fs_buffer=*/use_fs_buffer, start_offset1,
|
|
721
|
+
alignment, length, readahead_size, start_offset1,
|
|
722
|
+
end_offset1, read_len1, aligned_useful_len1);
|
|
643
723
|
} else {
|
|
644
724
|
UpdateStats(/*found_in_buffer=*/true, original_length);
|
|
645
725
|
}
|
|
@@ -654,7 +734,8 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
|
|
|
654
734
|
}
|
|
655
735
|
|
|
656
736
|
if (read_len1 > 0) {
|
|
657
|
-
s = Read(buf, opts, reader, read_len1, aligned_useful_len1, start_offset1
|
|
737
|
+
s = Read(buf, opts, reader, read_len1, aligned_useful_len1, start_offset1,
|
|
738
|
+
use_fs_buffer);
|
|
658
739
|
if (!s.ok()) {
|
|
659
740
|
AbortAllIOs();
|
|
660
741
|
FreeAllBuffers();
|
|
@@ -662,10 +743,10 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
|
|
|
662
743
|
}
|
|
663
744
|
}
|
|
664
745
|
|
|
665
|
-
// Copy remaining requested bytes to
|
|
666
|
-
// as data is prefetched during this call.
|
|
746
|
+
// Copy remaining requested bytes to overlap_buf_. No need to
|
|
747
|
+
// update stats as data is prefetched during this call.
|
|
667
748
|
if (copy_to_overlap_buffer && length > 0) {
|
|
668
|
-
|
|
749
|
+
CopyDataToOverlapBuffer(buf, offset, length);
|
|
669
750
|
}
|
|
670
751
|
return s;
|
|
671
752
|
}
|
|
@@ -782,6 +863,7 @@ bool FilePrefetchBuffer::TryReadFromCacheUntracked(
|
|
|
782
863
|
if (copy_to_overlap_buffer) {
|
|
783
864
|
buf = overlap_buf_;
|
|
784
865
|
}
|
|
866
|
+
assert(buf->offset_ <= offset);
|
|
785
867
|
uint64_t offset_in_buffer = offset - buf->offset_;
|
|
786
868
|
*result = Slice(buf->buffer_.BufferStart() + offset_in_buffer, n);
|
|
787
869
|
if (prefetched) {
|
|
@@ -892,7 +974,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
|
|
|
892
974
|
std::string msg;
|
|
893
975
|
|
|
894
976
|
Status s;
|
|
895
|
-
size_t alignment =
|
|
977
|
+
size_t alignment = GetRequiredBufferAlignment(reader);
|
|
896
978
|
size_t readahead_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0;
|
|
897
979
|
size_t offset_to_read = static_cast<size_t>(offset);
|
|
898
980
|
uint64_t start_offset1 = offset, end_offset1 = 0, aligned_useful_len1 = 0;
|
|
@@ -915,6 +997,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
|
|
|
915
997
|
// Prefetch full data + readahead_size in the first buffer.
|
|
916
998
|
if (is_eligible_for_prefetching || reader->use_direct_io()) {
|
|
917
999
|
ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail=*/false,
|
|
1000
|
+
/*use_fs_buffer=*/false,
|
|
918
1001
|
/*prev_buf_end_offset=*/start_offset1, alignment, n,
|
|
919
1002
|
readahead_size, start_offset1, end_offset1, read_len1,
|
|
920
1003
|
aligned_useful_len1);
|
|
@@ -923,7 +1006,8 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
|
|
|
923
1006
|
start_offset1 = offset_to_read;
|
|
924
1007
|
end_offset1 = offset_to_read + n;
|
|
925
1008
|
roundup_len1 = end_offset1 - start_offset1;
|
|
926
|
-
PrepareBufferForRead(buf, alignment, start_offset1, roundup_len1,
|
|
1009
|
+
PrepareBufferForRead(buf, alignment, start_offset1, roundup_len1,
|
|
1010
|
+
/*refit_tail=*/false, /*use_fs_buffer=*/false,
|
|
927
1011
|
aligned_useful_len1);
|
|
928
1012
|
assert(aligned_useful_len1 == 0);
|
|
929
1013
|
assert(roundup_len1 >= aligned_useful_len1);
|
|
@@ -970,7 +1054,7 @@ Status FilePrefetchBuffer::PrefetchRemBuffers(const IOOptions& opts,
|
|
|
970
1054
|
uint64_t end_offset2 = start_offset2, aligned_useful_len2 = 0;
|
|
971
1055
|
size_t read_len2 = 0;
|
|
972
1056
|
ReadAheadSizeTuning(new_buf, /*read_curr_block=*/false,
|
|
973
|
-
/*refit_tail=*/false,
|
|
1057
|
+
/*refit_tail=*/false, /*use_fs_buffer=*/false,
|
|
974
1058
|
/*prev_buf_end_offset=*/end_offset1, alignment,
|
|
975
1059
|
/*length=*/0, readahead_size, start_offset2,
|
|
976
1060
|
end_offset2, read_len2, aligned_useful_len2);
|