@nxtedition/rocksdb 6.0.2 → 7.0.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/BUILDING.md +12 -4
- package/binding.cc +589 -128
- package/chained-batch.js +6 -6
- package/deps/rocksdb/rocksdb/CMakeLists.txt +9 -0
- package/deps/rocksdb/rocksdb/Makefile +16 -5
- package/deps/rocksdb/rocksdb/TARGETS +23 -2
- package/deps/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake +7 -0
- package/deps/rocksdb/rocksdb/cmake/modules/FindJeMalloc.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/FindNUMA.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/FindTBB.cmake +33 -0
- package/deps/rocksdb/rocksdb/cmake/modules/Findgflags.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/Findlz4.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/Finduring.cmake +26 -0
- package/deps/rocksdb/rocksdb/cmake/modules/Findzstd.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/ReadVersion.cmake +10 -0
- package/deps/rocksdb/rocksdb/db/builder.cc +12 -4
- package/deps/rocksdb/rocksdb/db/c.cc +26 -0
- package/deps/rocksdb/rocksdb/db/c_test.c +3 -0
- package/deps/rocksdb/rocksdb/db/column_family.cc +8 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +29 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +8 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +16 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +16 -0
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +402 -30
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +2 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +14 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +7 -5
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +33 -7
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +54 -23
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +3 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +8 -1
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +16 -0
- package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +14 -15
- package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +331 -0
- package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +5 -0
- package/deps/rocksdb/rocksdb/db/db_test.cc +16 -0
- package/deps/rocksdb/rocksdb/db/db_test2.cc +221 -92
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +6 -2
- package/deps/rocksdb/rocksdb/db/db_test_util.h +4 -2
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +1 -171
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_test_util.cc +96 -0
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_test_util.h +126 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +1 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +57 -0
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +13 -2
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +2 -0
- package/deps/rocksdb/rocksdb/db/flush_job.cc +10 -11
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +11 -1
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +6 -0
- package/deps/rocksdb/rocksdb/db/repair.cc +12 -1
- package/deps/rocksdb/rocksdb/db/repair_test.cc +32 -10
- package/deps/rocksdb/rocksdb/db/snapshot_impl.h +3 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +19 -127
- package/deps/rocksdb/rocksdb/db/table_cache.h +3 -2
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +140 -0
- package/deps/rocksdb/rocksdb/db/version_builder_test.cc +130 -128
- package/deps/rocksdb/rocksdb/db/version_edit.cc +20 -0
- package/deps/rocksdb/rocksdb/db/version_edit.h +13 -4
- package/deps/rocksdb/rocksdb/db/version_edit_test.cc +14 -14
- package/deps/rocksdb/rocksdb/db/version_set.cc +205 -212
- package/deps/rocksdb/rocksdb/db/version_set.h +11 -0
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +154 -0
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +10 -9
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +2 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +13 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +15 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +159 -65
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +43 -21
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +142 -17
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +23 -27
- package/deps/rocksdb/rocksdb/file/writable_file_writer.h +2 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +23 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +14 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +17 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/snapshot.h +4 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +189 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +1 -1
- package/deps/rocksdb/rocksdb/options/cf_options.cc +13 -0
- package/deps/rocksdb/rocksdb/options/db_options.cc +8 -0
- package/deps/rocksdb/rocksdb/options/db_options.h +1 -0
- package/deps/rocksdb/rocksdb/options/options.cc +7 -0
- package/deps/rocksdb/rocksdb/options/options_helper.cc +4 -0
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +6 -4
- package/deps/rocksdb/rocksdb/options/options_test.cc +107 -9
- package/deps/rocksdb/rocksdb/src.mk +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +9 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +80 -6
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +8 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +81 -757
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +21 -15
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +9 -3
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +754 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +8 -0
- package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +1 -10
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +59 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.h +18 -0
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +0 -61
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +0 -13
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -2
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +2 -2
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +60 -2
- package/deps/rocksdb/rocksdb/table/block_fetcher.h +2 -0
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +39 -0
- package/deps/rocksdb/rocksdb/table/multiget_context.h +46 -2
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +2 -1
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +1 -1
- package/deps/rocksdb/rocksdb/table/table_reader.h +13 -0
- package/deps/rocksdb/rocksdb/table/unique_id.cc +27 -0
- package/deps/rocksdb/rocksdb/table/unique_id_impl.h +3 -0
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +23 -7
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +9 -1
- package/deps/rocksdb/rocksdb/util/async_file_reader.cc +72 -0
- package/deps/rocksdb/rocksdb/util/async_file_reader.h +144 -0
- package/deps/rocksdb/rocksdb/util/compression.h +49 -0
- package/deps/rocksdb/rocksdb/util/coro_utils.h +111 -0
- package/deps/rocksdb/rocksdb/util/single_thread_executor.h +55 -0
- package/deps/rocksdb/rocksdb.gyp +16 -15
- package/index.js +186 -3
- package/iterator.js +1 -0
- package/package-lock.json +23687 -0
- package/package.json +2 -30
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/deps/liburing/liburing/README +0 -46
- package/deps/liburing/liburing/test/232c93d07b74-test.c +0 -305
- package/deps/liburing/liburing/test/35fa71a030ca-test.c +0 -329
- package/deps/liburing/liburing/test/500f9fbadef8-test.c +0 -89
- package/deps/liburing/liburing/test/7ad0e4b2f83c-test.c +0 -93
- package/deps/liburing/liburing/test/8a9973408177-test.c +0 -106
- package/deps/liburing/liburing/test/917257daa0fe-test.c +0 -53
- package/deps/liburing/liburing/test/Makefile +0 -312
- package/deps/liburing/liburing/test/a0908ae19763-test.c +0 -58
- package/deps/liburing/liburing/test/a4c0b3decb33-test.c +0 -180
- package/deps/liburing/liburing/test/accept-link.c +0 -251
- package/deps/liburing/liburing/test/accept-reuse.c +0 -164
- package/deps/liburing/liburing/test/accept-test.c +0 -79
- package/deps/liburing/liburing/test/accept.c +0 -476
- package/deps/liburing/liburing/test/across-fork.c +0 -283
- package/deps/liburing/liburing/test/b19062a56726-test.c +0 -53
- package/deps/liburing/liburing/test/b5837bd5311d-test.c +0 -77
- package/deps/liburing/liburing/test/ce593a6c480a-test.c +0 -135
- package/deps/liburing/liburing/test/close-opath.c +0 -122
- package/deps/liburing/liburing/test/config +0 -10
- package/deps/liburing/liburing/test/connect.c +0 -398
- package/deps/liburing/liburing/test/cq-full.c +0 -96
- package/deps/liburing/liburing/test/cq-overflow.c +0 -294
- package/deps/liburing/liburing/test/cq-peek-batch.c +0 -102
- package/deps/liburing/liburing/test/cq-ready.c +0 -94
- package/deps/liburing/liburing/test/cq-size.c +0 -58
- package/deps/liburing/liburing/test/d4ae271dfaae-test.c +0 -96
- package/deps/liburing/liburing/test/d77a67ed5f27-test.c +0 -65
- package/deps/liburing/liburing/test/defer.c +0 -307
- package/deps/liburing/liburing/test/double-poll-crash.c +0 -186
- package/deps/liburing/liburing/test/eeed8b54e0df-test.c +0 -114
- package/deps/liburing/liburing/test/empty-eownerdead.c +0 -42
- package/deps/liburing/liburing/test/eventfd-disable.c +0 -151
- package/deps/liburing/liburing/test/eventfd-ring.c +0 -97
- package/deps/liburing/liburing/test/eventfd.c +0 -112
- package/deps/liburing/liburing/test/fadvise.c +0 -202
- package/deps/liburing/liburing/test/fallocate.c +0 -249
- package/deps/liburing/liburing/test/fc2a85cb02ef-test.c +0 -138
- package/deps/liburing/liburing/test/file-register.c +0 -843
- package/deps/liburing/liburing/test/file-update.c +0 -173
- package/deps/liburing/liburing/test/files-exit-hang-poll.c +0 -128
- package/deps/liburing/liburing/test/files-exit-hang-timeout.c +0 -134
- package/deps/liburing/liburing/test/fixed-link.c +0 -90
- package/deps/liburing/liburing/test/fsync.c +0 -224
- package/deps/liburing/liburing/test/hardlink.c +0 -136
- package/deps/liburing/liburing/test/helpers.c +0 -135
- package/deps/liburing/liburing/test/helpers.h +0 -67
- package/deps/liburing/liburing/test/io-cancel.c +0 -537
- package/deps/liburing/liburing/test/io_uring_enter.c +0 -296
- package/deps/liburing/liburing/test/io_uring_register.c +0 -664
- package/deps/liburing/liburing/test/io_uring_setup.c +0 -192
- package/deps/liburing/liburing/test/iopoll.c +0 -366
- package/deps/liburing/liburing/test/lfs-openat-write.c +0 -117
- package/deps/liburing/liburing/test/lfs-openat.c +0 -273
- package/deps/liburing/liburing/test/link-timeout.c +0 -1107
- package/deps/liburing/liburing/test/link.c +0 -496
- package/deps/liburing/liburing/test/link_drain.c +0 -229
- package/deps/liburing/liburing/test/madvise.c +0 -195
- package/deps/liburing/liburing/test/mkdir.c +0 -108
- package/deps/liburing/liburing/test/multicqes_drain.c +0 -383
- package/deps/liburing/liburing/test/nop-all-sizes.c +0 -107
- package/deps/liburing/liburing/test/nop.c +0 -115
- package/deps/liburing/liburing/test/open-close.c +0 -146
- package/deps/liburing/liburing/test/openat2.c +0 -240
- package/deps/liburing/liburing/test/personality.c +0 -204
- package/deps/liburing/liburing/test/pipe-eof.c +0 -81
- package/deps/liburing/liburing/test/pipe-reuse.c +0 -105
- package/deps/liburing/liburing/test/poll-cancel-ton.c +0 -139
- package/deps/liburing/liburing/test/poll-cancel.c +0 -135
- package/deps/liburing/liburing/test/poll-link.c +0 -227
- package/deps/liburing/liburing/test/poll-many.c +0 -208
- package/deps/liburing/liburing/test/poll-mshot-update.c +0 -273
- package/deps/liburing/liburing/test/poll-ring.c +0 -48
- package/deps/liburing/liburing/test/poll-v-poll.c +0 -353
- package/deps/liburing/liburing/test/poll.c +0 -109
- package/deps/liburing/liburing/test/probe.c +0 -137
- package/deps/liburing/liburing/test/read-write.c +0 -876
- package/deps/liburing/liburing/test/register-restrictions.c +0 -633
- package/deps/liburing/liburing/test/rename.c +0 -134
- package/deps/liburing/liburing/test/ring-leak.c +0 -173
- package/deps/liburing/liburing/test/ring-leak2.c +0 -249
- package/deps/liburing/liburing/test/rsrc_tags.c +0 -449
- package/deps/liburing/liburing/test/runtests-loop.sh +0 -16
- package/deps/liburing/liburing/test/runtests.sh +0 -170
- package/deps/liburing/liburing/test/rw_merge_test.c +0 -97
- package/deps/liburing/liburing/test/self.c +0 -91
- package/deps/liburing/liburing/test/send_recv.c +0 -291
- package/deps/liburing/liburing/test/send_recvmsg.c +0 -345
- package/deps/liburing/liburing/test/sendmsg_fs_cve.c +0 -198
- package/deps/liburing/liburing/test/shared-wq.c +0 -84
- package/deps/liburing/liburing/test/short-read.c +0 -75
- package/deps/liburing/liburing/test/shutdown.c +0 -163
- package/deps/liburing/liburing/test/sigfd-deadlock.c +0 -74
- package/deps/liburing/liburing/test/socket-rw-eagain.c +0 -156
- package/deps/liburing/liburing/test/socket-rw.c +0 -147
- package/deps/liburing/liburing/test/splice.c +0 -511
- package/deps/liburing/liburing/test/sq-full-cpp.cc +0 -45
- package/deps/liburing/liburing/test/sq-full.c +0 -45
- package/deps/liburing/liburing/test/sq-poll-dup.c +0 -200
- package/deps/liburing/liburing/test/sq-poll-kthread.c +0 -168
- package/deps/liburing/liburing/test/sq-poll-share.c +0 -137
- package/deps/liburing/liburing/test/sq-space_left.c +0 -159
- package/deps/liburing/liburing/test/sqpoll-cancel-hang.c +0 -159
- package/deps/liburing/liburing/test/sqpoll-disable-exit.c +0 -195
- package/deps/liburing/liburing/test/sqpoll-exit-hang.c +0 -77
- package/deps/liburing/liburing/test/sqpoll-sleep.c +0 -68
- package/deps/liburing/liburing/test/statx.c +0 -172
- package/deps/liburing/liburing/test/stdout.c +0 -232
- package/deps/liburing/liburing/test/submit-link-fail.c +0 -154
- package/deps/liburing/liburing/test/submit-reuse.c +0 -239
- package/deps/liburing/liburing/test/symlink.c +0 -116
- package/deps/liburing/liburing/test/teardowns.c +0 -58
- package/deps/liburing/liburing/test/thread-exit.c +0 -131
- package/deps/liburing/liburing/test/timeout-new.c +0 -246
- package/deps/liburing/liburing/test/timeout-overflow.c +0 -204
- package/deps/liburing/liburing/test/timeout.c +0 -1354
- package/deps/liburing/liburing/test/unlink.c +0 -111
- package/deps/liburing/liburing/test/wakeup-hang.c +0 -162
- package/deps/rocksdb/rocksdb/README.md +0 -32
- package/deps/rocksdb/rocksdb/microbench/README.md +0 -60
- package/deps/rocksdb/rocksdb/plugin/README.md +0 -43
- package/deps/rocksdb/rocksdb/port/README +0 -10
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README +0 -13
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
|
|
2
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
3
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
4
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
5
|
+
|
|
6
|
+
#include "util/coro_utils.h"
|
|
7
|
+
|
|
8
|
+
#if defined(WITHOUT_COROUTINES) || \
|
|
9
|
+
(defined(USE_COROUTINES) && defined(WITH_COROUTINES))
|
|
10
|
+
|
|
11
|
+
namespace ROCKSDB_NAMESPACE {
|
|
12
|
+
|
|
13
|
+
// Lookup a batch of keys in a single SST file
|
|
14
|
+
DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
|
|
15
|
+
(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
|
|
16
|
+
bool is_hit_file_last_in_level, FdWithKeyRange* f,
|
|
17
|
+
std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs,
|
|
18
|
+
uint64_t& num_filter_read, uint64_t& num_index_read, uint64_t& num_data_read,
|
|
19
|
+
uint64_t& num_sst_read) {
|
|
20
|
+
bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
|
|
21
|
+
get_perf_context()->per_level_perf_context_enabled;
|
|
22
|
+
|
|
23
|
+
Status s;
|
|
24
|
+
StopWatchNano timer(clock_, timer_enabled /* auto_start */);
|
|
25
|
+
s = CO_AWAIT(table_cache_->MultiGet)(
|
|
26
|
+
read_options, *internal_comparator(), *f->file_metadata, &file_range,
|
|
27
|
+
mutable_cf_options_.prefix_extractor,
|
|
28
|
+
cfd_->internal_stats()->GetFileReadHist(hit_file_level),
|
|
29
|
+
IsFilterSkipped(static_cast<int>(hit_file_level),
|
|
30
|
+
is_hit_file_last_in_level),
|
|
31
|
+
hit_file_level);
|
|
32
|
+
// TODO: examine the behavior for corrupted key
|
|
33
|
+
if (timer_enabled) {
|
|
34
|
+
PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
|
|
35
|
+
hit_file_level);
|
|
36
|
+
}
|
|
37
|
+
if (!s.ok()) {
|
|
38
|
+
// TODO: Set status for individual keys appropriately
|
|
39
|
+
for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
|
|
40
|
+
*iter->s = s;
|
|
41
|
+
file_range.MarkKeyDone(iter);
|
|
42
|
+
}
|
|
43
|
+
CO_RETURN s;
|
|
44
|
+
}
|
|
45
|
+
uint64_t batch_size = 0;
|
|
46
|
+
for (auto iter = file_range.begin(); s.ok() && iter != file_range.end();
|
|
47
|
+
++iter) {
|
|
48
|
+
GetContext& get_context = *iter->get_context;
|
|
49
|
+
Status* status = iter->s;
|
|
50
|
+
// The Status in the KeyContext takes precedence over GetContext state
|
|
51
|
+
// Status may be an error if there were any IO errors in the table
|
|
52
|
+
// reader. We never expect Status to be NotFound(), as that is
|
|
53
|
+
// determined by get_context
|
|
54
|
+
assert(!status->IsNotFound());
|
|
55
|
+
if (!status->ok()) {
|
|
56
|
+
file_range.MarkKeyDone(iter);
|
|
57
|
+
continue;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
if (get_context.sample()) {
|
|
61
|
+
sample_file_read_inc(f->file_metadata);
|
|
62
|
+
}
|
|
63
|
+
batch_size++;
|
|
64
|
+
num_index_read += get_context.get_context_stats_.num_index_read;
|
|
65
|
+
num_filter_read += get_context.get_context_stats_.num_filter_read;
|
|
66
|
+
num_data_read += get_context.get_context_stats_.num_data_read;
|
|
67
|
+
num_sst_read += get_context.get_context_stats_.num_sst_read;
|
|
68
|
+
// Reset these stats since they're specific to a level
|
|
69
|
+
get_context.get_context_stats_.num_index_read = 0;
|
|
70
|
+
get_context.get_context_stats_.num_filter_read = 0;
|
|
71
|
+
get_context.get_context_stats_.num_data_read = 0;
|
|
72
|
+
get_context.get_context_stats_.num_sst_read = 0;
|
|
73
|
+
|
|
74
|
+
// report the counters before returning
|
|
75
|
+
if (get_context.State() != GetContext::kNotFound &&
|
|
76
|
+
get_context.State() != GetContext::kMerge &&
|
|
77
|
+
db_statistics_ != nullptr) {
|
|
78
|
+
get_context.ReportCounters();
|
|
79
|
+
} else {
|
|
80
|
+
if (iter->max_covering_tombstone_seq > 0) {
|
|
81
|
+
// The remaining files we look at will only contain covered keys, so
|
|
82
|
+
// we stop here for this key
|
|
83
|
+
file_range.SkipKey(iter);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
switch (get_context.State()) {
|
|
87
|
+
case GetContext::kNotFound:
|
|
88
|
+
// Keep searching in other files
|
|
89
|
+
break;
|
|
90
|
+
case GetContext::kMerge:
|
|
91
|
+
// TODO: update per-level perfcontext user_key_return_count for kMerge
|
|
92
|
+
break;
|
|
93
|
+
case GetContext::kFound:
|
|
94
|
+
if (hit_file_level == 0) {
|
|
95
|
+
RecordTick(db_statistics_, GET_HIT_L0);
|
|
96
|
+
} else if (hit_file_level == 1) {
|
|
97
|
+
RecordTick(db_statistics_, GET_HIT_L1);
|
|
98
|
+
} else if (hit_file_level >= 2) {
|
|
99
|
+
RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, hit_file_level);
|
|
103
|
+
|
|
104
|
+
file_range.MarkKeyDone(iter);
|
|
105
|
+
|
|
106
|
+
if (iter->is_blob_index) {
|
|
107
|
+
if (iter->value) {
|
|
108
|
+
TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex",
|
|
109
|
+
&(*iter));
|
|
110
|
+
|
|
111
|
+
const Slice& blob_index_slice = *(iter->value);
|
|
112
|
+
BlobIndex blob_index;
|
|
113
|
+
Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
|
|
114
|
+
if (tmp_s.ok()) {
|
|
115
|
+
const uint64_t blob_file_num = blob_index.file_number();
|
|
116
|
+
blob_rqs[blob_file_num].emplace_back(
|
|
117
|
+
std::make_pair(blob_index, std::cref(*iter)));
|
|
118
|
+
} else {
|
|
119
|
+
*(iter->s) = tmp_s;
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
} else {
|
|
123
|
+
file_range.AddValueSize(iter->value->size());
|
|
124
|
+
if (file_range.GetValueSize() > read_options.value_size_soft_limit) {
|
|
125
|
+
s = Status::Aborted();
|
|
126
|
+
break;
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
continue;
|
|
130
|
+
case GetContext::kDeleted:
|
|
131
|
+
// Use empty error message for speed
|
|
132
|
+
*status = Status::NotFound();
|
|
133
|
+
file_range.MarkKeyDone(iter);
|
|
134
|
+
continue;
|
|
135
|
+
case GetContext::kCorrupt:
|
|
136
|
+
*status =
|
|
137
|
+
Status::Corruption("corrupted key for ", iter->lkey->user_key());
|
|
138
|
+
file_range.MarkKeyDone(iter);
|
|
139
|
+
continue;
|
|
140
|
+
case GetContext::kUnexpectedBlobIndex:
|
|
141
|
+
ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
|
|
142
|
+
*status = Status::NotSupported(
|
|
143
|
+
"Encounter unexpected blob index. Please open DB with "
|
|
144
|
+
"ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
|
|
145
|
+
file_range.MarkKeyDone(iter);
|
|
146
|
+
continue;
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
|
|
151
|
+
CO_RETURN s;
|
|
152
|
+
}
|
|
153
|
+
} // namespace ROCKSDB_NAMESPACE
|
|
154
|
+
#endif
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
#include "rocksdb/file_system.h"
|
|
19
19
|
#include "table/block_based/block_based_table_factory.h"
|
|
20
20
|
#include "table/mock_table.h"
|
|
21
|
+
#include "table/unique_id_impl.h"
|
|
21
22
|
#include "test_util/testharness.h"
|
|
22
23
|
#include "test_util/testutil.h"
|
|
23
24
|
#include "util/string_util.h"
|
|
@@ -49,7 +50,7 @@ class GenerateLevelFilesBriefTest : public testing::Test {
|
|
|
49
50
|
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
|
|
50
51
|
kUnknownFileCreationTime, kUnknownFileChecksum,
|
|
51
52
|
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
|
|
52
|
-
kDisableUserTimestamp);
|
|
53
|
+
kDisableUserTimestamp, kNullUniqueId64x2);
|
|
53
54
|
files_.push_back(f);
|
|
54
55
|
}
|
|
55
56
|
|
|
@@ -158,7 +159,7 @@ class VersionStorageInfoTestBase : public testing::Test {
|
|
|
158
159
|
Temperature::kUnknown, oldest_blob_file_number,
|
|
159
160
|
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
|
|
160
161
|
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
|
|
161
|
-
kDisableUserTimestamp, kDisableUserTimestamp);
|
|
162
|
+
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
|
|
162
163
|
f->compensated_file_size = file_size;
|
|
163
164
|
vstorage_.AddFile(level, f);
|
|
164
165
|
}
|
|
@@ -3222,11 +3223,11 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
|
|
|
3222
3223
|
s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr);
|
|
3223
3224
|
ASSERT_OK(s);
|
|
3224
3225
|
ASSERT_NE(0, file_size);
|
|
3225
|
-
file_metas->emplace_back(
|
|
3226
|
-
|
|
3227
|
-
|
|
3228
|
-
|
|
3229
|
-
|
|
3226
|
+
file_metas->emplace_back(
|
|
3227
|
+
file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false,
|
|
3228
|
+
Temperature::kUnknown, 0, 0, 0, kUnknownFileChecksum,
|
|
3229
|
+
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
|
|
3230
|
+
kDisableUserTimestamp, kNullUniqueId64x2);
|
|
3230
3231
|
}
|
|
3231
3232
|
}
|
|
3232
3233
|
|
|
@@ -3282,7 +3283,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
|
|
|
3282
3283
|
file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
|
|
3283
3284
|
largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
|
|
3284
3285
|
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
|
|
3285
|
-
kDisableUserTimestamp, kDisableUserTimestamp);
|
|
3286
|
+
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
|
|
3286
3287
|
added_files.emplace_back(0, meta);
|
|
3287
3288
|
}
|
|
3288
3289
|
WriteFileAdditionAndDeletionToManifest(
|
|
@@ -3338,7 +3339,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
|
|
|
3338
3339
|
file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
|
|
3339
3340
|
largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
|
|
3340
3341
|
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
|
|
3341
|
-
kDisableUserTimestamp, kDisableUserTimestamp);
|
|
3342
|
+
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
|
|
3342
3343
|
added_files.emplace_back(0, meta);
|
|
3343
3344
|
}
|
|
3344
3345
|
WriteFileAdditionAndDeletionToManifest(
|
|
@@ -221,6 +221,7 @@ DECLARE_int32(compression_max_dict_bytes);
|
|
|
221
221
|
DECLARE_int32(compression_zstd_max_train_bytes);
|
|
222
222
|
DECLARE_int32(compression_parallel_threads);
|
|
223
223
|
DECLARE_uint64(compression_max_dict_buffer_bytes);
|
|
224
|
+
DECLARE_bool(compression_use_zstd_dict_trainer);
|
|
224
225
|
DECLARE_string(checksum_type);
|
|
225
226
|
DECLARE_string(env_uri);
|
|
226
227
|
DECLARE_string(fs_uri);
|
|
@@ -292,6 +293,7 @@ DECLARE_uint64(wp_commit_cache_bits);
|
|
|
292
293
|
DECLARE_bool(adaptive_readahead);
|
|
293
294
|
DECLARE_bool(async_io);
|
|
294
295
|
DECLARE_string(wal_compression);
|
|
296
|
+
DECLARE_bool(verify_sst_unique_id_in_manifest);
|
|
295
297
|
|
|
296
298
|
constexpr long KB = 1024;
|
|
297
299
|
constexpr int kRandomValueMaxFactor = 3;
|
|
@@ -752,6 +752,13 @@ DEFINE_uint64(compression_max_dict_buffer_bytes, 0,
|
|
|
752
752
|
"Buffering limit for SST file data to sample for dictionary "
|
|
753
753
|
"compression.");
|
|
754
754
|
|
|
755
|
+
// Boolean command-line flag, default true. InitializeOptionsFromFlags copies
// it into options.compression_opts.use_zstd_dict_trainer only when
// ZSTD_FinalizeDictionarySupported() returns true; otherwise a false value
// only produces a warning on stderr and the option is not assigned.
DEFINE_bool(
|
|
756
|
+
compression_use_zstd_dict_trainer, true,
|
|
757
|
+
"Use zstd's trainer to generate dictionary. If the options is false, "
|
|
758
|
+
"zstd's finalizeDictionary() API is used to generate dictionary. "
|
|
759
|
+
"ZSTD 1.4.5+ is required. If ZSTD 1.4.5+ is not linked with the binary, "
|
|
760
|
+
"this flag will have the default value true.");
|
|
761
|
+
|
|
755
762
|
DEFINE_string(bottommost_compression_type, "disable",
|
|
756
763
|
"Algorithm to use to compress bottommost level of the database. "
|
|
757
764
|
"\"disable\" means disabling the feature");
|
|
@@ -952,4 +959,10 @@ DEFINE_bool(
|
|
|
952
959
|
DEFINE_string(wal_compression, "none",
|
|
953
960
|
"Algorithm to use for WAL compression. none to disable.");
|
|
954
961
|
|
|
962
|
+
// Boolean command-line flag, default false. InitializeOptionsFromFlags
// assigns it to options.verify_sst_unique_id_in_manifest, and
// StressTest::PrintEnv prints its value as an int.
DEFINE_bool(
|
|
963
|
+
verify_sst_unique_id_in_manifest, false,
|
|
964
|
+
"Enable DB options `verify_sst_unique_id_in_manifest`, if true, during "
|
|
965
|
+
"DB-open try verifying the SST unique id between MANIFEST and SST "
|
|
966
|
+
"properties.");
|
|
967
|
+
|
|
955
968
|
#endif // GFLAGS
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
9
9
|
//
|
|
10
10
|
|
|
11
|
+
#include "util/compression.h"
|
|
11
12
|
#ifdef GFLAGS
|
|
12
13
|
#include "db_stress_tool/db_stress_common.h"
|
|
13
14
|
#include "db_stress_tool/db_stress_compaction_filter.h"
|
|
@@ -2315,6 +2316,8 @@ void StressTest::PrintEnv() const {
|
|
|
2315
2316
|
static_cast<int>(FLAGS_user_timestamp_size));
|
|
2316
2317
|
fprintf(stdout, "WAL compression : %s\n",
|
|
2317
2318
|
FLAGS_wal_compression.c_str());
|
|
2319
|
+
fprintf(stdout, "Try verify sst unique id : %d\n",
|
|
2320
|
+
static_cast<int>(FLAGS_verify_sst_unique_id_in_manifest));
|
|
2318
2321
|
|
|
2319
2322
|
fprintf(stdout, "------------------------------------------------\n");
|
|
2320
2323
|
}
|
|
@@ -2913,6 +2916,16 @@ void InitializeOptionsFromFlags(
|
|
|
2913
2916
|
FLAGS_compression_parallel_threads;
|
|
2914
2917
|
options.compression_opts.max_dict_buffer_bytes =
|
|
2915
2918
|
FLAGS_compression_max_dict_buffer_bytes;
|
|
2919
|
+
if (ZSTD_FinalizeDictionarySupported()) {
|
|
2920
|
+
options.compression_opts.use_zstd_dict_trainer =
|
|
2921
|
+
FLAGS_compression_use_zstd_dict_trainer;
|
|
2922
|
+
} else if (!FLAGS_compression_use_zstd_dict_trainer) {
|
|
2923
|
+
fprintf(
|
|
2924
|
+
stderr,
|
|
2925
|
+
"WARNING: use_zstd_dict_trainer is false but zstd finalizeDictionary "
|
|
2926
|
+
"cannot be used because ZSTD 1.4.5+ is not linked with the binary."
|
|
2927
|
+
" zstd dictionary trainer will be used.\n");
|
|
2928
|
+
}
|
|
2916
2929
|
options.max_manifest_file_size = FLAGS_max_manifest_file_size;
|
|
2917
2930
|
options.inplace_update_support = FLAGS_in_place_update;
|
|
2918
2931
|
options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
|
|
@@ -2941,6 +2954,8 @@ void InitializeOptionsFromFlags(
|
|
|
2941
2954
|
options.level_compaction_dynamic_level_bytes =
|
|
2942
2955
|
FLAGS_level_compaction_dynamic_level_bytes;
|
|
2943
2956
|
options.track_and_verify_wals_in_manifest = true;
|
|
2957
|
+
options.verify_sst_unique_id_in_manifest =
|
|
2958
|
+
FLAGS_verify_sst_unique_id_in_manifest;
|
|
2944
2959
|
|
|
2945
2960
|
// Integrated BlobDB
|
|
2946
2961
|
options.enable_blob_files = FLAGS_enable_blob_files;
|
|
@@ -194,34 +194,7 @@ void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset,
|
|
|
194
194
|
}
|
|
195
195
|
}
|
|
196
196
|
|
|
197
|
-
|
|
198
|
-
// async_read is enabled in case of sequential reads. So when
|
|
199
|
-
// buffers are switched, we clear the curr_ buffer as we assume the data has
|
|
200
|
-
// been consumed because of sequential reads.
|
|
201
|
-
//
|
|
202
|
-
// Scenarios for prefetching asynchronously:
|
|
203
|
-
// Case1: If both buffers are empty, prefetch n bytes
|
|
204
|
-
// synchronously in curr_
|
|
205
|
-
// and prefetch readahead_size_/2 async in second buffer.
|
|
206
|
-
// Case2: If second buffer has partial or full data, make it current and
|
|
207
|
-
// prefetch readahead_size_/2 async in second buffer. In case of
|
|
208
|
-
// partial data, prefetch remaining bytes from size n synchronously to
|
|
209
|
-
// fulfill the requested bytes request.
|
|
210
|
-
// Case3: If curr_ has partial data, prefetch remaining bytes from size n
|
|
211
|
-
// synchronously in curr_ to fulfill the requested bytes request and
|
|
212
|
-
// prefetch readahead_size_/2 bytes async in second buffer.
|
|
213
|
-
// Case4: If data is in both buffers, copy requested data from curr_ and second
|
|
214
|
-
// buffer to third buffer. If all requested bytes have been copied, do
|
|
215
|
-
// the asynchronous prefetching in second buffer.
|
|
216
|
-
Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
|
|
217
|
-
RandomAccessFileReader* reader,
|
|
218
|
-
uint64_t offset, size_t length,
|
|
219
|
-
size_t readahead_size,
|
|
220
|
-
Env::IOPriority rate_limiter_priority,
|
|
221
|
-
bool& copy_to_third_buffer) {
|
|
222
|
-
if (!enable_) {
|
|
223
|
-
return Status::OK();
|
|
224
|
-
}
|
|
197
|
+
void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
|
|
225
198
|
if (async_read_in_progress_ && fs_ != nullptr) {
|
|
226
199
|
// Wait for prefetch data to complete.
|
|
227
200
|
// No mutex is needed as PrefetchAsyncCallback updates the result in second
|
|
@@ -242,11 +215,6 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
|
|
|
242
215
|
del_fn_ = nullptr;
|
|
243
216
|
}
|
|
244
217
|
|
|
245
|
-
TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");
|
|
246
|
-
Status s;
|
|
247
|
-
size_t prefetch_size = length + readahead_size;
|
|
248
|
-
|
|
249
|
-
size_t alignment = reader->file()->GetRequiredBufferAlignment();
|
|
250
218
|
// Index of second buffer.
|
|
251
219
|
uint32_t second = curr_ ^ 1;
|
|
252
220
|
|
|
@@ -273,17 +241,55 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
|
|
|
273
241
|
// outdated data and switch the buffers.
|
|
274
242
|
bufs_[curr_].buffer_.Clear();
|
|
275
243
|
curr_ = curr_ ^ 1;
|
|
276
|
-
second = curr_ ^ 1;
|
|
277
244
|
}
|
|
278
|
-
|
|
279
|
-
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// If async_read = true:
|
|
248
|
+
// async_read is enabled in case of sequential reads. So when
|
|
249
|
+
// buffers are switched, we clear the curr_ buffer as we assume the data has
|
|
250
|
+
// been consumed because of sequential reads.
|
|
251
|
+
//
|
|
252
|
+
// Scenarios for prefetching asynchronously:
|
|
253
|
+
// Case1: If both buffers are empty, prefetch n bytes
|
|
254
|
+
// synchronously in curr_
|
|
255
|
+
// and prefetch readahead_size_/2 async in second buffer.
|
|
256
|
+
// Case2: If second buffer has partial or full data, make it current and
|
|
257
|
+
// prefetch readahead_size_/2 async in second buffer. In case of
|
|
258
|
+
// partial data, prefetch remaining bytes from size n synchronously to
|
|
259
|
+
// fulfill the requested bytes request.
|
|
260
|
+
// Case3: If curr_ has partial data, prefetch remaining bytes from size n
|
|
261
|
+
// synchronously in curr_ to fulfill the requested bytes request and
|
|
262
|
+
// prefetch readahead_size_/2 bytes async in second buffer.
|
|
263
|
+
// Case4: If data is in both buffers, copy requested data from curr_ and second
|
|
264
|
+
// buffer to third buffer. If all requested bytes have been copied, do
|
|
265
|
+
// the asynchronous prefetching in second buffer.
|
|
266
|
+
Status FilePrefetchBuffer::PrefetchAsyncInternal(
|
|
267
|
+
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
|
|
268
|
+
size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority,
|
|
269
|
+
bool& copy_to_third_buffer) {
|
|
270
|
+
if (!enable_) {
|
|
271
|
+
return Status::OK();
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start");
|
|
275
|
+
|
|
276
|
+
PollAndUpdateBuffersIfNeeded(offset);
|
|
277
|
+
|
|
278
|
+
// If all the requested bytes are in curr_, it will go for async prefetching
|
|
279
|
+
// only.
|
|
280
280
|
if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
|
|
281
281
|
offset + length <=
|
|
282
282
|
bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
|
|
283
283
|
offset += length;
|
|
284
284
|
length = 0;
|
|
285
|
-
prefetch_size = readahead_size;
|
|
286
285
|
}
|
|
286
|
+
|
|
287
|
+
Status s;
|
|
288
|
+
size_t prefetch_size = length + readahead_size;
|
|
289
|
+
size_t alignment = reader->file()->GetRequiredBufferAlignment();
|
|
290
|
+
// Index of second buffer.
|
|
291
|
+
uint32_t second = curr_ ^ 1;
|
|
292
|
+
|
|
287
293
|
// Data is overlapping i.e. some of the data is in curr_ buffer and remaining
|
|
288
294
|
// in second buffer.
|
|
289
295
|
if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
|
|
@@ -315,9 +321,8 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
|
|
|
315
321
|
prefetch_size = length + readahead_size;
|
|
316
322
|
}
|
|
317
323
|
|
|
318
|
-
// Update second again if swap happened.
|
|
319
|
-
second = curr_ ^ 1;
|
|
320
324
|
size_t _offset = static_cast<size_t>(offset);
|
|
325
|
+
second = curr_ ^ 1;
|
|
321
326
|
|
|
322
327
|
// offset and size alignment for curr_ buffer with synchronous prefetching
|
|
323
328
|
uint64_t rounddown_start1 = Rounddown(_offset, alignment);
|
|
@@ -442,12 +447,23 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts,
|
|
|
442
447
|
bool FilePrefetchBuffer::TryReadFromCacheAsync(
|
|
443
448
|
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
|
|
444
449
|
size_t n, Slice* result, Status* status,
|
|
445
|
-
Env::IOPriority rate_limiter_priority
|
|
446
|
-
)
|
|
450
|
+
Env::IOPriority rate_limiter_priority) {
|
|
451
|
+
assert(async_io_);
|
|
452
|
+
|
|
447
453
|
if (track_min_offset_ && offset < min_offset_read_) {
|
|
448
454
|
min_offset_read_ = static_cast<size_t>(offset);
|
|
449
455
|
}
|
|
450
|
-
|
|
456
|
+
|
|
457
|
+
if (!enable_) {
|
|
458
|
+
return false;
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
// In case of async_io_, offset can be less than bufs_[curr_].offset_ because
|
|
462
|
+
// of non-sequential reads; PrefetchAsync can be called for any block and
|
|
463
|
+
// RocksDB will call TryReadFromCacheAsync after PrefetchAsync to Poll for
|
|
464
|
+
// requested bytes.
|
|
465
|
+
if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_ &&
|
|
466
|
+
prev_len_ != 0) {
|
|
451
467
|
return false;
|
|
452
468
|
}
|
|
453
469
|
|
|
@@ -459,35 +475,25 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
|
|
|
459
475
|
// If readahead is not enabled: return false.
|
|
460
476
|
TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
|
|
461
477
|
&readahead_size_);
|
|
462
|
-
if (offset
|
|
478
|
+
if (offset < bufs_[curr_].offset_ ||
|
|
479
|
+
offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
|
|
463
480
|
if (readahead_size_ > 0) {
|
|
464
481
|
Status s;
|
|
465
482
|
assert(reader != nullptr);
|
|
466
483
|
assert(max_readahead_size_ >= readahead_size_);
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
// Ignore status as Prefetch is not called.
|
|
474
|
-
s.PermitUncheckedError();
|
|
475
|
-
return false;
|
|
476
|
-
}
|
|
477
|
-
}
|
|
478
|
-
// async prefetching is enabled if it's implicit_auto_readahead_ or
|
|
479
|
-
// explicit readahead_size_ is passed along with ReadOptions.async_io =
|
|
480
|
-
// true.
|
|
481
|
-
if (async_io_) {
|
|
482
|
-
// Prefetch n + readahead_size_/2 synchronously as remaining
|
|
483
|
-
// readahead_size_/2 will be prefetched asynchronously.
|
|
484
|
-
s = PrefetchAsync(opts, reader, offset, n, readahead_size_ / 2,
|
|
485
|
-
rate_limiter_priority, copy_to_third_buffer);
|
|
486
|
-
} else {
|
|
487
|
-
s = Prefetch(opts, reader, offset, n + readahead_size_,
|
|
488
|
-
rate_limiter_priority);
|
|
484
|
+
|
|
485
|
+
if (implicit_auto_readahead_) {
|
|
486
|
+
if (!IsEligibleForPrefetch(offset, n)) {
|
|
487
|
+
// Ignore status as Prefetch is not called.
|
|
488
|
+
s.PermitUncheckedError();
|
|
489
|
+
return false;
|
|
489
490
|
}
|
|
490
491
|
}
|
|
492
|
+
|
|
493
|
+
// Prefetch n + readahead_size_/2 synchronously as remaining
|
|
494
|
+
// readahead_size_/2 will be prefetched asynchronously.
|
|
495
|
+
s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2,
|
|
496
|
+
rate_limiter_priority, copy_to_third_buffer);
|
|
491
497
|
if (!s.ok()) {
|
|
492
498
|
if (status) {
|
|
493
499
|
*status = s;
|
|
@@ -544,4 +550,92 @@ void FilePrefetchBuffer::PrefetchAsyncCallback(const FSReadRequest& req,
|
|
|
544
550
|
bufs_[index].buffer_.Size(current_size + req.result.size());
|
|
545
551
|
}
|
|
546
552
|
}
|
|
553
|
+
|
|
554
|
+
Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
|
|
555
|
+
RandomAccessFileReader* reader,
|
|
556
|
+
uint64_t offset, size_t n,
|
|
557
|
+
Env::IOPriority rate_limiter_priority,
|
|
558
|
+
Slice* result) {
|
|
559
|
+
assert(reader != nullptr);
|
|
560
|
+
if (!enable_) {
|
|
561
|
+
return Status::NotSupported();
|
|
562
|
+
}
|
|
563
|
+
TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");
|
|
564
|
+
|
|
565
|
+
PollAndUpdateBuffersIfNeeded(offset);
|
|
566
|
+
|
|
567
|
+
// Index of second buffer.
|
|
568
|
+
uint32_t second = curr_ ^ 1;
|
|
569
|
+
|
|
570
|
+
// Since PrefetchAsync can be called on non-sequential reads, offset can
|
|
571
|
+
// be less than buffers' offset. In that case it clears the buffer and
|
|
572
|
+
// prefetches that block.
|
|
573
|
+
if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_) {
|
|
574
|
+
bufs_[curr_].buffer_.Clear();
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
// All requested bytes are already in the curr_ buffer. So no need to Read
|
|
578
|
+
// again.
|
|
579
|
+
if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
|
|
580
|
+
offset + n <= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
|
|
581
|
+
uint64_t offset_in_buffer = offset - bufs_[curr_].offset_;
|
|
582
|
+
*result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n);
|
|
583
|
+
return Status::OK();
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
Status s;
|
|
587
|
+
size_t alignment = reader->file()->GetRequiredBufferAlignment();
|
|
588
|
+
|
|
589
|
+
// TODO akanksha: Handle the scenario if data is overlapping in 2 buffers.
|
|
590
|
+
// Currently, it covers 2 scenarios. Either one buffer (curr_) has no data or
|
|
591
|
+
// it has partial data. It ignores the contents in second buffer (overlapping
|
|
592
|
+
// data in 2 buffers) and send the request to re-read that data again.
|
|
593
|
+
|
|
594
|
+
// Clear the second buffer in order to do asynchronous prefetching.
|
|
595
|
+
bufs_[second].buffer_.Clear();
|
|
596
|
+
|
|
597
|
+
size_t offset_to_read = static_cast<size_t>(offset);
|
|
598
|
+
uint64_t rounddown_start = 0;
|
|
599
|
+
uint64_t roundup_end = 0;
|
|
600
|
+
|
|
601
|
+
if (bufs_[curr_].buffer_.CurrentSize() == 0) {
|
|
602
|
+
// Prefetch full data.
|
|
603
|
+
rounddown_start = Rounddown(offset_to_read, alignment);
|
|
604
|
+
roundup_end = Roundup(offset_to_read + n, alignment);
|
|
605
|
+
} else {
|
|
606
|
+
// Prefetch remaining data.
|
|
607
|
+
size_t rem_length = n - (bufs_[curr_].buffer_.CurrentSize() -
|
|
608
|
+
(offset - bufs_[curr_].offset_));
|
|
609
|
+
rounddown_start = bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
|
|
610
|
+
roundup_end = Roundup(rounddown_start + rem_length, alignment);
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
uint64_t roundup_len = roundup_end - rounddown_start;
|
|
614
|
+
assert(roundup_len >= alignment);
|
|
615
|
+
assert(roundup_len % alignment == 0);
|
|
616
|
+
|
|
617
|
+
uint64_t chunk_len = 0;
|
|
618
|
+
CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, second, false,
|
|
619
|
+
chunk_len);
|
|
620
|
+
|
|
621
|
+
// Update the buffer offset.
|
|
622
|
+
bufs_[second].offset_ = rounddown_start;
|
|
623
|
+
assert(roundup_len >= chunk_len);
|
|
624
|
+
|
|
625
|
+
size_t read_len = static_cast<size_t>(roundup_len - chunk_len);
|
|
626
|
+
|
|
627
|
+
s = ReadAsync(opts, reader, rate_limiter_priority, read_len, chunk_len,
|
|
628
|
+
rounddown_start, second);
|
|
629
|
+
|
|
630
|
+
if (!s.ok()) {
|
|
631
|
+
return s;
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
// Update read pattern so that TryReadFromCacheAsync can be called to Poll
|
|
635
|
+
// the data. It will return without polling if blocks are not sequential.
|
|
636
|
+
UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
|
|
637
|
+
prev_len_ = 0;
|
|
638
|
+
|
|
639
|
+
return Status::TryAgain();
|
|
640
|
+
}
|
|
547
641
|
} // namespace ROCKSDB_NAMESPACE
|