@nxtedition/rocksdb 13.5.13 → 14.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +33 -2
- package/binding.gyp +2 -2
- package/chained-batch.js +9 -16
- package/deps/rocksdb/rocksdb/BUCK +18 -1
- package/deps/rocksdb/rocksdb/CMakeLists.txt +10 -3
- package/deps/rocksdb/rocksdb/Makefile +20 -9
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +90 -13
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +88 -75
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +44 -36
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +184 -148
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +5 -11
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +116 -47
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +3 -6
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +1 -1
- package/deps/rocksdb/rocksdb/db/builder.cc +4 -2
- package/deps/rocksdb/rocksdb/db/c.cc +207 -0
- package/deps/rocksdb/rocksdb/db/c_test.c +72 -0
- package/deps/rocksdb/rocksdb/db/column_family.cc +3 -2
- package/deps/rocksdb/rocksdb/db/column_family.h +5 -0
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +2 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +51 -38
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +29 -12
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +5 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +566 -366
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +131 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +1 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +7 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +4 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +13 -14
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +12 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +97 -76
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +11 -14
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +16 -3
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +1 -0
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +448 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +22 -20
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +4 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +5 -5
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +7 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_iter.cc +104 -0
- package/deps/rocksdb/rocksdb/db/db_iter.h +4 -11
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +331 -58
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +129 -0
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +64 -0
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +40 -0
- package/deps/rocksdb/rocksdb/db/db_test2.cc +25 -15
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +42 -24
- package/deps/rocksdb/rocksdb/db/db_test_util.h +29 -14
- package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +69 -36
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +5 -4
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +8 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +275 -79
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +23 -5
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +591 -175
- package/deps/rocksdb/rocksdb/db/flush_job.cc +3 -4
- package/deps/rocksdb/rocksdb/db/log_reader.cc +5 -2
- package/deps/rocksdb/rocksdb/db/memtable.cc +84 -35
- package/deps/rocksdb/rocksdb/db/memtable.h +39 -34
- package/deps/rocksdb/rocksdb/db/merge_helper.cc +1 -0
- package/deps/rocksdb/rocksdb/db/merge_operator.cc +1 -1
- package/deps/rocksdb/rocksdb/db/multi_scan.cc +11 -5
- package/deps/rocksdb/rocksdb/db/version_edit.cc +1 -1
- package/deps/rocksdb/rocksdb/db/version_edit.h +1 -1
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +34 -14
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +28 -5
- package/deps/rocksdb/rocksdb/db/version_set.cc +159 -14
- package/deps/rocksdb/rocksdb/db/version_set.h +2 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +60 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +16 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +75 -10
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.cc +28 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +2 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +31 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +50 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +57 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +0 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +266 -35
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +5 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +0 -6
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +18 -2
- package/deps/rocksdb/rocksdb/env/env.cc +12 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +18 -0
- package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +2 -0
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +9 -5
- package/deps/rocksdb/rocksdb/env/io_posix.cc +4 -2
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +19 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +33 -31
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +42 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +93 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +43 -49
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +4 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +8 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +487 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +11 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +135 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +12 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +12 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +19 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +219 -24
- package/deps/rocksdb/rocksdb/include/rocksdb/point_lock_bench_tool.h +14 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +16 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +16 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/types.h +4 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +0 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +45 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +6 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +21 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +3 -3
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +77 -51
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +10 -13
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +16 -7
- package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +9 -4
- package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +2 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +6 -0
- package/deps/rocksdb/rocksdb/options/cf_options.cc +13 -1
- package/deps/rocksdb/rocksdb/options/cf_options.h +6 -2
- package/deps/rocksdb/rocksdb/options/options.cc +2 -0
- package/deps/rocksdb/rocksdb/options/options_helper.cc +9 -8
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +9 -5
- package/deps/rocksdb/rocksdb/port/mmap.cc +1 -1
- package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +51 -0
- package/deps/rocksdb/rocksdb/port/win/xpress_win.h +4 -0
- package/deps/rocksdb/rocksdb/src.mk +8 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1125 -765
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +35 -24
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +29 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +732 -256
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +225 -16
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +102 -26
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +2 -75
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +433 -141
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +17 -10
- package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy_impl.h +20 -0
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +112 -85
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +191 -36
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +2 -2
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +108 -31
- package/deps/rocksdb/rocksdb/table/external_table.cc +7 -3
- package/deps/rocksdb/rocksdb/table/format.cc +6 -12
- package/deps/rocksdb/rocksdb/table/format.h +10 -0
- package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
- package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +1 -1
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -1
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +5 -0
- package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -1
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +118 -46
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +9 -8
- package/deps/rocksdb/rocksdb/table/table_builder.h +5 -0
- package/deps/rocksdb/rocksdb/table/table_properties.cc +16 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +1540 -155
- package/deps/rocksdb/rocksdb/test_util/testutil.h +21 -5
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +26 -5
- package/deps/rocksdb/rocksdb/tools/ldb.cc +1 -2
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +2 -0
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -3
- package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +133 -165
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +173 -64
- package/deps/rocksdb/rocksdb/util/aligned_buffer.h +69 -0
- package/deps/rocksdb/rocksdb/util/atomic.h +6 -0
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +29 -20
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +10 -6
- package/deps/rocksdb/rocksdb/util/bit_fields.h +338 -0
- package/deps/rocksdb/rocksdb/util/coding.h +3 -3
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/util/compression.cc +777 -82
- package/deps/rocksdb/rocksdb/util/compression.h +5 -0
- package/deps/rocksdb/rocksdb/util/compression_test.cc +5 -3
- package/deps/rocksdb/rocksdb/util/dynamic_bloom.cc +2 -2
- package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +15 -14
- package/deps/rocksdb/rocksdb/util/interval_test.cc +102 -0
- package/deps/rocksdb/rocksdb/util/semaphore.h +164 -0
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +10 -6
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -2
- package/deps/rocksdb/rocksdb/util/slice_test.cc +136 -0
- package/deps/rocksdb/rocksdb/util/status.cc +1 -0
- package/deps/rocksdb/rocksdb/util/string_util.cc +2 -16
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +7 -4
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +35 -14
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_test.cc +2 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +5 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/any_lock_manager_test.h +244 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench.cc +18 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench_tool.cc +159 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +1244 -161
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +66 -12
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_stress_test.cc +103 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +1275 -8
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +40 -262
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test_common.h +78 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_validation_test_runner.h +469 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +2 -6
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +4 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +9 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +18 -9
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +2 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +72 -44
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +92 -15
- package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +6 -20
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +143 -112
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +23 -16
- package/index.js +3 -3
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
- package/util.h +38 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc +0 -17
|
@@ -37,14 +37,14 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
|
|
|
37
37
|
bool async_prefetch) {
|
|
38
38
|
// TODO(hx235): set `seek_key_prefix_for_readahead_trimming_`
|
|
39
39
|
// even when `target == nullptr` that is when `SeekToFirst()` is called
|
|
40
|
+
if (!multi_scan_status_.ok()) {
|
|
41
|
+
return;
|
|
42
|
+
}
|
|
40
43
|
if (multi_scan_) {
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
}
|
|
44
|
+
SeekMultiScan(target);
|
|
45
|
+
return;
|
|
44
46
|
}
|
|
45
47
|
|
|
46
|
-
assert(!multi_scan_);
|
|
47
|
-
|
|
48
48
|
if (target != nullptr && prefix_extractor_ &&
|
|
49
49
|
read_options_.prefix_same_as_start) {
|
|
50
50
|
const Slice& seek_user_key = ExtractUserKey(*target);
|
|
@@ -919,351 +919,827 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
|
|
|
919
919
|
ResetPreviousBlockOffset();
|
|
920
920
|
}
|
|
921
921
|
|
|
922
|
+
BlockBasedTableIterator::MultiScanState::~MultiScanState() {
  // Outstanding async reads have to be aborted before the read states are
  // destroyed, so that no completion callback runs against a destructed state.
  if (async_states.empty()) {
    return;
  }

  // Gather every read that still holds an IO handle. The two vectors are
  // filled in lock-step: one feeds AbortIO(), the other the cleanup pass.
  std::vector<void*> pending_handles;
  std::vector<AsyncReadState*> pending_states;
  for (auto& state : async_states) {
    if (state.io_handle == nullptr) {
      continue;
    }
    // A read that still owns a handle is expected to be unfinished.
    assert(!state.finished);
    pending_handles.push_back(state.io_handle);
    pending_states.push_back(&state);
  }

  if (!pending_handles.empty()) {
    IOStatus abort_status = fs->AbortIO(pending_handles);
    if (!abort_status.ok()) {
      // Debug builds report the failure and trip the assertion; release
      // builds carry on to the handle cleanup below.
#ifndef NDEBUG
      fprintf(stderr, "Error aborting async IO operations: %s\n",
              abort_status.ToString().c_str());
#endif
      assert(false);
    }
    (void)abort_status;  // Suppress unused variable warning
  }

  // Release the handles of all reads collected above.
  for (AsyncReadState* state : pending_states) {
    state->CleanUpIOHandle();
  }
}
|
|
957
|
+
|
|
922
958
|
// Note:
|
|
923
959
|
// - Iterator should not be reused for multiple multiscans or mixing
|
|
924
960
|
// multiscan with regular iterator usage.
|
|
925
961
|
// - scan ranges should be non-overlapping, and have increasing start keys.
|
|
926
962
|
// If a scan range's limit is not set, then there should only be one scan range.
|
|
927
963
|
// - After Prepare(), the iterator expects Seek to be called on the start key
|
|
928
|
-
// of each ScanOption in order. If any other
|
|
929
|
-
//
|
|
964
|
+
// of each ScanOption in order. If any other Seek is done, an error status is
|
|
965
|
+
// returned
|
|
966
|
+
// - Whenever all blocks of a scan opt are exhausted, the iterator will become
|
|
967
|
+
// invalid and UpperBoundCheckResult() will return kOutOfBound. So that the
|
|
968
|
+
// upper layer (LevelIterator) will stop scanning instead of thinking EOF is
|
|
969
|
+
// reached and continue into the next file. The only exception is for the last
|
|
970
|
+
// scan opt. If we reach the end of the last scan opt, UpperBoundCheckResult()
|
|
971
|
+
// will return kUnknown instead of kOutOfBound. This mechanism requires that
|
|
972
|
+
// scan opts are properly pruned such that there is no scan opt that is after
|
|
973
|
+
// this file's key range.
|
|
930
974
|
// FIXME: DBIter and MergingIterator may
|
|
931
975
|
// internally do Seek() on child iterators, e.g. due to
|
|
932
976
|
// ReadOptions::max_skippable_internal_keys or reseeking into range deletion
|
|
933
|
-
// end key.
|
|
934
|
-
//
|
|
935
|
-
void BlockBasedTableIterator::Prepare(
|
|
936
|
-
const std::vector<ScanOptions>* scan_opts) {
|
|
937
|
-
index_iter_->Prepare(scan_opts);
|
|
938
|
-
|
|
977
|
+
// end key. These Seeks will be handled properly, as long as the target is
|
|
978
|
+
// moving forward.
|
|
979
|
+
void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
|
|
939
980
|
assert(!multi_scan_);
|
|
981
|
+
if (!index_iter_->status().ok()) {
|
|
982
|
+
multi_scan_status_ = index_iter_->status();
|
|
983
|
+
return;
|
|
984
|
+
}
|
|
940
985
|
if (multi_scan_) {
|
|
941
986
|
multi_scan_.reset();
|
|
987
|
+
multi_scan_status_ = Status::InvalidArgument("Prepare already called");
|
|
942
988
|
return;
|
|
943
989
|
}
|
|
944
|
-
|
|
990
|
+
|
|
991
|
+
index_iter_->Prepare(multiscan_opts);
|
|
992
|
+
|
|
993
|
+
std::vector<BlockHandle> scan_block_handles;
|
|
994
|
+
std::vector<std::string> data_block_separators;
|
|
995
|
+
std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
|
|
996
|
+
const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
|
|
997
|
+
multi_scan_status_ =
|
|
998
|
+
CollectBlockHandles(scan_opts, &scan_block_handles,
|
|
999
|
+
&block_index_ranges_per_scan, &data_block_separators);
|
|
1000
|
+
if (!multi_scan_status_.ok()) {
|
|
945
1001
|
return;
|
|
946
1002
|
}
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
1003
|
+
|
|
1004
|
+
// Pin already cached blocks, collect remaining blocks to read
|
|
1005
|
+
std::vector<size_t> block_indices_to_read;
|
|
1006
|
+
std::vector<CachableEntry<Block>> pinned_data_blocks_guard(
|
|
1007
|
+
scan_block_handles.size());
|
|
1008
|
+
size_t prefetched_max_idx;
|
|
1009
|
+
multi_scan_status_ = FilterAndPinCachedBlocks(
|
|
1010
|
+
scan_block_handles, multiscan_opts, &block_indices_to_read,
|
|
1011
|
+
&pinned_data_blocks_guard, &prefetched_max_idx);
|
|
1012
|
+
if (!multi_scan_status_.ok()) {
|
|
950
1013
|
return;
|
|
951
1014
|
}
|
|
952
1015
|
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
1016
|
+
std::vector<AsyncReadState> async_states;
|
|
1017
|
+
// Maps from block index into async read request (index into async_states[])
|
|
1018
|
+
UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
|
|
1019
|
+
if (!block_indices_to_read.empty()) {
|
|
1020
|
+
std::vector<FSReadRequest> read_reqs;
|
|
1021
|
+
std::vector<std::vector<size_t>> coalesced_block_indices;
|
|
1022
|
+
PrepareIORequests(block_indices_to_read, scan_block_handles, multiscan_opts,
|
|
1023
|
+
&read_reqs, &block_idx_to_readreq_idx,
|
|
1024
|
+
&coalesced_block_indices);
|
|
1025
|
+
|
|
1026
|
+
multi_scan_status_ =
|
|
1027
|
+
ExecuteIO(scan_block_handles, multiscan_opts, coalesced_block_indices,
|
|
1028
|
+
&read_reqs, &async_states, &pinned_data_blocks_guard);
|
|
1029
|
+
if (!multi_scan_status_.ok()) {
|
|
958
1030
|
return;
|
|
959
1031
|
}
|
|
1032
|
+
}
|
|
1033
|
+
|
|
1034
|
+
// Successful Prepare, init related states so the iterator reads from prepared
|
|
1035
|
+
// blocks.
|
|
1036
|
+
multi_scan_ = std::make_unique<MultiScanState>(
|
|
1037
|
+
table_->get_rep()->ioptions.env->GetFileSystem(), multiscan_opts,
|
|
1038
|
+
std::move(pinned_data_blocks_guard), std::move(data_block_separators),
|
|
1039
|
+
std::move(block_index_ranges_per_scan),
|
|
1040
|
+
std::move(block_idx_to_readreq_idx), std::move(async_states),
|
|
1041
|
+
prefetched_max_idx);
|
|
960
1042
|
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
1043
|
+
is_index_at_curr_block_ = false;
|
|
1044
|
+
block_iter_points_to_real_block_ = false;
|
|
1045
|
+
}
|
|
1046
|
+
|
|
1047
|
+
void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
|
|
1048
|
+
if (SeekMultiScanImpl(seek_target)) {
|
|
1049
|
+
is_out_of_bound_ = true;
|
|
1050
|
+
assert(!Valid());
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
bool BlockBasedTableIterator::SeekMultiScanImpl(const Slice* seek_target) {
|
|
1055
|
+
assert(multi_scan_ && multi_scan_status_.ok());
|
|
1056
|
+
// This is a MultiScan and Preapre() has been called.
|
|
1057
|
+
|
|
1058
|
+
// Reset out of bound on seek, if it is out of bound again, it will be set
|
|
1059
|
+
// properly later in the code path
|
|
1060
|
+
is_out_of_bound_ = false;
|
|
1061
|
+
|
|
1062
|
+
// Validate seek key with scan options
|
|
1063
|
+
if (!seek_target) {
|
|
1064
|
+
// start key must be set for multi-scan
|
|
1065
|
+
multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan");
|
|
1066
|
+
return false;
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
constexpr auto out_of_bound = true;
|
|
1070
|
+
|
|
1071
|
+
// Check the case where there is no range prepared on this table
|
|
1072
|
+
if (multi_scan_->scan_opts->size() == 0) {
|
|
1073
|
+
// out of bound
|
|
1074
|
+
return out_of_bound;
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
// Check whether seek key is moving forward.
|
|
1078
|
+
if (!multi_scan_->prev_seek_key_.empty()) {
|
|
1079
|
+
if (user_comparator_.CompareWithoutTimestamp(ExtractUserKey(*seek_target),
|
|
1080
|
+
/*a_has_ts=*/true,
|
|
1081
|
+
multi_scan_->prev_seek_key_,
|
|
1082
|
+
/*b_has_ts=*/false) < 0) {
|
|
1083
|
+
// The seek target moved backward
|
|
1084
|
+
multi_scan_status_ =
|
|
1085
|
+
Status::InvalidArgument("Unexpected seek key moving backward");
|
|
1086
|
+
return false;
|
|
965
1087
|
}
|
|
1088
|
+
}
|
|
1089
|
+
multi_scan_->prev_seek_key_ = ExtractUserKey(*seek_target).ToString();
|
|
1090
|
+
|
|
1091
|
+
// There are still a few cases we need to handle
|
|
1092
|
+
// table: _____[prepared range 1]_____[prepared range 2]_____
|
|
1093
|
+
// seek : 1 2 3 4 5
|
|
1094
|
+
// Case 1: seek before the first prepared ranges, return out of bound
|
|
1095
|
+
// Case 2: seek at the beginning of a prepared range (expected case)
|
|
1096
|
+
// Case 3: seek within a prepared range (unexpected, but supported)
|
|
1097
|
+
// Case 4: seek between 2 of the prepared ranges, return out of bound
|
|
1098
|
+
// Case 5: seek after all of the prepared ranges, should move on to next file
|
|
1099
|
+
// The reason this could happen is due to seek key adjustment due to delete
|
|
1100
|
+
// range file.
|
|
1101
|
+
// E.g. LSM has 3 levels, each level has only 1 file:
|
|
1102
|
+
// L1 : key : 0---10
|
|
1103
|
+
// L2 : Delete range key : 0-5
|
|
1104
|
+
// L3 : key : 0---10
|
|
1105
|
+
// When a range 2-8 was prepared, the prepared key would be 2 on L3 file, but
|
|
1106
|
+
// the seek key would be 5, as the seek key was updated by the largest key of
|
|
1107
|
+
// delete range. This causes all of the cases above to be possible, when the
|
|
1108
|
+
// ranges are adjusted in the above examples.
|
|
1109
|
+
|
|
1110
|
+
// Allow reseek on the start of the last prepared range due to too many
|
|
1111
|
+
// tombstone
|
|
1112
|
+
multi_scan_->next_scan_idx =
|
|
1113
|
+
std::min(multi_scan_->next_scan_idx,
|
|
1114
|
+
multi_scan_->block_index_ranges_per_scan.size() - 1);
|
|
1115
|
+
|
|
1116
|
+
auto compare_next_scan_start_result =
|
|
1117
|
+
user_comparator_.CompareWithoutTimestamp(
|
|
1118
|
+
ExtractUserKey(*seek_target), /*a_has_ts=*/true,
|
|
1119
|
+
multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
|
|
1120
|
+
.range.start.value(),
|
|
1121
|
+
/*b_has_ts=*/false);
|
|
1122
|
+
|
|
1123
|
+
if (compare_next_scan_start_result != 0) {
|
|
1124
|
+
// The seek key is not exactly same as what was prepared.
|
|
1125
|
+
if (compare_next_scan_start_result < 0) {
|
|
1126
|
+
// Needs to handle Cases: 1, 3, 4
|
|
1127
|
+
//
|
|
1128
|
+
// next_scan_idx : |
|
|
1129
|
+
// V
|
|
1130
|
+
// table: _____[prepared range 1]_____[prepared range 2]_____
|
|
1131
|
+
// seek : 1 3 4
|
|
1132
|
+
|
|
1133
|
+
// Case 1: Seek key is before the start key of the first range
|
|
1134
|
+
if (multi_scan_->next_scan_idx == 0) {
|
|
1135
|
+
return out_of_bound;
|
|
1136
|
+
}
|
|
1137
|
+
// Case: 3, 4
|
|
1138
|
+
MultiScanUnexpectedSeekTarget(
|
|
1139
|
+
seek_target, std::get<0>(multi_scan_->block_index_ranges_per_scan
|
|
1140
|
+
[multi_scan_->next_scan_idx - 1]));
|
|
966
1141
|
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
1142
|
+
} else {
|
|
1143
|
+
// Needs to handle Cases: 3, 4, 5
|
|
1144
|
+
// next_scan_idx :|
|
|
1145
|
+
// V
|
|
1146
|
+
// table: ____[prepared range 1]_____[prepared range 2]_____
|
|
1147
|
+
// seek : 3 4 5
|
|
1148
|
+
MultiScanUnexpectedSeekTarget(
|
|
1149
|
+
seek_target,
|
|
1150
|
+
std::get<0>(
|
|
1151
|
+
multi_scan_
|
|
1152
|
+
->block_index_ranges_per_scan[multi_scan_->next_scan_idx]));
|
|
1153
|
+
}
|
|
1154
|
+
} else {
|
|
1155
|
+
if (multi_scan_->next_scan_idx >=
|
|
1156
|
+
multi_scan_->block_index_ranges_per_scan.size()) {
|
|
1157
|
+
// Seeking a range that is out side of prepared ranges.
|
|
1158
|
+
return out_of_bound;
|
|
1159
|
+
}
|
|
1160
|
+
// unpin block, then do a seek.
|
|
1161
|
+
if (multi_scan_->next_scan_idx > 0) {
|
|
1162
|
+
UnpinPreviousScanBlocks(multi_scan_->next_scan_idx);
|
|
1163
|
+
}
|
|
1164
|
+
|
|
1165
|
+
auto [cur_scan_start_idx, cur_scan_end_idx] =
|
|
1166
|
+
multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx];
|
|
1167
|
+
// We should have the data block already loaded
|
|
1168
|
+
++multi_scan_->next_scan_idx;
|
|
1169
|
+
if (cur_scan_start_idx >= cur_scan_end_idx) {
|
|
1170
|
+
if (multi_scan_->next_scan_idx <
|
|
1171
|
+
multi_scan_->block_index_ranges_per_scan.size()) {
|
|
1172
|
+
return out_of_bound;
|
|
1173
|
+
} else {
|
|
1174
|
+
ResetDataIter();
|
|
1175
|
+
return false;
|
|
971
1176
|
}
|
|
1177
|
+
} else {
|
|
1178
|
+
is_out_of_bound_ = false;
|
|
1179
|
+
}
|
|
1180
|
+
|
|
1181
|
+
MultiScanSeekTargetFromBlock(seek_target, cur_scan_start_idx);
|
|
1182
|
+
}
|
|
1183
|
+
|
|
1184
|
+
return false;
|
|
1185
|
+
}
|
|
1186
|
+
|
|
1187
|
+
void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
|
|
1188
|
+
const Slice* seek_target, size_t block_idx) {
|
|
1189
|
+
// linear search the block that contains the seek target, and unpin blocks
|
|
1190
|
+
// that are before it.
|
|
1191
|
+
auto const& data_block_separators = multi_scan_->data_block_separators;
|
|
1192
|
+
while (block_idx < data_block_separators.size() &&
|
|
1193
|
+
(user_comparator_.CompareWithoutTimestamp(
|
|
1194
|
+
ExtractUserKey(*seek_target), /*a_has_ts=*/true,
|
|
1195
|
+
data_block_separators[block_idx],
|
|
1196
|
+
/*b_has_ts=*/false) > 0)) {
|
|
1197
|
+
if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
|
|
1198
|
+
multi_scan_->pinned_data_blocks[block_idx].Reset();
|
|
1199
|
+
}
|
|
1200
|
+
block_idx++;
|
|
1201
|
+
}
|
|
1202
|
+
|
|
1203
|
+
if (block_idx >= data_block_separators.size()) {
|
|
1204
|
+
// Handle case 5, when seek key is larger than the last block in the last
|
|
1205
|
+
// prepared range.
|
|
1206
|
+
ResetDataIter();
|
|
1207
|
+
assert(!Valid());
|
|
1208
|
+
return;
|
|
1209
|
+
}
|
|
1210
|
+
|
|
1211
|
+
// // The iterator from previous seek may have moved forward a few blocks,
|
|
1212
|
+
// // In that case, have block_idx catch up the cur_data_block_idx
|
|
1213
|
+
// // Note no need to handle block unpin, as it has been handled during
|
|
1214
|
+
// iterating block_idx = std::max(block_idx, multi_scan_->cur_data_block_idx);
|
|
1215
|
+
|
|
1216
|
+
// advance to the right prepared range
|
|
1217
|
+
while (
|
|
1218
|
+
multi_scan_->next_scan_idx <
|
|
1219
|
+
multi_scan_->block_index_ranges_per_scan.size() &&
|
|
1220
|
+
(user_comparator_.CompareWithoutTimestamp(
|
|
1221
|
+
ExtractUserKey(*seek_target), /*a_has_ts=*/true,
|
|
1222
|
+
multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
|
|
1223
|
+
.range.start.value(),
|
|
1224
|
+
/*b_has_ts=*/false) >= 0)) {
|
|
1225
|
+
multi_scan_->next_scan_idx++;
|
|
1226
|
+
}
|
|
1227
|
+
|
|
1228
|
+
// The current block may contain the data for the target key
|
|
1229
|
+
MultiScanSeekTargetFromBlock(seek_target, block_idx);
|
|
1230
|
+
}
|
|
1231
|
+
|
|
1232
|
+
void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
|
|
1233
|
+
const Slice* seek_target, size_t block_idx) {
|
|
1234
|
+
if (!block_iter_points_to_real_block_ ||
|
|
1235
|
+
multi_scan_->cur_data_block_idx != block_idx) {
|
|
1236
|
+
if (block_iter_points_to_real_block_) {
|
|
1237
|
+
// Should be scan in increasing key range.
|
|
1238
|
+
// All blocks before cur_data_block_idx_ are not pinned anymore.
|
|
1239
|
+
assert(multi_scan_->cur_data_block_idx < block_idx);
|
|
1240
|
+
}
|
|
1241
|
+
|
|
1242
|
+
ResetDataIter();
|
|
1243
|
+
|
|
1244
|
+
if (MultiScanLoadDataBlock(block_idx)) {
|
|
1245
|
+
return;
|
|
1246
|
+
}
|
|
1247
|
+
}
|
|
1248
|
+
multi_scan_->cur_data_block_idx = block_idx;
|
|
1249
|
+
block_iter_points_to_real_block_ = true;
|
|
1250
|
+
block_iter_.Seek(*seek_target);
|
|
1251
|
+
FindKeyForward();
|
|
1252
|
+
}
|
|
1253
|
+
|
|
1254
|
+
void BlockBasedTableIterator::UnpinPreviousScanBlocks(size_t current_scan_idx) {
|
|
1255
|
+
// TODO: support aborting and clearn up async IO requests, currently
|
|
1256
|
+
// only unpins already initialized blocks
|
|
1257
|
+
assert(multi_scan_);
|
|
1258
|
+
assert(current_scan_idx < multi_scan_->block_index_ranges_per_scan.size());
|
|
1259
|
+
if (current_scan_idx == 0) return;
|
|
1260
|
+
|
|
1261
|
+
auto prev_start_block_idx = std::get<0>(
|
|
1262
|
+
multi_scan_->block_index_ranges_per_scan[current_scan_idx - 1]);
|
|
1263
|
+
// Since a block can be shared between consecutive scans, we need
|
|
1264
|
+
// curr_start_block_idx here instead of just release blocks
|
|
1265
|
+
// up to the end of previous range block index.
|
|
1266
|
+
auto curr_start_block_idx =
|
|
1267
|
+
std::get<0>(multi_scan_->block_index_ranges_per_scan[current_scan_idx]);
|
|
1268
|
+
for (size_t block_idx = prev_start_block_idx;
|
|
1269
|
+
block_idx < curr_start_block_idx; ++block_idx) {
|
|
1270
|
+
if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
|
|
1271
|
+
multi_scan_->pinned_data_blocks[block_idx].Reset();
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1274
|
+
}
|
|
972
1275
|
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
1276
|
+
void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
|
|
1277
|
+
assert(multi_scan_);
|
|
1278
|
+
assert(multi_scan_->next_scan_idx >= 1);
|
|
1279
|
+
const auto cur_scan_end_idx = std::get<1>(
|
|
1280
|
+
multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx - 1]);
|
|
1281
|
+
do {
|
|
1282
|
+
if (!block_iter_.status().ok()) {
|
|
1283
|
+
return;
|
|
1284
|
+
}
|
|
1285
|
+
|
|
1286
|
+
// If is_out_of_bound_ is true, upper layer (LevelIterator) considers this
|
|
1287
|
+
// level has reached iterate_upper_bound_ and will not continue to iterate
|
|
1288
|
+
// into the next file. When we are doing the last scan within a MultiScan
|
|
1289
|
+
// for this file, it may need to continue to scan into the next file, so
|
|
1290
|
+
// we do not set is_out_of_bound_ in this case.
|
|
1291
|
+
if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
|
|
1292
|
+
if (multi_scan_->next_scan_idx >=
|
|
1293
|
+
multi_scan_->block_index_ranges_per_scan.size()) {
|
|
1294
|
+
// We are done with this file, should let LevelIter advance to the
|
|
1295
|
+
// next file instead of ending the scan
|
|
1296
|
+
ResetDataIter();
|
|
1297
|
+
assert(!is_out_of_bound_);
|
|
1298
|
+
assert(!Valid());
|
|
977
1299
|
return;
|
|
978
1300
|
}
|
|
1301
|
+
// We don't ResetDataIter() here since next scan might be reading from
|
|
1302
|
+
// the same block. ResetDataIter() will free the underlying block cache
|
|
1303
|
+
// handle and we don't want the block to be unpinned.
|
|
1304
|
+
is_out_of_bound_ = true;
|
|
1305
|
+
assert(!Valid());
|
|
1306
|
+
return;
|
|
1307
|
+
}
|
|
1308
|
+
// Move to the next pinned data block
|
|
1309
|
+
ResetDataIter();
|
|
1310
|
+
++multi_scan_->cur_data_block_idx;
|
|
1311
|
+
|
|
1312
|
+
if (MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx)) {
|
|
1313
|
+
return;
|
|
979
1314
|
}
|
|
1315
|
+
|
|
1316
|
+
block_iter_points_to_real_block_ = true;
|
|
1317
|
+
block_iter_.SeekToFirst();
|
|
1318
|
+
} while (!block_iter_.Valid());
|
|
1319
|
+
}
|
|
1320
|
+
|
|
1321
|
+
Status BlockBasedTableIterator::PollForBlock(size_t idx) {
|
|
1322
|
+
assert(multi_scan_);
|
|
1323
|
+
const auto async_idx = multi_scan_->block_idx_to_readreq_idx.find(idx);
|
|
1324
|
+
if (async_idx == multi_scan_->block_idx_to_readreq_idx.end()) {
|
|
1325
|
+
// Did not require async read, should already be pinned.
|
|
1326
|
+
assert(multi_scan_->pinned_data_blocks[idx].GetValue());
|
|
1327
|
+
return Status::OK();
|
|
980
1328
|
}
|
|
981
1329
|
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
1330
|
+
AsyncReadState& async_read = multi_scan_->async_states[async_idx->second];
|
|
1331
|
+
if (async_read.finished) {
|
|
1332
|
+
assert(async_read.io_handle == nullptr);
|
|
1333
|
+
assert(async_read.status.ok());
|
|
1334
|
+
return async_read.status;
|
|
1335
|
+
}
|
|
1336
|
+
|
|
1337
|
+
{
|
|
1338
|
+
std::vector<void*> handles = {async_read.io_handle};
|
|
1339
|
+
Status poll_s =
|
|
1340
|
+
table_->get_rep()->ioptions.env->GetFileSystem()->Poll(handles, 1);
|
|
1341
|
+
if (!poll_s.ok()) {
|
|
1342
|
+
return poll_s;
|
|
1343
|
+
}
|
|
1344
|
+
}
|
|
1345
|
+
assert(async_read.status.ok());
|
|
1346
|
+
if (!async_read.status.ok()) {
|
|
1347
|
+
return async_read.status;
|
|
1348
|
+
}
|
|
1349
|
+
async_read.CleanUpIOHandle();
|
|
1350
|
+
|
|
1351
|
+
// Initialize and pin blocks from async read result.
|
|
1352
|
+
for (size_t i = 0; i < async_read.blocks.size(); ++i) {
|
|
1353
|
+
const auto& block = async_read.blocks[i];
|
|
1354
|
+
|
|
1355
|
+
Status s = CreateAndPinBlockFromBuffer(
|
|
1356
|
+
block, async_read.offset, async_read.result,
|
|
1357
|
+
multi_scan_->pinned_data_blocks[async_read.block_indices[i]]);
|
|
1358
|
+
|
|
1359
|
+
if (!s.ok()) {
|
|
1360
|
+
return s;
|
|
1361
|
+
}
|
|
1362
|
+
assert(multi_scan_->pinned_data_blocks[async_read.block_indices[i]]
|
|
1363
|
+
.GetValue());
|
|
1364
|
+
}
|
|
1365
|
+
assert(multi_scan_->pinned_data_blocks[idx].GetValue());
|
|
1366
|
+
return Status::OK();
|
|
1367
|
+
}
|
|
1368
|
+
|
|
1369
|
+
Status BlockBasedTableIterator::CreateAndPinBlockFromBuffer(
|
|
1370
|
+
const BlockHandle& block, uint64_t buffer_start_offset,
|
|
1371
|
+
const Slice& buffer_data, CachableEntry<Block>& pinned_block_entry) {
|
|
1372
|
+
// Get decompressor and handle dictionary loading
|
|
1373
|
+
UnownedPtr<Decompressor> decompressor = table_->get_rep()->decompressor.get();
|
|
1374
|
+
CachableEntry<DecompressorDict> cached_dict;
|
|
1375
|
+
|
|
1376
|
+
if (table_->get_rep()->uncompression_dict_reader) {
|
|
1377
|
+
{
|
|
1378
|
+
Status s =
|
|
1379
|
+
table_->get_rep()
|
|
1380
|
+
->uncompression_dict_reader->GetOrReadUncompressionDictionary(
|
|
1381
|
+
/* prefetch_buffer= */ nullptr, read_options_,
|
|
1382
|
+
/* get_context= */ nullptr, /* lookup_context= */ nullptr,
|
|
1383
|
+
&cached_dict);
|
|
1384
|
+
if (!s.ok()) {
|
|
1385
|
+
#ifndef NDEBUG
|
|
1386
|
+
fprintf(stdout, "Prepare dictionary loading failed with %s\n",
|
|
1387
|
+
s.ToString().c_str());
|
|
1388
|
+
#endif
|
|
1389
|
+
return s;
|
|
1390
|
+
}
|
|
1391
|
+
}
|
|
1392
|
+
if (!cached_dict.GetValue()) {
|
|
1393
|
+
#ifndef NDEBUG
|
|
1394
|
+
fprintf(stdout, "Success but no dictionary read\n");
|
|
1395
|
+
#endif
|
|
1396
|
+
return Status::InvalidArgument("No dictionary found");
|
|
1397
|
+
}
|
|
1398
|
+
decompressor = cached_dict.GetValue()->decompressor_.get();
|
|
1399
|
+
}
|
|
1400
|
+
|
|
1401
|
+
// Create block from buffer data
|
|
1402
|
+
const auto block_size_with_trailer =
|
|
1403
|
+
BlockBasedTable::BlockSizeWithTrailer(block);
|
|
1404
|
+
const auto block_offset_in_buffer = block.offset() - buffer_start_offset;
|
|
1405
|
+
|
|
1406
|
+
CacheAllocationPtr data =
|
|
1407
|
+
AllocateBlock(block_size_with_trailer,
|
|
1408
|
+
GetMemoryAllocator(table_->get_rep()->table_options));
|
|
1409
|
+
memcpy(data.get(), buffer_data.data() + block_offset_in_buffer,
|
|
1410
|
+
block_size_with_trailer);
|
|
1411
|
+
BlockContents tmp_contents(std::move(data), block.size());
|
|
1412
|
+
|
|
1413
|
+
#ifndef NDEBUG
|
|
1414
|
+
tmp_contents.has_trailer =
|
|
1415
|
+
table_->get_rep()->footer.GetBlockTrailerSize() > 0;
|
|
1416
|
+
#endif
|
|
990
1417
|
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
1418
|
+
return table_->CreateAndPinBlockInCache<Block_kData>(
|
|
1419
|
+
read_options_, block, decompressor, &tmp_contents,
|
|
1420
|
+
&pinned_block_entry.As<Block_kData>());
|
|
1421
|
+
}
|
|
1422
|
+
|
|
1423
|
+
constexpr auto kVerbose = false;
|
|
1424
|
+
|
|
1425
|
+
Status BlockBasedTableIterator::CollectBlockHandles(
|
|
1426
|
+
const std::vector<ScanOptions>& scan_opts,
|
|
1427
|
+
std::vector<BlockHandle>* scan_block_handles,
|
|
1428
|
+
std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
|
|
1429
|
+
std::vector<std::string>* data_block_separators) {
|
|
1430
|
+
// print file name and level
|
|
1431
|
+
if (kVerbose) {
|
|
1432
|
+
auto file_name = table_->get_rep()->file->file_name();
|
|
1433
|
+
auto level = table_->get_rep()->level;
|
|
1434
|
+
printf("file name : %s, level %d\n", file_name.c_str(), level);
|
|
1435
|
+
}
|
|
1436
|
+
for (const auto& scan_opt : scan_opts) {
|
|
1437
|
+
size_t num_blocks = 0;
|
|
1438
|
+
bool check_overlap = !scan_block_handles->empty();
|
|
1439
|
+
|
|
1440
|
+
InternalKey start_key;
|
|
1441
|
+
const size_t timestamp_size =
|
|
1442
|
+
user_comparator_.user_comparator()->timestamp_size();
|
|
1443
|
+
if (timestamp_size == 0) {
|
|
1444
|
+
start_key = InternalKey(scan_opt.range.start.value(), kMaxSequenceNumber,
|
|
1445
|
+
kValueTypeForSeek);
|
|
1446
|
+
} else {
|
|
1447
|
+
std::string seek_key;
|
|
1448
|
+
AppendKeyWithMaxTimestamp(&seek_key, scan_opt.range.start.value(),
|
|
1449
|
+
timestamp_size);
|
|
1450
|
+
start_key = InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
|
|
1451
|
+
}
|
|
995
1452
|
index_iter_->Seek(start_key.Encode());
|
|
996
|
-
while (index_iter_->Valid() &&
|
|
1453
|
+
while (index_iter_->status().ok() && index_iter_->Valid() &&
|
|
997
1454
|
(!scan_opt.range.limit.has_value() ||
|
|
998
|
-
user_comparator_.CompareWithoutTimestamp(
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1455
|
+
user_comparator_.CompareWithoutTimestamp(index_iter_->user_key(),
|
|
1456
|
+
/*a_has_ts*/ true,
|
|
1457
|
+
*scan_opt.range.limit,
|
|
1458
|
+
/*b_has_ts=*/false) < 0)) {
|
|
1459
|
+
// Only add the block if the index separator is smaller than limit. When
|
|
1460
|
+
// they are equal or larger, it will be handled later below.
|
|
1002
1461
|
if (check_overlap &&
|
|
1003
|
-
|
|
1462
|
+
scan_block_handles->back() == index_iter_->value().handle) {
|
|
1004
1463
|
// Skip the current block since it's already in the list
|
|
1005
1464
|
} else {
|
|
1006
|
-
|
|
1465
|
+
scan_block_handles->push_back(index_iter_->value().handle);
|
|
1466
|
+
// clone the Slice to avoid the lifetime issue
|
|
1467
|
+
data_block_separators->push_back(index_iter_->user_key().ToString());
|
|
1007
1468
|
}
|
|
1008
1469
|
++num_blocks;
|
|
1009
1470
|
index_iter_->Next();
|
|
1010
1471
|
check_overlap = false;
|
|
1011
1472
|
}
|
|
1012
|
-
|
|
1013
|
-
|
|
1473
|
+
|
|
1474
|
+
if (!index_iter_->status().ok()) {
|
|
1475
|
+
// Abort: index iterator error
|
|
1476
|
+
return index_iter_->status();
|
|
1477
|
+
}
|
|
1478
|
+
|
|
1014
1479
|
if (index_iter_->Valid()) {
|
|
1480
|
+
// Handle the last block when its separator is equal or larger than limit
|
|
1015
1481
|
if (check_overlap &&
|
|
1016
|
-
|
|
1482
|
+
scan_block_handles->back() == index_iter_->value().handle) {
|
|
1017
1483
|
// Skip adding the current block since it's already in the list
|
|
1018
1484
|
} else {
|
|
1019
|
-
|
|
1485
|
+
scan_block_handles->push_back(index_iter_->value().handle);
|
|
1486
|
+
data_block_separators->push_back(index_iter_->user_key().ToString());
|
|
1020
1487
|
}
|
|
1021
1488
|
++num_blocks;
|
|
1022
1489
|
}
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1490
|
+
block_index_ranges_per_scan->emplace_back(
|
|
1491
|
+
scan_block_handles->size() - num_blocks, scan_block_handles->size());
|
|
1492
|
+
if (kVerbose) {
|
|
1493
|
+
printf("separators :");
|
|
1494
|
+
for (const auto& separator : *data_block_separators) {
|
|
1495
|
+
printf("%s, ", separator.c_str());
|
|
1496
|
+
}
|
|
1497
|
+
printf("\n");
|
|
1027
1498
|
}
|
|
1028
|
-
|
|
1029
|
-
block_ranges_per_scan.emplace_back(blocks_to_prepare.size() - num_blocks,
|
|
1030
|
-
blocks_to_prepare.size());
|
|
1031
1499
|
}
|
|
1500
|
+
return Status::OK();
|
|
1501
|
+
}
|
|
1032
1502
|
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1503
|
+
Status BlockBasedTableIterator::FilterAndPinCachedBlocks(
|
|
1504
|
+
const std::vector<BlockHandle>& scan_block_handles,
|
|
1505
|
+
const MultiScanArgs* multiscan_opts,
|
|
1506
|
+
std::vector<size_t>* block_indices_to_read,
|
|
1507
|
+
std::vector<CachableEntry<Block>>* pinned_data_blocks_guard,
|
|
1508
|
+
size_t* prefetched_max_idx) {
|
|
1509
|
+
uint64_t total_prefetch_size = 0;
|
|
1510
|
+
*prefetched_max_idx = scan_block_handles.size();
|
|
1511
|
+
|
|
1512
|
+
for (size_t i = 0; i < scan_block_handles.size(); ++i) {
|
|
1513
|
+
const auto& data_block_handle = scan_block_handles[i];
|
|
1514
|
+
|
|
1515
|
+
total_prefetch_size +=
|
|
1516
|
+
BlockBasedTable::BlockSizeWithTrailer(data_block_handle);
|
|
1517
|
+
if (multiscan_opts->max_prefetch_size > 0 &&
|
|
1518
|
+
total_prefetch_size > multiscan_opts->max_prefetch_size) {
|
|
1519
|
+
for (size_t j = i; j < scan_block_handles.size(); ++j) {
|
|
1520
|
+
assert((*pinned_data_blocks_guard)[j].IsEmpty());
|
|
1521
|
+
}
|
|
1522
|
+
*prefetched_max_idx = i;
|
|
1523
|
+
break;
|
|
1524
|
+
}
|
|
1525
|
+
|
|
1526
|
+
Status s = table_->LookupAndPinBlocksInCache<Block_kData>(
|
|
1042
1527
|
read_options_, data_block_handle,
|
|
1043
|
-
&pinned_data_blocks_guard[i].As<Block_kData>());
|
|
1528
|
+
&(*pinned_data_blocks_guard)[i].As<Block_kData>());
|
|
1044
1529
|
|
|
1045
1530
|
if (!s.ok()) {
|
|
1046
1531
|
// Abort: block cache look up failed.
|
|
1047
|
-
return;
|
|
1532
|
+
return s;
|
|
1048
1533
|
}
|
|
1049
|
-
if (!pinned_data_blocks_guard[i].GetValue()) {
|
|
1050
|
-
// Block not in cache
|
|
1051
|
-
|
|
1534
|
+
if (!(*pinned_data_blocks_guard)[i].GetValue()) {
|
|
1535
|
+
// Block not in cache
|
|
1536
|
+
block_indices_to_read->emplace_back(i);
|
|
1052
1537
|
}
|
|
1053
1538
|
}
|
|
1539
|
+
return Status::OK();
|
|
1540
|
+
}
|
|
1054
1541
|
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
std::vector<std::vector<size_t
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
if
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1542
|
+
void BlockBasedTableIterator::PrepareIORequests(
|
|
1543
|
+
const std::vector<size_t>& block_indices_to_read,
|
|
1544
|
+
const std::vector<BlockHandle>& scan_block_handles,
|
|
1545
|
+
const MultiScanArgs* multiscan_opts, std::vector<FSReadRequest>* read_reqs,
|
|
1546
|
+
UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
|
|
1547
|
+
std::vector<std::vector<size_t>>* coalesced_block_indices) {
|
|
1548
|
+
assert(coalesced_block_indices->empty());
|
|
1549
|
+
coalesced_block_indices->resize(1);
|
|
1550
|
+
|
|
1551
|
+
for (const auto& block_idx : block_indices_to_read) {
|
|
1552
|
+
if (!coalesced_block_indices->back().empty()) {
|
|
1553
|
+
// Check if we can coalesce.
|
|
1554
|
+
const auto& last_block_handle =
|
|
1555
|
+
scan_block_handles[coalesced_block_indices->back().back()];
|
|
1556
|
+
uint64_t last_block_end =
|
|
1557
|
+
last_block_handle.offset() +
|
|
1558
|
+
BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
|
|
1559
|
+
uint64_t current_start = scan_block_handles[block_idx].offset();
|
|
1560
|
+
|
|
1561
|
+
if (current_start >
|
|
1562
|
+
last_block_end + multiscan_opts->io_coalesce_threshold) {
|
|
1563
|
+
// new IO
|
|
1564
|
+
coalesced_block_indices->emplace_back();
|
|
1565
|
+
}
|
|
1566
|
+
}
|
|
1567
|
+
coalesced_block_indices->back().emplace_back(block_idx);
|
|
1568
|
+
}
|
|
1074
1569
|
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1570
|
+
assert(read_reqs->empty());
|
|
1571
|
+
read_reqs->reserve(coalesced_block_indices->size());
|
|
1572
|
+
for (const auto& block_indices : *coalesced_block_indices) {
|
|
1573
|
+
assert(block_indices.size());
|
|
1574
|
+
const auto& first_block_handle = scan_block_handles[block_indices[0]];
|
|
1575
|
+
const auto& last_block_handle = scan_block_handles[block_indices.back()];
|
|
1576
|
+
|
|
1577
|
+
const auto start_offset = first_block_handle.offset();
|
|
1578
|
+
const auto end_offset =
|
|
1579
|
+
last_block_handle.offset() +
|
|
1580
|
+
BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
|
|
1581
|
+
#ifndef NDEBUG
|
|
1582
|
+
// Debug print for failing the assertion below.
|
|
1583
|
+
if (start_offset >= end_offset) {
|
|
1584
|
+
fprintf(stderr, "scan_block_handles: ");
|
|
1585
|
+
for (const auto& block : scan_block_handles) {
|
|
1586
|
+
fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
|
|
1587
|
+
block.offset(), block.size());
|
|
1588
|
+
}
|
|
1589
|
+
fprintf(stderr,
|
|
1590
|
+
"\nfirst block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
|
|
1591
|
+
first_block_handle.offset(), first_block_handle.size());
|
|
1592
|
+
fprintf(stderr, "last block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
|
|
1593
|
+
last_block_handle.offset(), last_block_handle.size());
|
|
1594
|
+
|
|
1595
|
+
fprintf(stderr, "coalesced_block_indices: ");
|
|
1596
|
+
for (const auto& b : *coalesced_block_indices) {
|
|
1597
|
+
fprintf(stderr, "[");
|
|
1598
|
+
for (const auto& block_idx : b) {
|
|
1599
|
+
fprintf(stderr, "%zu ", block_idx);
|
|
1078
1600
|
}
|
|
1601
|
+
fprintf(stderr, "] ");
|
|
1079
1602
|
}
|
|
1080
|
-
|
|
1603
|
+
fprintf(stderr, "\ncurrent blocks: ");
|
|
1604
|
+
for (const auto& block_idx : block_indices) {
|
|
1605
|
+
fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
|
|
1606
|
+
scan_block_handles[block_idx].offset(),
|
|
1607
|
+
scan_block_handles[block_idx].size());
|
|
1608
|
+
}
|
|
1609
|
+
fprintf(stderr, "\n");
|
|
1081
1610
|
}
|
|
1611
|
+
#endif // NDEBUG
|
|
1612
|
+
assert(end_offset > start_offset);
|
|
1082
1613
|
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1614
|
+
read_reqs->emplace_back();
|
|
1615
|
+
read_reqs->back().offset = start_offset;
|
|
1616
|
+
read_reqs->back().len = end_offset - start_offset;
|
|
1617
|
+
|
|
1618
|
+
if (multiscan_opts->use_async_io) {
|
|
1619
|
+
for (const auto& block_idx : block_indices) {
|
|
1620
|
+
(*block_idx_to_readreq_idx)[block_idx] = read_reqs->size() - 1;
|
|
1621
|
+
}
|
|
1089
1622
|
}
|
|
1623
|
+
}
|
|
1624
|
+
}
|
|
1090
1625
|
|
|
1091
|
-
|
|
1092
|
-
std::vector<
|
|
1093
|
-
|
|
1094
|
-
size_t
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1626
|
+
Status BlockBasedTableIterator::ExecuteIO(
|
|
1627
|
+
const std::vector<BlockHandle>& scan_block_handles,
|
|
1628
|
+
const MultiScanArgs* multiscan_opts,
|
|
1629
|
+
const std::vector<std::vector<size_t>>& coalesced_block_indices,
|
|
1630
|
+
std::vector<FSReadRequest>* read_reqs,
|
|
1631
|
+
std::vector<AsyncReadState>* async_states,
|
|
1632
|
+
std::vector<CachableEntry<Block>>* pinned_data_blocks_guard) {
|
|
1633
|
+
IOOptions io_opts;
|
|
1634
|
+
Status s;
|
|
1635
|
+
s = table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
|
|
1636
|
+
if (!s.ok()) {
|
|
1637
|
+
// Abort: PrepareIOOptions failed
|
|
1638
|
+
return s;
|
|
1639
|
+
}
|
|
1640
|
+
const bool direct_io = table_->get_rep()->file->use_direct_io();
|
|
1641
|
+
|
|
1642
|
+
if (multiscan_opts->use_async_io) {
|
|
1643
|
+
async_states->resize(read_reqs->size());
|
|
1644
|
+
for (size_t i = 0; i < read_reqs->size(); ++i) {
|
|
1645
|
+
auto& read_req = (*read_reqs)[i];
|
|
1646
|
+
auto& async_read = (*async_states)[i];
|
|
1647
|
+
|
|
1648
|
+
async_read.finished = false;
|
|
1649
|
+
async_read.offset = read_req.offset;
|
|
1650
|
+
async_read.block_indices = coalesced_block_indices[i];
|
|
1651
|
+
for (const auto idx : coalesced_block_indices[i]) {
|
|
1652
|
+
async_read.blocks.emplace_back(scan_block_handles[idx]);
|
|
1653
|
+
}
|
|
1654
|
+
|
|
1655
|
+
if (direct_io) {
|
|
1656
|
+
read_req.scratch = nullptr;
|
|
1657
|
+
} else {
|
|
1658
|
+
async_read.buf.reset(new char[read_req.len]);
|
|
1659
|
+
read_req.scratch = async_read.buf.get();
|
|
1660
|
+
}
|
|
1661
|
+
|
|
1662
|
+
auto cb = std::bind(&BlockBasedTableIterator::PrepareReadAsyncCallBack,
|
|
1663
|
+
this, std::placeholders::_1, std::placeholders::_2);
|
|
1664
|
+
// TODO: for mmap, io_handle will not be set but callback will already
|
|
1665
|
+
// be called.
|
|
1666
|
+
s = table_->get_rep()->file.get()->ReadAsync(
|
|
1667
|
+
read_req, io_opts, cb, &async_read, &async_read.io_handle,
|
|
1668
|
+
&async_read.del_fn, direct_io ? &async_read.aligned_buf : nullptr);
|
|
1669
|
+
if (!s.ok()) {
|
|
1670
|
+
#ifndef NDEBUG
|
|
1671
|
+
fprintf(stderr, "ReadAsync failed with %s\n", s.ToString().c_str());
|
|
1672
|
+
#endif
|
|
1673
|
+
assert(false);
|
|
1674
|
+
return s;
|
|
1675
|
+
}
|
|
1676
|
+
assert(async_read.io_handle);
|
|
1677
|
+
for (auto& req : *read_reqs) {
|
|
1678
|
+
if (!req.status.ok()) {
|
|
1679
|
+
assert(false);
|
|
1680
|
+
// Silence compiler warning about NRVO
|
|
1681
|
+
s = req.status;
|
|
1682
|
+
return s;
|
|
1683
|
+
}
|
|
1684
|
+
}
|
|
1685
|
+
}
|
|
1686
|
+
} else {
|
|
1687
|
+
// Synchronous IO using MultiRead
|
|
1112
1688
|
std::unique_ptr<char[]> buf;
|
|
1113
|
-
|
|
1689
|
+
|
|
1114
1690
|
if (direct_io) {
|
|
1115
|
-
for (auto& read_req : read_reqs) {
|
|
1691
|
+
for (auto& read_req : *read_reqs) {
|
|
1116
1692
|
read_req.scratch = nullptr;
|
|
1117
1693
|
}
|
|
1118
1694
|
} else {
|
|
1119
1695
|
// TODO: optimize if FSSupportedOps::kFSBuffer is supported.
|
|
1696
|
+
size_t total_len = 0;
|
|
1697
|
+
for (const auto& req : *read_reqs) {
|
|
1698
|
+
total_len += req.len;
|
|
1699
|
+
}
|
|
1120
1700
|
buf.reset(new char[total_len]);
|
|
1121
1701
|
size_t offset = 0;
|
|
1122
|
-
for (auto& read_req : read_reqs) {
|
|
1702
|
+
for (auto& read_req : *read_reqs) {
|
|
1123
1703
|
read_req.scratch = buf.get() + offset;
|
|
1124
1704
|
offset += read_req.len;
|
|
1125
1705
|
}
|
|
1126
1706
|
}
|
|
1127
1707
|
|
|
1128
1708
|
AlignedBuf aligned_buf;
|
|
1129
|
-
s = table_->get_rep()->file
|
|
1130
|
-
|
|
1131
|
-
|
|
1709
|
+
s = table_->get_rep()->file->MultiRead(io_opts, read_reqs->data(),
|
|
1710
|
+
read_reqs->size(),
|
|
1711
|
+
direct_io ? &aligned_buf : nullptr);
|
|
1132
1712
|
if (!s.ok()) {
|
|
1133
|
-
return;
|
|
1713
|
+
return s;
|
|
1134
1714
|
}
|
|
1135
|
-
for (auto& req : read_reqs) {
|
|
1715
|
+
for (auto& req : *read_reqs) {
|
|
1136
1716
|
if (!req.status.ok()) {
|
|
1137
|
-
|
|
1717
|
+
// Silence compiler warning about NRVO
|
|
1718
|
+
s = req.status;
|
|
1719
|
+
return s;
|
|
1138
1720
|
}
|
|
1139
1721
|
}
|
|
1140
1722
|
|
|
1141
1723
|
// Init blocks and pin them in block cache.
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
const auto&
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
const auto block_offset_in_buffer = block.offset() - read_req.offset;
|
|
1152
|
-
|
|
1153
|
-
CacheAllocationPtr data =
|
|
1154
|
-
AllocateBlock(block_size_with_trailer, memory_allocator);
|
|
1155
|
-
memcpy(data.get(), read_req.result.data() + block_offset_in_buffer,
|
|
1156
|
-
block_size_with_trailer);
|
|
1157
|
-
BlockContents tmp_contents(std::move(data), block.size());
|
|
1158
|
-
|
|
1159
|
-
#ifndef NDEBUG
|
|
1160
|
-
tmp_contents.has_trailer =
|
|
1161
|
-
table_->get_rep()->footer.GetBlockTrailerSize() > 0;
|
|
1162
|
-
#endif
|
|
1163
|
-
assert(pinned_data_blocks_guard[block_idx].IsEmpty());
|
|
1164
|
-
s = table_->CreateAndPinBlockInCache<Block_kData>(
|
|
1165
|
-
read_options_, block, &tmp_contents,
|
|
1166
|
-
&(pinned_data_blocks_guard[block_idx].As<Block_kData>()));
|
|
1724
|
+
assert(read_reqs->size() == coalesced_block_indices.size());
|
|
1725
|
+
for (size_t i = 0; i < coalesced_block_indices.size(); i++) {
|
|
1726
|
+
const auto& read_req = (*read_reqs)[i];
|
|
1727
|
+
for (const auto& block_idx : coalesced_block_indices[i]) {
|
|
1728
|
+
const auto& block = scan_block_handles[block_idx];
|
|
1729
|
+
|
|
1730
|
+
assert((*pinned_data_blocks_guard)[block_idx].IsEmpty());
|
|
1731
|
+
s = CreateAndPinBlockFromBuffer(block, read_req.offset, read_req.result,
|
|
1732
|
+
(*pinned_data_blocks_guard)[block_idx]);
|
|
1167
1733
|
if (!s.ok()) {
|
|
1734
|
+
assert(false);
|
|
1168
1735
|
// Abort: failed to create and pin block in cache
|
|
1169
|
-
return;
|
|
1736
|
+
return s;
|
|
1170
1737
|
}
|
|
1738
|
+
assert((*pinned_data_blocks_guard)[block_idx].GetValue());
|
|
1171
1739
|
}
|
|
1172
1740
|
}
|
|
1173
1741
|
}
|
|
1174
|
-
|
|
1175
|
-
// Successful Prepare, init related states so the iterator reads from prepared
|
|
1176
|
-
// blocks
|
|
1177
|
-
multi_scan_.reset(new MultiScanState(scan_opts,
|
|
1178
|
-
std::move(pinned_data_blocks_guard),
|
|
1179
|
-
std::move(block_ranges_per_scan)));
|
|
1180
|
-
is_index_at_curr_block_ = false;
|
|
1181
|
-
block_iter_points_to_real_block_ = false;
|
|
1742
|
+
return s;
|
|
1182
1743
|
}
|
|
1183
1744
|
|
|
1184
|
-
bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
|
|
1185
|
-
assert(multi_scan_);
|
|
1186
|
-
// This is a MultiScan and Preapre() has been called.
|
|
1187
|
-
//
|
|
1188
|
-
// Validate seek key with scan options
|
|
1189
|
-
if (multi_scan_->next_scan_idx >= multi_scan_->scan_opts->size()) {
|
|
1190
|
-
multi_scan_.reset();
|
|
1191
|
-
} else if (!target) {
|
|
1192
|
-
// start key must be set for multi-scan
|
|
1193
|
-
multi_scan_.reset();
|
|
1194
|
-
} else if (user_comparator_.CompareWithoutTimestamp(
|
|
1195
|
-
ExtractUserKey(*target), /*a_has_ts=*/true,
|
|
1196
|
-
(*multi_scan_->scan_opts)[multi_scan_->next_scan_idx]
|
|
1197
|
-
.range.start.value(),
|
|
1198
|
-
/*b_has_ts=*/false) != 0) {
|
|
1199
|
-
// Unexpected seek key
|
|
1200
|
-
multi_scan_.reset();
|
|
1201
|
-
} else {
|
|
1202
|
-
auto [cur_scan_start_idx, cur_scan_end_idx] =
|
|
1203
|
-
multi_scan_->block_ranges_per_scan[multi_scan_->next_scan_idx];
|
|
1204
|
-
// We should have the data block already loaded
|
|
1205
|
-
++multi_scan_->next_scan_idx;
|
|
1206
|
-
if (cur_scan_start_idx >= cur_scan_end_idx) {
|
|
1207
|
-
is_out_of_bound_ = true;
|
|
1208
|
-
assert(!Valid());
|
|
1209
|
-
return true;
|
|
1210
|
-
} else {
|
|
1211
|
-
is_out_of_bound_ = false;
|
|
1212
|
-
}
|
|
1213
|
-
|
|
1214
|
-
if (!block_iter_points_to_real_block_ ||
|
|
1215
|
-
multi_scan_->cur_data_block_idx != cur_scan_start_idx) {
|
|
1216
|
-
if (block_iter_points_to_real_block_) {
|
|
1217
|
-
// Should be scan in increasing key range.
|
|
1218
|
-
// All blocks before cur_data_block_idx_ are not pinned anymore.
|
|
1219
|
-
assert(multi_scan_->cur_data_block_idx < cur_scan_start_idx);
|
|
1220
|
-
}
|
|
1221
|
-
|
|
1222
|
-
ResetDataIter();
|
|
1223
|
-
// Note that the block_iter_ takes ownership of the pinned data block
|
|
1224
|
-
// TODO: we can delegate the clean up like with pinned_iters_mgr_ if
|
|
1225
|
-
// need to pin blocks longer.
|
|
1226
|
-
table_->NewDataBlockIterator<DataBlockIter>(
|
|
1227
|
-
read_options_, multi_scan_->pinned_data_blocks[cur_scan_start_idx],
|
|
1228
|
-
&block_iter_, Status::OK());
|
|
1229
|
-
}
|
|
1230
|
-
multi_scan_->cur_data_block_idx = cur_scan_start_idx;
|
|
1231
|
-
block_iter_points_to_real_block_ = true;
|
|
1232
|
-
block_iter_.Seek(*target);
|
|
1233
|
-
FindKeyForward();
|
|
1234
|
-
return true;
|
|
1235
|
-
}
|
|
1236
|
-
|
|
1237
|
-
return false;
|
|
1238
|
-
}
|
|
1239
|
-
|
|
1240
|
-
void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
|
|
1241
|
-
assert(multi_scan_);
|
|
1242
|
-
assert(multi_scan_->next_scan_idx >= 1);
|
|
1243
|
-
const auto cur_scan_end_idx = std::get<1>(
|
|
1244
|
-
multi_scan_->block_ranges_per_scan[multi_scan_->next_scan_idx - 1]);
|
|
1245
|
-
do {
|
|
1246
|
-
if (!block_iter_.status().ok()) {
|
|
1247
|
-
return;
|
|
1248
|
-
}
|
|
1249
|
-
|
|
1250
|
-
if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
|
|
1251
|
-
// We don't ResetDataIter() here since next scan might be reading from
|
|
1252
|
-
// the same block. ResetDataIter() will free the underlying block cache
|
|
1253
|
-
// handle and we don't want the block to be unpinned.
|
|
1254
|
-
is_out_of_bound_ = true;
|
|
1255
|
-
assert(!Valid());
|
|
1256
|
-
return;
|
|
1257
|
-
}
|
|
1258
|
-
// Move to the next pinned data block
|
|
1259
|
-
ResetDataIter();
|
|
1260
|
-
++multi_scan_->cur_data_block_idx;
|
|
1261
|
-
table_->NewDataBlockIterator<DataBlockIter>(
|
|
1262
|
-
read_options_,
|
|
1263
|
-
multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx],
|
|
1264
|
-
&block_iter_, Status::OK());
|
|
1265
|
-
block_iter_points_to_real_block_ = true;
|
|
1266
|
-
block_iter_.SeekToFirst();
|
|
1267
|
-
} while (!block_iter_.Valid());
|
|
1268
|
-
}
|
|
1269
1745
|
} // namespace ROCKSDB_NAMESPACE
|