@nxtedition/rocksdb 15.4.1 → 16.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +70 -23
- package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
- package/deps/rocksdb/rocksdb/BUCK +42 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
- package/deps/rocksdb/rocksdb/Makefile +59 -32
- package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
- package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
- package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
- package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
- package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
- package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
- package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
- package/deps/rocksdb/rocksdb/db/builder.h +7 -0
- package/deps/rocksdb/rocksdb/db/c.cc +373 -57
- package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
- package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
- package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
- package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
- package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
- package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
- package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
- package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
- package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
- package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
- package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
- package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
- package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
- package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
- package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
- package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
- package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
- package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
- package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
- package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
- package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
- package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
- package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
- package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
- package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
- package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
- package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
- package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
- package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
- package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
- package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
- package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
- package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
- package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
- package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
- package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
- package/deps/rocksdb/rocksdb/env/env.cc +1 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
- package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
- package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
- package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
- package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
- package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
- package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
- package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
- package/deps/rocksdb/rocksdb/folly.mk +22 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
- package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
- package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
- package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
- package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
- package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
- package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
- package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
- package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
- package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
- package/deps/rocksdb/rocksdb/options/options.cc +5 -1
- package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
- package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
- package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
- package/deps/rocksdb/rocksdb/port/lang.h +4 -0
- package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
- package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
- package/deps/rocksdb/rocksdb/src.mk +12 -0
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
- package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
- package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
- package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
- package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
- package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
- package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
- package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
- package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
- package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
- package/deps/rocksdb/rocksdb/table/format.cc +27 -15
- package/deps/rocksdb/rocksdb/table/format.h +41 -15
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
- package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
- package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
- package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
- package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
- package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
- package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
- package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
- package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
- package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
- package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
- package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
- package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
- package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
- package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
- package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
- package/deps/rocksdb/rocksdb/util/coding.h +14 -27
- package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
- package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
- package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
- package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
- package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
- package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
- package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
- package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
- package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
- package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
- package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
- package/deps/rocksdb/rocksdb/util/math.h +3 -1
- package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
- package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
- package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
- package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
- package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
- package/deps/rocksdb/rocksdb/util/status.cc +3 -1
- package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
- package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
- package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
- package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
- package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
- package/deps/rocksdb/rocksdb.gyp +7 -0
- package/index.js +70 -10
- package/iterator.js +25 -3
- package/max_rev_operator.h +9 -5
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
|
@@ -0,0 +1,1099 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
3
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
4
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
5
|
+
|
|
6
|
+
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
7
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
8
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
9
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
10
|
+
//
|
|
11
|
+
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
12
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
13
|
+
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
14
|
+
|
|
15
|
+
#include "util/io_dispatcher_imp.h"
|
|
16
|
+
|
|
17
|
+
#include <deque>
|
|
18
|
+
#include <memory>
|
|
19
|
+
#include <unordered_map>
|
|
20
|
+
#include <unordered_set>
|
|
21
|
+
#include <vector>
|
|
22
|
+
|
|
23
|
+
#include "file/random_access_file_reader.h"
|
|
24
|
+
#include "monitoring/statistics_impl.h"
|
|
25
|
+
#include "port/port.h"
|
|
26
|
+
#include "rocksdb/file_system.h"
|
|
27
|
+
#include "rocksdb/io_dispatcher.h"
|
|
28
|
+
#include "rocksdb/options.h"
|
|
29
|
+
#include "rocksdb/status.h"
|
|
30
|
+
#include "table/block_based/block_based_table_reader.h"
|
|
31
|
+
#include "table/block_based/cachable_entry.h"
|
|
32
|
+
#include "table/block_based/reader_common.h"
|
|
33
|
+
#include "table/format.h"
|
|
34
|
+
#include "test_util/sync_point.h"
|
|
35
|
+
#include "util/mutexlock.h"
|
|
36
|
+
|
|
37
|
+
namespace ROCKSDB_NAMESPACE {
|
|
38
|
+
|
|
39
|
+
// Abstract callback interface that a ReadSet uses to hand memory back when
// it releases blocks. It is declared at this point in the file so that the
// ReadSet member functions defined below can see it.
struct IODispatcherImplData {
  // Virtual so that implementations can be destroyed through this interface.
  virtual ~IODispatcherImplData() = default;

  // Notifies the implementation that `bytes` bytes are being released.
  virtual void ReleaseMemory(size_t bytes) = 0;
};
|
|
46
|
+
|
|
47
|
+
// Helper function to create and pin a block from a buffer
|
|
48
|
+
// Used by both ReadSet::PollAndProcessAsyncIO and IODispatcherImpl::Impl
|
|
49
|
+
static Status CreateAndPinBlockFromBuffer(
|
|
50
|
+
const std::shared_ptr<IOJob>& job, const BlockHandle& block,
|
|
51
|
+
uint64_t buffer_start_offset, const Slice& buffer_data,
|
|
52
|
+
CachableEntry<Block>& pinned_block_entry) {
|
|
53
|
+
auto* rep = job->table->get_rep();
|
|
54
|
+
|
|
55
|
+
// Get decompressor
|
|
56
|
+
UnownedPtr<Decompressor> decompressor = rep->decompressor.get();
|
|
57
|
+
CachableEntry<DecompressorDict> cached_dict;
|
|
58
|
+
|
|
59
|
+
if (rep->uncompression_dict_reader) {
|
|
60
|
+
Status s = rep->uncompression_dict_reader->GetOrReadUncompressionDictionary(
|
|
61
|
+
nullptr, job->job_options.read_options, nullptr, nullptr, &cached_dict);
|
|
62
|
+
if (!s.ok()) {
|
|
63
|
+
return s;
|
|
64
|
+
}
|
|
65
|
+
if (cached_dict.GetValue()) {
|
|
66
|
+
decompressor = cached_dict.GetValue()->decompressor_.get();
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
// Create block from buffer data
|
|
71
|
+
const auto block_size_with_trailer =
|
|
72
|
+
BlockBasedTable::BlockSizeWithTrailer(block);
|
|
73
|
+
const auto block_offset_in_buffer = block.offset() - buffer_start_offset;
|
|
74
|
+
|
|
75
|
+
CacheAllocationPtr data = AllocateBlock(
|
|
76
|
+
block_size_with_trailer, GetMemoryAllocator(rep->table_options));
|
|
77
|
+
memcpy(data.get(), buffer_data.data() + block_offset_in_buffer,
|
|
78
|
+
block_size_with_trailer);
|
|
79
|
+
BlockContents tmp_contents(std::move(data), block.size());
|
|
80
|
+
|
|
81
|
+
#ifndef NDEBUG
|
|
82
|
+
tmp_contents.has_trailer = rep->footer.GetBlockTrailerSize() > 0;
|
|
83
|
+
#endif
|
|
84
|
+
|
|
85
|
+
return job->table->CreateAndPinBlockInCache<Block_kData>(
|
|
86
|
+
job->job_options.read_options, block, decompressor, &tmp_contents,
|
|
87
|
+
&pinned_block_entry.As<Block_kData>());
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// State for async IO operations (implementation detail)
|
|
91
|
+
struct AsyncIOState {
|
|
92
|
+
AsyncIOState() : offset(static_cast<uint64_t>(-1)) {}
|
|
93
|
+
~AsyncIOState() { read_req.status.PermitUncheckedError(); }
|
|
94
|
+
|
|
95
|
+
AsyncIOState(const AsyncIOState&) = delete;
|
|
96
|
+
AsyncIOState& operator=(const AsyncIOState&) = delete;
|
|
97
|
+
AsyncIOState(AsyncIOState&&) = default;
|
|
98
|
+
AsyncIOState& operator=(AsyncIOState&&) = default;
|
|
99
|
+
|
|
100
|
+
std::unique_ptr<char[]> buf;
|
|
101
|
+
AlignedBuf aligned_buf;
|
|
102
|
+
void* io_handle = nullptr;
|
|
103
|
+
IOHandleDeleter del_fn;
|
|
104
|
+
uint64_t offset;
|
|
105
|
+
std::vector<size_t> block_indices;
|
|
106
|
+
std::vector<BlockHandle> blocks;
|
|
107
|
+
FSReadRequest read_req;
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
// ReadSet destructor - clean up IO handles
|
|
111
|
+
// Must call AbortIO before deleting handles to avoid use-after-free when
|
|
112
|
+
// io_uring completions arrive for deleted handles.
|
|
113
|
+
ReadSet::~ReadSet() {
|
|
114
|
+
// Release memory for any blocks still pinned
|
|
115
|
+
// Note: block_sizes_[i] is only set for async IO reads where memory
|
|
116
|
+
// limiting applies. For sync reads, block_sizes_ remains 0, so this
|
|
117
|
+
// loop is effectively a no-op for sync reads.
|
|
118
|
+
if (auto dispatcher_data = dispatcher_data_.lock()) {
|
|
119
|
+
for (size_t i = 0; i < block_sizes_.size(); ++i) {
|
|
120
|
+
if (block_sizes_[i] > 0 && pinned_blocks_[i].GetValue()) {
|
|
121
|
+
dispatcher_data->ReleaseMemory(block_sizes_[i]);
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
if (async_io_map_.empty()) {
|
|
127
|
+
return;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// Collect unique pending IO handles (multiple block indices may share the
|
|
131
|
+
// same async_state due to coalescing)
|
|
132
|
+
std::vector<void*> pending_handles;
|
|
133
|
+
std::unordered_set<void*> seen_handles;
|
|
134
|
+
for (auto& pair : async_io_map_) {
|
|
135
|
+
auto& async_state = pair.second;
|
|
136
|
+
if (async_state->io_handle != nullptr &&
|
|
137
|
+
seen_handles.find(async_state->io_handle) == seen_handles.end()) {
|
|
138
|
+
pending_handles.push_back(async_state->io_handle);
|
|
139
|
+
seen_handles.insert(async_state->io_handle);
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
// Abort all pending IO operations before deleting handles
|
|
144
|
+
if (!pending_handles.empty() && fs_) {
|
|
145
|
+
// AbortIO cancels pending requests and waits for completions
|
|
146
|
+
IOStatus s = fs_->AbortIO(pending_handles);
|
|
147
|
+
(void)s; // Ignore errors in destructor
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
// Now safe to delete the handles
|
|
151
|
+
for (auto& pair : async_io_map_) {
|
|
152
|
+
auto& async_state = pair.second;
|
|
153
|
+
if (async_state->io_handle != nullptr && async_state->del_fn != nullptr) {
|
|
154
|
+
async_state->del_fn(async_state->io_handle);
|
|
155
|
+
async_state->io_handle = nullptr;
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
// Main Read() method - transparently handles cache, async IO, and sync reads
|
|
161
|
+
Status ReadSet::ReadIndex(size_t block_index, CachableEntry<Block>* out) {
|
|
162
|
+
// Bounds check
|
|
163
|
+
if (block_index >= pinned_blocks_.size()) {
|
|
164
|
+
return Status::InvalidArgument("Block index out of range");
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
// Case 1: Block is already available (from cache or sync read during
|
|
168
|
+
// SubmitJob)
|
|
169
|
+
if (pinned_blocks_[block_index].GetValue()) {
|
|
170
|
+
*out = std::move(pinned_blocks_[block_index]);
|
|
171
|
+
// Release memory accounting for prefetched blocks. After moving the value
|
|
172
|
+
// out, ReleaseBlock() and the destructor check pinned_blocks_.GetValue()
|
|
173
|
+
// which will be null, so they won't release memory again.
|
|
174
|
+
if (block_index < block_sizes_.size() && block_sizes_[block_index] > 0) {
|
|
175
|
+
if (auto dispatcher_data = dispatcher_data_.lock()) {
|
|
176
|
+
dispatcher_data->ReleaseMemory(block_sizes_[block_index]);
|
|
177
|
+
}
|
|
178
|
+
block_sizes_[block_index] = 0;
|
|
179
|
+
}
|
|
180
|
+
// Note: Statistics for this block were already counted during SubmitJob
|
|
181
|
+
// (either as cache hit or sync read)
|
|
182
|
+
return Status::OK();
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
// Case 2: Block has async IO in progress - poll and process
|
|
186
|
+
if (job_->job_options.read_options.async_io) {
|
|
187
|
+
auto it = async_io_map_.find(block_index);
|
|
188
|
+
if (it != async_io_map_.end()) {
|
|
189
|
+
// Get the number of blocks in this coalesced async request BEFORE polling
|
|
190
|
+
// (since PollAndProcessAsyncIO will remove entries from the map)
|
|
191
|
+
size_t num_blocks_in_request = it->second->block_indices.size();
|
|
192
|
+
|
|
193
|
+
if (Status s = PollAndProcessAsyncIO(it->second); !s.ok()) {
|
|
194
|
+
return s;
|
|
195
|
+
}
|
|
196
|
+
// Count all blocks that were read in this async request
|
|
197
|
+
num_async_reads_ += num_blocks_in_request;
|
|
198
|
+
|
|
199
|
+
// After polling, the block should be in pinned_blocks_
|
|
200
|
+
if (pinned_blocks_[block_index].GetValue()) {
|
|
201
|
+
*out = std::move(pinned_blocks_[block_index]);
|
|
202
|
+
// Release memory accounting (same as case 1 above)
|
|
203
|
+
if (block_index < block_sizes_.size() &&
|
|
204
|
+
block_sizes_[block_index] > 0) {
|
|
205
|
+
if (auto dispatcher_data = dispatcher_data_.lock()) {
|
|
206
|
+
dispatcher_data->ReleaseMemory(block_sizes_[block_index]);
|
|
207
|
+
}
|
|
208
|
+
block_sizes_[block_index] = 0;
|
|
209
|
+
}
|
|
210
|
+
return Status::OK();
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
return Status::IOError("Failed to process async IO result");
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// Case 3: Block needs synchronous read (pending or never-dispatched blocks).
|
|
218
|
+
// No ReleaseMemory() needed here because blocks reaching this path never had
|
|
219
|
+
// TryAcquireMemory() called — they were either pending prefetch or skipped
|
|
220
|
+
// during SubmitJob. block_sizes_[block_index] may be > 0 (set during
|
|
221
|
+
// SubmitJob for all uncached blocks) but that does not imply memory was
|
|
222
|
+
// acquired.
|
|
223
|
+
RemoveFromPending(block_index);
|
|
224
|
+
|
|
225
|
+
Status s = SyncRead(block_index);
|
|
226
|
+
if (s.ok()) {
|
|
227
|
+
*out = std::move(pinned_blocks_[block_index]);
|
|
228
|
+
num_sync_reads_++;
|
|
229
|
+
}
|
|
230
|
+
return s;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
// Reads the block whose [offset, offset + size) range contains `offset`.
// Returns InvalidArgument if the ReadSet has no blocks or if no block of the
// job covers `offset`; otherwise forwards to ReadIndex().
Status ReadSet::ReadOffset(size_t offset, CachableEntry<Block>* out) {
  if (sorted_block_indices_.empty()) {
    return Status::InvalidArgument("ReadSet not initialized");
  }

  // sorted_block_indices_ holds the original block indices ordered by block
  // offset, which allows a binary search over the job's handles.
  const auto& handles = job_->block_handles;
  const auto starts_after_offset = [&handles](size_t off, size_t idx) {
    return off < handles[idx].offset();
  };

  // First block that begins strictly after `offset`.
  const auto first_after =
      std::upper_bound(sorted_block_indices_.begin(),
                       sorted_block_indices_.end(), offset,
                       starts_after_offset);
  if (first_after == sorted_block_indices_.begin()) {
    // `offset` lies before every block.
    return Status::InvalidArgument("Offset not found in any block");
  }

  // The only candidate is the block with the largest start <= offset.
  const size_t candidate_idx = *(first_after - 1);
  const auto& candidate = handles[candidate_idx];
  const bool inside_candidate =
      candidate.offset() <= offset &&
      offset < (candidate.offset() + candidate.size());
  if (!inside_candidate) {
    return Status::InvalidArgument("Offset not found in any block");
  }
  return ReadIndex(candidate_idx, out);
}
|
|
266
|
+
|
|
267
|
+
void ReadSet::ReleaseBlock(size_t block_index) {
|
|
268
|
+
if (block_index >= pinned_blocks_.size()) {
|
|
269
|
+
return;
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
// Remove from pending if applicable
|
|
273
|
+
RemoveFromPending(block_index);
|
|
274
|
+
|
|
275
|
+
// Release memory BEFORE unpinning
|
|
276
|
+
// Note: block_sizes_[idx] is only set for async IO reads where memory
|
|
277
|
+
// limiting applies. For sync reads, block_sizes_ remains 0, so this
|
|
278
|
+
// check implicitly skips ReleaseMemory for sync reads.
|
|
279
|
+
if (pinned_blocks_[block_index].GetValue() &&
|
|
280
|
+
block_index < block_sizes_.size() && block_sizes_[block_index] > 0) {
|
|
281
|
+
if (auto dispatcher_data = dispatcher_data_.lock()) {
|
|
282
|
+
dispatcher_data->ReleaseMemory(block_sizes_[block_index]);
|
|
283
|
+
}
|
|
284
|
+
block_sizes_[block_index] = 0; // Prevent double-release
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
// Unpin the block from cache
|
|
288
|
+
pinned_blocks_[block_index].Reset();
|
|
289
|
+
// Clean up any pending async IO for this block
|
|
290
|
+
async_io_map_.erase(block_index);
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
bool ReadSet::IsBlockAvailable(size_t block_index) const {
|
|
294
|
+
if (block_index >= pinned_blocks_.size()) {
|
|
295
|
+
return false;
|
|
296
|
+
}
|
|
297
|
+
// Block is available if it hasn't been released (still has a value or
|
|
298
|
+
// has pending async IO)
|
|
299
|
+
return pinned_blocks_[block_index].GetValue() != nullptr ||
|
|
300
|
+
async_io_map_.find(block_index) != async_io_map_.end();
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
// Poll and process async IO for a specific block
|
|
304
|
+
Status ReadSet::PollAndProcessAsyncIO(
|
|
305
|
+
const std::shared_ptr<AsyncIOState>& async_state) {
|
|
306
|
+
auto* rep = job_->table->get_rep();
|
|
307
|
+
|
|
308
|
+
// Poll for IO completion using FileSystem Poll API
|
|
309
|
+
std::vector<void*> io_handles = {async_state->io_handle};
|
|
310
|
+
IOStatus io_s = rep->ioptions.env->GetFileSystem()->Poll(io_handles, 1);
|
|
311
|
+
if (!io_s.ok()) {
|
|
312
|
+
return io_s;
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
// Check for read errors
|
|
316
|
+
if (!async_state->read_req.status.ok()) {
|
|
317
|
+
return async_state->read_req.status;
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
// Use the result slice from the callback which has been correctly set
|
|
321
|
+
// with any necessary alignment adjustments for direct IO
|
|
322
|
+
const Slice& buffer_data = async_state->read_req.result;
|
|
323
|
+
|
|
324
|
+
// Process all blocks in this async request
|
|
325
|
+
for (size_t i = 0; i < async_state->block_indices.size(); ++i) {
|
|
326
|
+
const size_t idx = async_state->block_indices[i];
|
|
327
|
+
const auto& block_handle = async_state->blocks[i];
|
|
328
|
+
|
|
329
|
+
Status s =
|
|
330
|
+
CreateAndPinBlockFromBuffer(job_, block_handle, async_state->offset,
|
|
331
|
+
buffer_data, pinned_blocks_[idx]);
|
|
332
|
+
if (!s.ok()) {
|
|
333
|
+
return s;
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
// Clean up IO handle
|
|
338
|
+
if (async_state->io_handle != nullptr && async_state->del_fn != nullptr) {
|
|
339
|
+
async_state->del_fn(async_state->io_handle);
|
|
340
|
+
async_state->io_handle = nullptr;
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
// Remove from map - all blocks in this request have been processed
|
|
344
|
+
// Store indices in a temporary vector to avoid iterator invalidation
|
|
345
|
+
std::vector<size_t> indices_to_remove = async_state->block_indices;
|
|
346
|
+
for (const auto idx : indices_to_remove) {
|
|
347
|
+
async_io_map_.erase(idx);
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
return Status::OK();
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
// Perform synchronous read for a specific block
|
|
354
|
+
// This performs a direct synchronous read from disk when the block is not in
|
|
355
|
+
// cache
|
|
356
|
+
Status ReadSet::SyncRead(size_t block_index) {
|
|
357
|
+
const auto& block_handle = job_->block_handles[block_index];
|
|
358
|
+
auto* rep = job_->table->get_rep();
|
|
359
|
+
|
|
360
|
+
// Get dictionary-aware decompressor if available
|
|
361
|
+
UnownedPtr<Decompressor> decompressor = rep->decompressor.get();
|
|
362
|
+
CachableEntry<DecompressorDict> cached_dict;
|
|
363
|
+
if (rep->uncompression_dict_reader) {
|
|
364
|
+
Status s = rep->uncompression_dict_reader->GetOrReadUncompressionDictionary(
|
|
365
|
+
nullptr, job_->job_options.read_options, nullptr, nullptr,
|
|
366
|
+
&cached_dict);
|
|
367
|
+
if (!s.ok()) {
|
|
368
|
+
return s;
|
|
369
|
+
}
|
|
370
|
+
if (cached_dict.GetValue()) {
|
|
371
|
+
decompressor = cached_dict.GetValue()->decompressor_.get();
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
return job_->table->RetrieveBlock<Block_kData>(
|
|
376
|
+
/*prefetch_buffer=*/nullptr, job_->job_options.read_options, block_handle,
|
|
377
|
+
decompressor, &pinned_blocks_[block_index].As<Block_kData>(),
|
|
378
|
+
/*get_context=*/nullptr, /*lookup_context=*/nullptr,
|
|
379
|
+
/*for_compaction=*/false, /*use_cache=*/true,
|
|
380
|
+
/*async_read=*/false, /*use_block_cache_for_lookup=*/true);
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
// One pre-coalesced batch of blocks that is prefetched as a unit.
struct CoalescedPrefetchGroup {
  // Indices of the blocks in this batch, sorted by offset.
  std::vector<size_t> block_indices;
  // Total number of bytes this batch's IO covers.
  size_t total_bytes{0};
};
|
|
388
|
+
|
|
389
|
+
// State for a pending memory request waiting to be granted
|
|
390
|
+
// Groups are pre-coalesced at queue time for efficient dispatch
|
|
391
|
+
struct PendingPrefetchRequest {
|
|
392
|
+
std::weak_ptr<ReadSet> read_set;
|
|
393
|
+
std::shared_ptr<IOJob> job;
|
|
394
|
+
|
|
395
|
+
// Pre-coalesced groups ready for dispatch (ordered by first block index)
|
|
396
|
+
std::deque<CoalescedPrefetchGroup> coalesced_groups;
|
|
397
|
+
|
|
398
|
+
// Individual block indices still pending (for RemoveFromPending lookup)
|
|
399
|
+
std::unordered_set<size_t> block_indices_to_prefetch;
|
|
400
|
+
|
|
401
|
+
std::atomic<size_t> pending_bytes_{0}; // Track remaining bytes
|
|
402
|
+
mutable port::Mutex groups_mutex_; // Protects groups and set modifications
|
|
403
|
+
};
|
|
404
|
+
|
|
405
|
+
// Remove a block from pending prefetch (called when block is read or released)
|
|
406
|
+
void ReadSet::RemoveFromPending(size_t block_index) {
|
|
407
|
+
if (!pending_prefetch_flags_ || block_index >= pending_prefetch_flags_size_) {
|
|
408
|
+
return;
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
// Atomic exchange - returns true only if it was previously true
|
|
412
|
+
if (!pending_prefetch_flags_[block_index].exchange(false)) {
|
|
413
|
+
return; // Already removed or never pending
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
if (pending_request_) {
|
|
417
|
+
MutexLock lock(&pending_request_->groups_mutex_);
|
|
418
|
+
pending_request_->block_indices_to_prefetch.erase(block_index);
|
|
419
|
+
pending_request_->pending_bytes_ -= block_sizes_[block_index];
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
// IODispatcherImpl::Impl inherits from IODispatcherImplData
|
|
424
|
+
struct IODispatcherImpl::Impl : public IODispatcherImplData,
|
|
425
|
+
public std::enable_shared_from_this<Impl> {
|
|
426
|
+
explicit Impl(const IODispatcherOptions& options);
|
|
427
|
+
~Impl() override;
|
|
428
|
+
|
|
429
|
+
// Non-copyable and non-movable
|
|
430
|
+
Impl(const Impl&) = delete;
|
|
431
|
+
Impl& operator=(const Impl&) = delete;
|
|
432
|
+
Impl(Impl&&) = delete;
|
|
433
|
+
Impl& operator=(Impl&&) = delete;
|
|
434
|
+
|
|
435
|
+
Status SubmitJob(const std::shared_ptr<IOJob>& job,
|
|
436
|
+
std::shared_ptr<ReadSet>* read_set);
|
|
437
|
+
|
|
438
|
+
// Memory management methods - non-blocking
|
|
439
|
+
bool TryAcquireMemory(size_t bytes);
|
|
440
|
+
void ReleaseMemory(size_t bytes) override;
|
|
441
|
+
|
|
442
|
+
// Memory limiting state
|
|
443
|
+
size_t max_prefetch_memory_bytes_ = 0;
|
|
444
|
+
std::atomic<size_t> memory_used_{0}; // Atomic for lock-free accounting
|
|
445
|
+
std::atomic<bool> has_pending_requests_{false}; // Fast-path check
|
|
446
|
+
port::Mutex memory_mutex_; // Only for pending_prefetch_queue_ access
|
|
447
|
+
std::deque<std::shared_ptr<PendingPrefetchRequest>> pending_prefetch_queue_;
|
|
448
|
+
Statistics* statistics_ = nullptr;
|
|
449
|
+
|
|
450
|
+
private:
|
|
451
|
+
void PrepareIORequests(
|
|
452
|
+
const std::shared_ptr<IOJob>& job,
|
|
453
|
+
const std::vector<size_t>& block_indices_to_read,
|
|
454
|
+
const std::vector<BlockHandle>& block_handles,
|
|
455
|
+
std::vector<FSReadRequest>* read_reqs,
|
|
456
|
+
std::vector<std::vector<size_t>>* coalesced_block_indices);
|
|
457
|
+
|
|
458
|
+
// Surface actual async IO errors to caller, but allow fallback for
|
|
459
|
+
// unsupported cases. Returns block indices that need sync fallback.
|
|
460
|
+
std::vector<size_t> ExecuteAsyncIO(
|
|
461
|
+
const std::shared_ptr<IOJob>& job,
|
|
462
|
+
const std::shared_ptr<ReadSet>& read_set,
|
|
463
|
+
std::vector<FSReadRequest>& read_reqs,
|
|
464
|
+
const std::vector<std::vector<size_t>>& coalesced_block_indices,
|
|
465
|
+
Status* out_status);
|
|
466
|
+
|
|
467
|
+
Status ExecuteSyncIO(
|
|
468
|
+
const std::shared_ptr<IOJob>& job,
|
|
469
|
+
const std::shared_ptr<ReadSet>& read_set,
|
|
470
|
+
std::vector<FSReadRequest>& read_reqs,
|
|
471
|
+
const std::vector<std::vector<size_t>>& coalesced_block_indices);
|
|
472
|
+
|
|
473
|
+
// Try to dispatch pending prefetch requests when memory becomes available
|
|
474
|
+
void TryDispatchPendingPrefetches();
|
|
475
|
+
|
|
476
|
+
// Dispatch prefetch for a specific ReadSet (called when memory is available)
|
|
477
|
+
void DispatchPrefetch(const std::shared_ptr<ReadSet>& read_set,
|
|
478
|
+
const std::shared_ptr<IOJob>& job,
|
|
479
|
+
const std::vector<size_t>& block_indices);
|
|
480
|
+
|
|
481
|
+
// Pre-coalesce blocks into groups, respecting max_group_bytes size limit.
|
|
482
|
+
// Returns groups ordered by first block index (earlier blocks first).
|
|
483
|
+
std::vector<CoalescedPrefetchGroup> PreCoalesceBlocks(
|
|
484
|
+
const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& rs,
|
|
485
|
+
const std::vector<size_t>& block_indices, size_t max_group_bytes);
|
|
486
|
+
};
|
|
487
|
+
|
|
488
|
+
// Copies the prefetch memory limit and the statistics sink out of `options`.
// All other members keep their in-class defaults.
IODispatcherImpl::Impl::Impl(const IODispatcherOptions& options)
    : max_prefetch_memory_bytes_(options.max_prefetch_memory_bytes),
      statistics_(options.statistics) {
}
|
|
491
|
+
|
|
492
|
+
// Nothing to clean up beyond what the members' own destructors do.
IODispatcherImpl::Impl::~Impl() = default;
|
|
493
|
+
|
|
494
|
+
bool IODispatcherImpl::Impl::TryAcquireMemory(size_t bytes) {
|
|
495
|
+
if (max_prefetch_memory_bytes_ == 0) {
|
|
496
|
+
return true; // No limit configured
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
// Lock-free memory acquisition using compare-exchange
|
|
500
|
+
size_t current = memory_used_.load(std::memory_order_relaxed);
|
|
501
|
+
while (true) {
|
|
502
|
+
if (current + bytes > max_prefetch_memory_bytes_) {
|
|
503
|
+
// Not enough memory - caller should queue for later
|
|
504
|
+
RecordTick(statistics_, PREFETCH_MEMORY_REQUESTS_BLOCKED);
|
|
505
|
+
return false;
|
|
506
|
+
}
|
|
507
|
+
if (memory_used_.compare_exchange_weak(current, current + bytes,
|
|
508
|
+
std::memory_order_release,
|
|
509
|
+
std::memory_order_relaxed)) {
|
|
510
|
+
RecordTick(statistics_, PREFETCH_MEMORY_BYTES_GRANTED, bytes);
|
|
511
|
+
return true;
|
|
512
|
+
}
|
|
513
|
+
// current is updated by compare_exchange_weak on failure, retry
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
void IODispatcherImpl::Impl::ReleaseMemory(size_t bytes) {
|
|
518
|
+
if (max_prefetch_memory_bytes_ == 0) {
|
|
519
|
+
return; // No limit configured
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
// Lock-free memory release using atomic fetch_sub
|
|
523
|
+
size_t old_val = memory_used_.fetch_sub(bytes, std::memory_order_release);
|
|
524
|
+
assert(old_val >= bytes);
|
|
525
|
+
(void)old_val; // Suppress unused warning in release builds
|
|
526
|
+
RecordTick(statistics_, PREFETCH_MEMORY_BYTES_RELEASED, bytes);
|
|
527
|
+
|
|
528
|
+
// Fast-path: skip dispatch attempt if no pending requests
|
|
529
|
+
// This avoids mutex contention in the common single-threaded iterator case
|
|
530
|
+
if (!has_pending_requests_.load(std::memory_order_acquire)) {
|
|
531
|
+
return;
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
// Try to dispatch pending prefetches now that memory is available
|
|
535
|
+
TryDispatchPendingPrefetches();
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
void IODispatcherImpl::Impl::TryDispatchPendingPrefetches() {
|
|
539
|
+
// Process pending prefetch requests - dispatch entire coalesced groups
|
|
540
|
+
while (true) {
|
|
541
|
+
std::shared_ptr<PendingPrefetchRequest> pending;
|
|
542
|
+
|
|
543
|
+
{
|
|
544
|
+
MutexLock lock(&memory_mutex_);
|
|
545
|
+
if (pending_prefetch_queue_.empty()) {
|
|
546
|
+
has_pending_requests_.store(false, std::memory_order_release);
|
|
547
|
+
return;
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
// Get the next pending request
|
|
551
|
+
pending = std::move(pending_prefetch_queue_.front());
|
|
552
|
+
pending_prefetch_queue_.pop_front();
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
// Check if the ReadSet is still alive
|
|
556
|
+
auto read_set = pending->read_set.lock();
|
|
557
|
+
if (!read_set) {
|
|
558
|
+
continue; // ReadSet was destroyed, skip this request
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
// Try to acquire memory for coalesced groups (entire groups at a time)
|
|
562
|
+
std::vector<size_t> blocks_to_dispatch;
|
|
563
|
+
bool has_remaining_groups = false;
|
|
564
|
+
|
|
565
|
+
{
|
|
566
|
+
MutexLock lock(&pending->groups_mutex_);
|
|
567
|
+
|
|
568
|
+
while (!pending->coalesced_groups.empty()) {
|
|
569
|
+
auto& group = pending->coalesced_groups.front();
|
|
570
|
+
|
|
571
|
+
// Filter out blocks that were already read (not in pending set anymore)
|
|
572
|
+
std::vector<size_t> remaining_blocks;
|
|
573
|
+
size_t remaining_bytes = 0;
|
|
574
|
+
for (size_t idx : group.block_indices) {
|
|
575
|
+
if (pending->block_indices_to_prefetch.count(idx) > 0) {
|
|
576
|
+
remaining_blocks.push_back(idx);
|
|
577
|
+
remaining_bytes += read_set->block_sizes_[idx];
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
// Skip empty groups (all blocks were already read)
|
|
582
|
+
if (remaining_blocks.empty()) {
|
|
583
|
+
pending->coalesced_groups.pop_front();
|
|
584
|
+
continue;
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
// Try to acquire memory for remaining blocks only
|
|
588
|
+
if (TryAcquireMemory(remaining_bytes)) {
|
|
589
|
+
// Add all remaining blocks from this group to dispatch
|
|
590
|
+
for (size_t idx : remaining_blocks) {
|
|
591
|
+
blocks_to_dispatch.push_back(idx);
|
|
592
|
+
pending->block_indices_to_prefetch.erase(idx);
|
|
593
|
+
}
|
|
594
|
+
pending->pending_bytes_ -= remaining_bytes;
|
|
595
|
+
pending->coalesced_groups.pop_front();
|
|
596
|
+
} else {
|
|
597
|
+
// Not enough memory for this group - update with remaining blocks
|
|
598
|
+
group.block_indices = std::move(remaining_blocks);
|
|
599
|
+
group.total_bytes = remaining_bytes;
|
|
600
|
+
has_remaining_groups = true;
|
|
601
|
+
break;
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
// Save job before potential move of pending
|
|
607
|
+
auto job = pending->job;
|
|
608
|
+
|
|
609
|
+
// Requeue if groups remain
|
|
610
|
+
if (has_remaining_groups) {
|
|
611
|
+
MutexLock lock(&memory_mutex_);
|
|
612
|
+
pending_prefetch_queue_.push_front(std::move(pending));
|
|
613
|
+
} else {
|
|
614
|
+
// All groups dispatched, clear pending state
|
|
615
|
+
read_set->pending_request_.reset();
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
// Clear pending flags for dispatched blocks
|
|
619
|
+
if (read_set->pending_prefetch_flags_) {
|
|
620
|
+
for (size_t idx : blocks_to_dispatch) {
|
|
621
|
+
if (idx < read_set->pending_prefetch_flags_size_) {
|
|
622
|
+
read_set->pending_prefetch_flags_[idx].store(false);
|
|
623
|
+
}
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
// Dispatch acquired blocks
|
|
628
|
+
if (!blocks_to_dispatch.empty()) {
|
|
629
|
+
DispatchPrefetch(read_set, job, blocks_to_dispatch);
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
// If we dispatched nothing, stop (no memory available for any group)
|
|
633
|
+
if (blocks_to_dispatch.empty()) {
|
|
634
|
+
return;
|
|
635
|
+
}
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
void IODispatcherImpl::Impl::DispatchPrefetch(
|
|
640
|
+
const std::shared_ptr<ReadSet>& read_set, const std::shared_ptr<IOJob>& job,
|
|
641
|
+
const std::vector<size_t>& block_indices) {
|
|
642
|
+
// Sync point for testing partial prefetch - passes number of blocks being
|
|
643
|
+
// dispatched
|
|
644
|
+
TEST_SYNC_POINT_CALLBACK("IODispatcherImpl::DispatchPrefetch:BlockCount",
|
|
645
|
+
const_cast<std::vector<size_t>*>(&block_indices));
|
|
646
|
+
|
|
647
|
+
// Prepare and execute IO for the given blocks
|
|
648
|
+
std::vector<FSReadRequest> read_reqs;
|
|
649
|
+
std::vector<std::vector<size_t>> coalesced_block_indices;
|
|
650
|
+
PrepareIORequests(job, block_indices, job->block_handles, &read_reqs,
|
|
651
|
+
&coalesced_block_indices);
|
|
652
|
+
|
|
653
|
+
if (job->job_options.read_options.async_io) {
|
|
654
|
+
Status async_status;
|
|
655
|
+
std::vector<size_t> fallback_indices = ExecuteAsyncIO(
|
|
656
|
+
job, read_set, read_reqs, coalesced_block_indices, &async_status);
|
|
657
|
+
|
|
658
|
+
// For blocks where async is not supported, do sync IO
|
|
659
|
+
if (!fallback_indices.empty()) {
|
|
660
|
+
std::vector<FSReadRequest> sync_read_reqs;
|
|
661
|
+
std::vector<std::vector<size_t>> sync_coalesced_indices;
|
|
662
|
+
PrepareIORequests(job, fallback_indices, job->block_handles,
|
|
663
|
+
&sync_read_reqs, &sync_coalesced_indices);
|
|
664
|
+
// Prefetch errors are ignored - user will get the error when reading
|
|
665
|
+
Status s =
|
|
666
|
+
ExecuteSyncIO(job, read_set, sync_read_reqs, sync_coalesced_indices);
|
|
667
|
+
s.PermitUncheckedError();
|
|
668
|
+
read_set->num_sync_reads_ += fallback_indices.size();
|
|
669
|
+
}
|
|
670
|
+
// Async errors are also ignored - user will get the error when reading
|
|
671
|
+
async_status.PermitUncheckedError();
|
|
672
|
+
} else {
|
|
673
|
+
// Prefetch errors are ignored - user will get the error when reading
|
|
674
|
+
Status s = ExecuteSyncIO(job, read_set, read_reqs, coalesced_block_indices);
|
|
675
|
+
s.PermitUncheckedError();
|
|
676
|
+
read_set->num_sync_reads_ += block_indices.size();
|
|
677
|
+
}
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
// Creates a ReadSet for `job` and starts prefetching its blocks.
//
// Steps:
//   1. Look up every block in the cache and pin the ones found.
//   2. Record the sizes of the uncached blocks and coalesce them into groups.
//   3. Dispatch each group whose memory can be acquired right away; queue
//      the rest until memory is released.
//
// Returns InvalidArgument if `read_set` is null, otherwise OK with the new
// ReadSet stored in `*read_set`. Prefetch errors are not reported here.
Status IODispatcherImpl::Impl::SubmitJob(const std::shared_ptr<IOJob>& job,
                                         std::shared_ptr<ReadSet>* read_set) {
  if (!read_set) {
    return Status::InvalidArgument("read_set output parameter is null");
  }

  auto rs = std::make_shared<ReadSet>();
  const size_t num_blocks = job->block_handles.size();

  // Initialize ReadSet
  rs->job_ = job;
  rs->fs_ = job->table->get_rep()->ioptions.env->GetFileSystem();
  rs->pinned_blocks_.resize(num_blocks);
  rs->block_sizes_.resize(num_blocks, 0);

  // Build the sorted index used by ReadOffset's binary search:
  // sorted_block_indices_[i] = original index of the i-th smallest block by
  // offset.
  rs->sorted_block_indices_.resize(num_blocks);
  for (size_t i = 0; i < num_blocks; ++i) {
    rs->sorted_block_indices_[i] = i;
  }
  std::sort(rs->sorted_block_indices_.begin(), rs->sorted_block_indices_.end(),
            [&job](size_t a, size_t b) {
              return job->block_handles[a].offset() <
                     job->block_handles[b].offset();
            });

  // Step 1: Check cache and pin cached blocks.
  std::vector<size_t> block_indices_to_read;
  size_t cache_hits = 0;

  for (size_t i = 0; i < num_blocks; ++i) {
    const auto& data_block_handle = job->block_handles[i];

    Status s = job->table->LookupAndPinBlocksInCache<Block_kData>(
        job->job_options.read_options, data_block_handle,
        &(rs->pinned_blocks_)[i].As<Block_kData>());

    if (!s.ok()) {
      // A failed lookup is neither a cache hit nor scheduled for prefetch;
      // the block is read on demand by ReadIndex.
      continue;
    }

    if ((rs->pinned_blocks_)[i].GetValue()) {
      ++cache_hits;
    } else {
      // Block not in cache - needs to be read from disk
      block_indices_to_read.emplace_back(i);
    }
  }

  // Count only blocks that were actually pinned from the cache. Deriving the
  // number from `num_blocks - block_indices_to_read.size()` would also count
  // blocks whose lookup failed.
  rs->num_cache_hits_ = cache_hits;

  // Step 2: Prepare IO for blocks not in cache.
  if (block_indices_to_read.empty()) {
    *read_set = std::move(rs);
    return Status::OK();
  }

  // Record the on-disk size (including trailer) of each uncached block.
  for (const auto& idx : block_indices_to_read) {
    size_t block_size =
        BlockBasedTable::BlockSizeWithTrailer(job->block_handles[idx]);
    rs->block_sizes_[idx] = block_size;
  }

  // Give the ReadSet a reference to this dispatcher for memory release.
  rs->dispatcher_data_ = shared_from_this();

  // Pre-coalesce blocks into groups bounded by the memory budget, so that
  // meaningful IO sizes are dispatched rather than tiny single-block IOs.
  // The same coalescing is used with and without a memory limit.
  auto coalesced_groups = PreCoalesceBlocks(job, rs, block_indices_to_read,
                                            max_prefetch_memory_bytes_);

  std::vector<size_t> blocks_to_dispatch;
  std::deque<CoalescedPrefetchGroup> groups_to_queue;

  // Memory is acquired for whole groups only.
  for (auto& group : coalesced_groups) {
    if (TryAcquireMemory(group.total_bytes)) {
      blocks_to_dispatch.insert(blocks_to_dispatch.end(),
                                group.block_indices.begin(),
                                group.block_indices.end());
    } else {
      // Queue this group for later
      groups_to_queue.push_back(std::move(group));
    }
  }

  // Dispatch acquired blocks immediately.
  if (!blocks_to_dispatch.empty()) {
    DispatchPrefetch(rs, job, blocks_to_dispatch);
  }

  // Queue the remaining groups (only happens when memory limiting applies).
  if (!groups_to_queue.empty()) {
    auto pending = std::make_shared<PendingPrefetchRequest>();
    pending->read_set = rs;
    pending->job = job;

    size_t pending_bytes = 0;
    for (const auto& group : groups_to_queue) {
      for (size_t idx : group.block_indices) {
        pending->block_indices_to_prefetch.insert(idx);
      }
      pending_bytes += group.total_bytes;
    }
    pending->coalesced_groups = std::move(groups_to_queue);
    pending->pending_bytes_ = pending_bytes;

    // Flag exactly the queued blocks as pending.
    rs->pending_prefetch_flags_ =
        std::make_unique<std::atomic<bool>[]>(num_blocks);
    rs->pending_prefetch_flags_size_ = num_blocks;
    for (size_t idx : pending->block_indices_to_prefetch) {
      rs->pending_prefetch_flags_[idx].store(true);
    }
    rs->pending_request_ = pending;

    {
      MutexLock lock(&memory_mutex_);
      pending_prefetch_queue_.push_back(std::move(pending));
      has_pending_requests_.store(true, std::memory_order_release);
    }
  }

  *read_set = std::move(rs);
  return Status::OK();
}
|
|
812
|
+
|
|
813
|
+
void IODispatcherImpl::Impl::PrepareIORequests(
|
|
814
|
+
const std::shared_ptr<IOJob>& job,
|
|
815
|
+
const std::vector<size_t>& block_indices_to_read,
|
|
816
|
+
const std::vector<BlockHandle>& block_handles,
|
|
817
|
+
std::vector<FSReadRequest>* read_reqs,
|
|
818
|
+
std::vector<std::vector<size_t>>* coalesced_block_indices) {
|
|
819
|
+
// This is necessary because block handles may not be in sorted order
|
|
820
|
+
std::vector<size_t> sorted_block_indices = block_indices_to_read;
|
|
821
|
+
std::sort(sorted_block_indices.begin(), sorted_block_indices.end(),
|
|
822
|
+
[&block_handles](size_t a, size_t b) {
|
|
823
|
+
return block_handles[a].offset() < block_handles[b].offset();
|
|
824
|
+
});
|
|
825
|
+
|
|
826
|
+
assert(coalesced_block_indices->empty());
|
|
827
|
+
coalesced_block_indices->resize(1);
|
|
828
|
+
|
|
829
|
+
for (const auto& block_idx : sorted_block_indices) {
|
|
830
|
+
if (!coalesced_block_indices->back().empty()) {
|
|
831
|
+
// Check if we can coalesce with previous block
|
|
832
|
+
const auto& last_block_handle =
|
|
833
|
+
block_handles[coalesced_block_indices->back().back()];
|
|
834
|
+
uint64_t last_block_end =
|
|
835
|
+
last_block_handle.offset() +
|
|
836
|
+
BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
|
|
837
|
+
uint64_t current_start = block_handles[block_idx].offset();
|
|
838
|
+
|
|
839
|
+
if (current_start >
|
|
840
|
+
last_block_end + job->job_options.io_coalesce_threshold) {
|
|
841
|
+
// Gap too large - start new IO request
|
|
842
|
+
coalesced_block_indices->emplace_back();
|
|
843
|
+
}
|
|
844
|
+
}
|
|
845
|
+
coalesced_block_indices->back().emplace_back(block_idx);
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
// Create FSReadRequest for each coalesced group
|
|
849
|
+
assert(read_reqs->empty());
|
|
850
|
+
read_reqs->reserve(coalesced_block_indices->size());
|
|
851
|
+
|
|
852
|
+
for (const auto& block_indices : *coalesced_block_indices) {
|
|
853
|
+
assert(!block_indices.empty());
|
|
854
|
+
|
|
855
|
+
// Find the min and max offsets in this coalesced group
|
|
856
|
+
// Since blocks are now sorted, first has min offset and last has max
|
|
857
|
+
const auto& first_block_handle = block_handles[block_indices[0]];
|
|
858
|
+
const auto& last_block_handle = block_handles[block_indices.back()];
|
|
859
|
+
|
|
860
|
+
const auto start_offset = first_block_handle.offset();
|
|
861
|
+
const auto end_offset =
|
|
862
|
+
last_block_handle.offset() +
|
|
863
|
+
BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
|
|
864
|
+
|
|
865
|
+
assert(end_offset > start_offset);
|
|
866
|
+
|
|
867
|
+
read_reqs->emplace_back();
|
|
868
|
+
read_reqs->back().offset = start_offset;
|
|
869
|
+
read_reqs->back().len = end_offset - start_offset;
|
|
870
|
+
read_reqs->back().scratch = nullptr;
|
|
871
|
+
}
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
std::vector<CoalescedPrefetchGroup> IODispatcherImpl::Impl::PreCoalesceBlocks(
|
|
875
|
+
const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& rs,
|
|
876
|
+
const std::vector<size_t>& block_indices, size_t max_group_bytes) {
|
|
877
|
+
std::vector<CoalescedPrefetchGroup> groups;
|
|
878
|
+
|
|
879
|
+
if (block_indices.empty()) {
|
|
880
|
+
return groups;
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
const auto& block_handles = job->block_handles;
|
|
884
|
+
const uint64_t coalesce_threshold = job->job_options.io_coalesce_threshold;
|
|
885
|
+
|
|
886
|
+
// Sort block indices by offset for coalescing
|
|
887
|
+
std::vector<size_t> sorted_indices = block_indices;
|
|
888
|
+
std::sort(sorted_indices.begin(), sorted_indices.end(),
|
|
889
|
+
[&block_handles](size_t a, size_t b) {
|
|
890
|
+
return block_handles[a].offset() < block_handles[b].offset();
|
|
891
|
+
});
|
|
892
|
+
|
|
893
|
+
// Build coalesced groups respecting max_group_bytes
|
|
894
|
+
groups.emplace_back();
|
|
895
|
+
|
|
896
|
+
for (size_t idx : sorted_indices) {
|
|
897
|
+
size_t block_size = rs->block_sizes_[idx];
|
|
898
|
+
|
|
899
|
+
// Skip blocks that are individually larger than the memory budget
|
|
900
|
+
// These will be read synchronously when needed (via ReadIndex fallback)
|
|
901
|
+
if (max_group_bytes > 0 && block_size > max_group_bytes) {
|
|
902
|
+
continue;
|
|
903
|
+
}
|
|
904
|
+
|
|
905
|
+
// Check if we need to start a new group
|
|
906
|
+
bool start_new_group = false;
|
|
907
|
+
|
|
908
|
+
if (!groups.back().block_indices.empty()) {
|
|
909
|
+
// Check gap with previous block
|
|
910
|
+
size_t last_idx = groups.back().block_indices.back();
|
|
911
|
+
const auto& last_handle = block_handles[last_idx];
|
|
912
|
+
uint64_t last_end = last_handle.offset() +
|
|
913
|
+
BlockBasedTable::BlockSizeWithTrailer(last_handle);
|
|
914
|
+
uint64_t current_start = block_handles[idx].offset();
|
|
915
|
+
|
|
916
|
+
if (current_start > last_end + coalesce_threshold) {
|
|
917
|
+
start_new_group = true; // Gap too large
|
|
918
|
+
} else if (max_group_bytes > 0 &&
|
|
919
|
+
groups.back().total_bytes + block_size > max_group_bytes) {
|
|
920
|
+
start_new_group = true; // Would exceed size limit
|
|
921
|
+
}
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
if (start_new_group) {
|
|
925
|
+
groups.emplace_back();
|
|
926
|
+
}
|
|
927
|
+
|
|
928
|
+
groups.back().block_indices.push_back(idx);
|
|
929
|
+
groups.back().total_bytes += block_size;
|
|
930
|
+
}
|
|
931
|
+
|
|
932
|
+
return groups;
|
|
933
|
+
}
|
|
934
|
+
|
|
935
|
+
std::vector<size_t> IODispatcherImpl::Impl::ExecuteAsyncIO(
|
|
936
|
+
const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
|
|
937
|
+
std::vector<FSReadRequest>& read_reqs,
|
|
938
|
+
const std::vector<std::vector<size_t>>& coalesced_block_indices,
|
|
939
|
+
Status* out_status) {
|
|
940
|
+
std::vector<size_t> fallback_block_indices;
|
|
941
|
+
*out_status = Status::OK();
|
|
942
|
+
|
|
943
|
+
// Get file and IO options
|
|
944
|
+
auto* rep = job->table->get_rep();
|
|
945
|
+
IOOptions io_opts;
|
|
946
|
+
Status s =
|
|
947
|
+
rep->file->PrepareIOOptions(job->job_options.read_options, io_opts);
|
|
948
|
+
if (!s.ok()) {
|
|
949
|
+
*out_status = s;
|
|
950
|
+
return fallback_block_indices;
|
|
951
|
+
}
|
|
952
|
+
|
|
953
|
+
const bool direct_io = rep->file->use_direct_io();
|
|
954
|
+
|
|
955
|
+
// Submit async read requests and store them in the ReadSet
|
|
956
|
+
for (size_t i = 0; i < read_reqs.size(); ++i) {
|
|
957
|
+
auto async_state = std::make_shared<AsyncIOState>();
|
|
958
|
+
|
|
959
|
+
async_state->offset = read_reqs[i].offset;
|
|
960
|
+
async_state->block_indices = coalesced_block_indices[i];
|
|
961
|
+
async_state->read_req = std::move(read_reqs[i]);
|
|
962
|
+
|
|
963
|
+
for (const auto idx : coalesced_block_indices[i]) {
|
|
964
|
+
async_state->blocks.emplace_back(job->block_handles[idx]);
|
|
965
|
+
}
|
|
966
|
+
|
|
967
|
+
if (direct_io) {
|
|
968
|
+
async_state->read_req.scratch = nullptr;
|
|
969
|
+
} else {
|
|
970
|
+
async_state->buf.reset(new char[async_state->read_req.len]);
|
|
971
|
+
async_state->read_req.scratch = async_state->buf.get();
|
|
972
|
+
}
|
|
973
|
+
|
|
974
|
+
// Callback for async read completion
|
|
975
|
+
// Store the result slice and status back into async_state so we can access
|
|
976
|
+
// them after Poll() completes.
|
|
977
|
+
auto cb = [](const FSReadRequest& req, void* cb_arg) {
|
|
978
|
+
auto* state = static_cast<AsyncIOState*>(cb_arg);
|
|
979
|
+
state->read_req.result = req.result;
|
|
980
|
+
state->read_req.status = req.status;
|
|
981
|
+
};
|
|
982
|
+
|
|
983
|
+
s = rep->file->ReadAsync(async_state->read_req, io_opts, cb,
|
|
984
|
+
async_state.get(), &async_state->io_handle,
|
|
985
|
+
&async_state->del_fn,
|
|
986
|
+
direct_io ? &async_state->aligned_buf : nullptr);
|
|
987
|
+
|
|
988
|
+
if (!s.ok()) {
|
|
989
|
+
// Actual error - surface to caller
|
|
990
|
+
*out_status = s;
|
|
991
|
+
return fallback_block_indices;
|
|
992
|
+
}
|
|
993
|
+
|
|
994
|
+
if (async_state->io_handle == nullptr) {
|
|
995
|
+
// Async IO not supported - add to fallback list for sync IO
|
|
996
|
+
for (const auto idx : coalesced_block_indices[i]) {
|
|
997
|
+
fallback_block_indices.push_back(idx);
|
|
998
|
+
}
|
|
999
|
+
continue;
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
// Add async state to map for all blocks in this request
|
|
1003
|
+
for (const auto idx : async_state->block_indices) {
|
|
1004
|
+
read_set->async_io_map_[idx] = async_state;
|
|
1005
|
+
}
|
|
1006
|
+
}
|
|
1007
|
+
|
|
1008
|
+
return fallback_block_indices;
|
|
1009
|
+
}
|
|
1010
|
+
|
|
1011
|
+
// Reads all requests in read_reqs with a single MultiRead on the table's file
// and pins the resulting blocks into read_set.
//
// Without direct IO, one buffer sized to the sum of all request lengths is
// allocated and each request's scratch points at its own slice of it. With
// direct IO, scratch is left null and an AlignedBuf is passed to MultiRead.
//
// After the read, every request's status is checked before any block is
// created. Then, for each block index in coalesced_block_indices[i],
// CreateAndPinBlockFromBuffer is called with the result of read_reqs[i] and
// the target read_set->pinned_blocks_[block index].
//
// Returns the first non-OK status from PrepareIOOptions, MultiRead, any
// individual request, or block creation; Status::OK() otherwise.
Status IODispatcherImpl::Impl::ExecuteSyncIO(
    const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
    std::vector<FSReadRequest>& read_reqs,
    const std::vector<std::vector<size_t>>& coalesced_block_indices) {
  auto* table_rep = job->table->get_rep();

  IOOptions io_options;
  {
    Status prepare_status = table_rep->file->PrepareIOOptions(
        job->job_options.read_options, io_options);
    if (!prepare_status.ok()) {
      return prepare_status;
    }
  }

  const bool direct_io = table_rep->file->use_direct_io();

  // Backing storage for the scratch pointers; unused with direct IO.
  std::unique_ptr<char[]> scratch_storage;
  if (!direct_io) {
    size_t bytes_needed = 0;
    for (const auto& req : read_reqs) {
      bytes_needed += req.len;
    }
    scratch_storage.reset(new char[bytes_needed]);

    // Hand each request a consecutive slice of the shared buffer.
    char* cursor = scratch_storage.get();
    for (auto& req : read_reqs) {
      req.scratch = cursor;
      cursor += req.len;
    }
  } else {
    for (auto& req : read_reqs) {
      req.scratch = nullptr;
    }
  }

  AlignedBuf aligned_buf;
  {
    Status read_status = table_rep->file->MultiRead(
        io_options, read_reqs.data(), read_reqs.size(),
        direct_io ? &aligned_buf : nullptr);
    if (!read_status.ok()) {
      return read_status;
    }
  }

  // Fail before creating any block if a single request did not succeed.
  for (const auto& req : read_reqs) {
    if (!req.status.ok()) {
      return req.status;
    }
  }

  // Request i carries the data for the blocks listed in group i.
  size_t req_idx = 0;
  for (const auto& group : coalesced_block_indices) {
    const auto& completed_req = read_reqs[req_idx];
    ++req_idx;

    for (const auto& block_idx : group) {
      Status pin_status = CreateAndPinBlockFromBuffer(
          job, job->block_handles[block_idx], completed_req.offset,
          completed_req.result, read_set->pinned_blocks_[block_idx]);
      if (!pin_status.ok()) {
        return pin_status;
      }
    }
  }

  return Status::OK();
}
|
|
1079
|
+
|
|
1080
|
+
// Default construction is the same as constructing from a
// default-constructed IODispatcherOptions.
IODispatcherImpl::IODispatcherImpl()
    : IODispatcherImpl(IODispatcherOptions()) {}
|
|
1082
|
+
|
|
1083
|
+
IODispatcherImpl::IODispatcherImpl(const IODispatcherOptions& options)
|
|
1084
|
+
: impl_(std::make_shared<Impl>(options)) {}
|
|
1085
|
+
|
|
1086
|
+
// Compiler-generated destructor, defined here rather than inside the class
// declaration.
IODispatcherImpl::~IODispatcherImpl() = default;
|
|
1087
|
+
|
|
1088
|
+
// Public entry point: hands the job and the read_set out-parameter to
// Impl::SubmitJob unchanged and returns its status.
Status IODispatcherImpl::SubmitJob(const std::shared_ptr<IOJob>& job,
                                   std::shared_ptr<ReadSet>* read_set) {
  Impl& impl = *impl_;
  return impl.SubmitJob(job, read_set);
}
|
|
1092
|
+
|
|
1093
|
+
// Factory: allocates a default-constructed IODispatcherImpl with new and
// returns it as an IODispatcher*. The pointer is raw, so ownership presumably
// passes to the caller - confirm against the declaration's documentation.
IODispatcher* NewIODispatcher() { return new IODispatcherImpl(); }
|
|
1094
|
+
|
|
1095
|
+
// Factory: allocates an IODispatcherImpl configured with `options` and
// returns it through the IODispatcher interface pointer.
IODispatcher* NewIODispatcher(const IODispatcherOptions& options) {
  IODispatcher* dispatcher = new IODispatcherImpl(options);
  return dispatcher;
}
|
|
1098
|
+
|
|
1099
|
+
} // namespace ROCKSDB_NAMESPACE
|