@nxtedition/rocksdb 15.4.1 → 16.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +70 -23
- package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
- package/deps/rocksdb/rocksdb/BUCK +42 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
- package/deps/rocksdb/rocksdb/Makefile +59 -32
- package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
- package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
- package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
- package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
- package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
- package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
- package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
- package/deps/rocksdb/rocksdb/db/builder.h +7 -0
- package/deps/rocksdb/rocksdb/db/c.cc +373 -57
- package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
- package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
- package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
- package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
- package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
- package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
- package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
- package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
- package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
- package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
- package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
- package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
- package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
- package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
- package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
- package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
- package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
- package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
- package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
- package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
- package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
- package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
- package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
- package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
- package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
- package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
- package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
- package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
- package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
- package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
- package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
- package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
- package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
- package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
- package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
- package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
- package/deps/rocksdb/rocksdb/env/env.cc +1 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
- package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
- package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
- package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
- package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
- package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
- package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
- package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
- package/deps/rocksdb/rocksdb/folly.mk +22 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
- package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
- package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
- package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
- package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
- package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
- package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
- package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
- package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
- package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
- package/deps/rocksdb/rocksdb/options/options.cc +5 -1
- package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
- package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
- package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
- package/deps/rocksdb/rocksdb/port/lang.h +4 -0
- package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
- package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
- package/deps/rocksdb/rocksdb/src.mk +12 -0
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
- package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
- package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
- package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
- package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
- package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
- package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
- package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
- package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
- package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
- package/deps/rocksdb/rocksdb/table/format.cc +27 -15
- package/deps/rocksdb/rocksdb/table/format.h +41 -15
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
- package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
- package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
- package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
- package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
- package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
- package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
- package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
- package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
- package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
- package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
- package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
- package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
- package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
- package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
- package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
- package/deps/rocksdb/rocksdb/util/coding.h +14 -27
- package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
- package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
- package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
- package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
- package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
- package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
- package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
- package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
- package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
- package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
- package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
- package/deps/rocksdb/rocksdb/util/math.h +3 -1
- package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
- package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
- package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
- package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
- package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
- package/deps/rocksdb/rocksdb/util/status.cc +3 -1
- package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
- package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
- package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
- package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
- package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
- package/deps/rocksdb/rocksdb.gyp +7 -0
- package/index.js +70 -10
- package/iterator.js +25 -3
- package/max_rev_operator.h +9 -5
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
|
@@ -21,137 +21,14 @@
|
|
|
21
21
|
#include "port/stack_trace.h"
|
|
22
22
|
#include "rocksdb/comparator.h"
|
|
23
23
|
#include "table/block_based/block_prefix_index.h"
|
|
24
|
+
#include "table/block_based/block_util.h"
|
|
24
25
|
#include "table/block_based/data_block_footer.h"
|
|
25
26
|
#include "table/format.h"
|
|
26
27
|
#include "util/coding.h"
|
|
28
|
+
#include "util/math.h"
|
|
27
29
|
|
|
28
30
|
namespace ROCKSDB_NAMESPACE {
|
|
29
31
|
|
|
30
|
-
// Helper routine: decode the next block entry starting at "p",
|
|
31
|
-
// storing the number of shared key bytes, non_shared key bytes,
|
|
32
|
-
// and the length of the value in "*shared", "*non_shared", and
|
|
33
|
-
// "*value_length", respectively. Will not dereference past "limit".
|
|
34
|
-
//
|
|
35
|
-
// If any errors are detected, returns nullptr. Otherwise, returns a
|
|
36
|
-
// pointer to the key delta (just past the three decoded values).
|
|
37
|
-
struct DecodeEntry {
|
|
38
|
-
inline const char* operator()(const char* p, const char* limit,
|
|
39
|
-
uint32_t* shared, uint32_t* non_shared,
|
|
40
|
-
uint32_t* value_length) {
|
|
41
|
-
// We need 2 bytes for shared and non_shared size. We also need one more
|
|
42
|
-
// byte either for value size or the actual value in case of value delta
|
|
43
|
-
// encoding.
|
|
44
|
-
assert(limit - p >= 3);
|
|
45
|
-
*shared = reinterpret_cast<const unsigned char*>(p)[0];
|
|
46
|
-
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
|
|
47
|
-
*value_length = reinterpret_cast<const unsigned char*>(p)[2];
|
|
48
|
-
if ((*shared | *non_shared | *value_length) < 128) {
|
|
49
|
-
// Fast path: all three values are encoded in one byte each
|
|
50
|
-
p += 3;
|
|
51
|
-
} else {
|
|
52
|
-
if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
|
|
53
|
-
return nullptr;
|
|
54
|
-
}
|
|
55
|
-
if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
|
|
56
|
-
return nullptr;
|
|
57
|
-
}
|
|
58
|
-
if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
|
|
59
|
-
return nullptr;
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
// Using an assert in place of "return null" since we should not pay the
|
|
64
|
-
// cost of checking for corruption on every single key decoding
|
|
65
|
-
assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
|
|
66
|
-
return p;
|
|
67
|
-
}
|
|
68
|
-
};
|
|
69
|
-
|
|
70
|
-
// Helper routine: similar to DecodeEntry but does not have assertions.
|
|
71
|
-
// Instead, returns nullptr so that caller can detect and report failure.
|
|
72
|
-
struct CheckAndDecodeEntry {
|
|
73
|
-
inline const char* operator()(const char* p, const char* limit,
|
|
74
|
-
uint32_t* shared, uint32_t* non_shared,
|
|
75
|
-
uint32_t* value_length) {
|
|
76
|
-
// We need 2 bytes for shared and non_shared size. We also need one more
|
|
77
|
-
// byte either for value size or the actual value in case of value delta
|
|
78
|
-
// encoding.
|
|
79
|
-
if (limit - p < 3) {
|
|
80
|
-
return nullptr;
|
|
81
|
-
}
|
|
82
|
-
*shared = reinterpret_cast<const unsigned char*>(p)[0];
|
|
83
|
-
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
|
|
84
|
-
*value_length = reinterpret_cast<const unsigned char*>(p)[2];
|
|
85
|
-
if ((*shared | *non_shared | *value_length) < 128) {
|
|
86
|
-
// Fast path: all three values are encoded in one byte each
|
|
87
|
-
p += 3;
|
|
88
|
-
} else {
|
|
89
|
-
if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
|
|
90
|
-
return nullptr;
|
|
91
|
-
}
|
|
92
|
-
if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
|
|
93
|
-
return nullptr;
|
|
94
|
-
}
|
|
95
|
-
if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
|
|
96
|
-
return nullptr;
|
|
97
|
-
}
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
|
|
101
|
-
return nullptr;
|
|
102
|
-
}
|
|
103
|
-
return p;
|
|
104
|
-
}
|
|
105
|
-
};
|
|
106
|
-
|
|
107
|
-
struct DecodeKey {
|
|
108
|
-
inline const char* operator()(const char* p, const char* limit,
|
|
109
|
-
uint32_t* shared, uint32_t* non_shared) {
|
|
110
|
-
uint32_t value_length;
|
|
111
|
-
return DecodeEntry()(p, limit, shared, non_shared, &value_length);
|
|
112
|
-
}
|
|
113
|
-
};
|
|
114
|
-
|
|
115
|
-
// In format_version 4, which is used by index blocks, the value size is not
|
|
116
|
-
// encoded before the entry, as the value is known to be the handle with the
|
|
117
|
-
// known size.
|
|
118
|
-
struct DecodeKeyV4 {
|
|
119
|
-
inline const char* operator()(const char* p, const char* limit,
|
|
120
|
-
uint32_t* shared, uint32_t* non_shared) {
|
|
121
|
-
// We need 2 bytes for shared and non_shared size. We also need one more
|
|
122
|
-
// byte either for value size or the actual value in case of value delta
|
|
123
|
-
// encoding.
|
|
124
|
-
if (limit - p < 3) {
|
|
125
|
-
return nullptr;
|
|
126
|
-
}
|
|
127
|
-
*shared = reinterpret_cast<const unsigned char*>(p)[0];
|
|
128
|
-
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
|
|
129
|
-
if ((*shared | *non_shared) < 128) {
|
|
130
|
-
// Fast path: both values are encoded in one byte each
|
|
131
|
-
p += 2;
|
|
132
|
-
} else {
|
|
133
|
-
if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
|
|
134
|
-
return nullptr;
|
|
135
|
-
}
|
|
136
|
-
if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
|
|
137
|
-
return nullptr;
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
return p;
|
|
141
|
-
}
|
|
142
|
-
};
|
|
143
|
-
|
|
144
|
-
struct DecodeEntryV4 {
|
|
145
|
-
inline const char* operator()(const char* p, const char* limit,
|
|
146
|
-
uint32_t* shared, uint32_t* non_shared,
|
|
147
|
-
uint32_t* value_length) {
|
|
148
|
-
assert(value_length);
|
|
149
|
-
|
|
150
|
-
*value_length = 0;
|
|
151
|
-
return DecodeKeyV4()(p, limit, shared, non_shared);
|
|
152
|
-
}
|
|
153
|
-
};
|
|
154
|
-
|
|
155
32
|
void DataBlockIter::NextImpl() {
|
|
156
33
|
#ifndef NDEBUG
|
|
157
34
|
if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) {
|
|
@@ -160,28 +37,24 @@ void DataBlockIter::NextImpl() {
|
|
|
160
37
|
#endif
|
|
161
38
|
bool is_shared = false;
|
|
162
39
|
ParseNextDataKey(&is_shared);
|
|
163
|
-
++cur_entry_idx_;
|
|
164
40
|
}
|
|
165
41
|
|
|
166
42
|
void MetaBlockIter::NextImpl() {
|
|
167
43
|
bool is_shared = false;
|
|
168
|
-
ParseNextKey<
|
|
169
|
-
++cur_entry_idx_;
|
|
44
|
+
ParseNextKey<DecodeEntry, true>(&is_shared);
|
|
170
45
|
}
|
|
171
46
|
|
|
172
|
-
void IndexBlockIter::NextImpl() {
|
|
173
|
-
ParseNextIndexKey();
|
|
174
|
-
++cur_entry_idx_;
|
|
175
|
-
}
|
|
47
|
+
void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }
|
|
176
48
|
|
|
177
49
|
void IndexBlockIter::PrevImpl() {
|
|
178
50
|
assert(Valid());
|
|
179
51
|
// Scan backwards to a restart point before current_
|
|
180
52
|
const uint32_t original = current_;
|
|
53
|
+
const auto prev_entry_idx = cur_entry_idx_ - 1;
|
|
181
54
|
while (GetRestartPoint(restart_index_) >= original) {
|
|
182
55
|
if (restart_index_ == 0) {
|
|
183
56
|
// No more entries
|
|
184
|
-
current_ =
|
|
57
|
+
current_ = GetKeysEndOffset();
|
|
185
58
|
restart_index_ = num_restarts_;
|
|
186
59
|
return;
|
|
187
60
|
}
|
|
@@ -191,17 +64,18 @@ void IndexBlockIter::PrevImpl() {
|
|
|
191
64
|
// Loop until end of current entry hits the start of original entry
|
|
192
65
|
while (ParseNextIndexKey() && NextEntryOffset() < original) {
|
|
193
66
|
}
|
|
194
|
-
|
|
67
|
+
cur_entry_idx_ = prev_entry_idx;
|
|
195
68
|
}
|
|
196
69
|
|
|
197
70
|
void MetaBlockIter::PrevImpl() {
|
|
198
71
|
assert(Valid());
|
|
199
72
|
// Scan backwards to a restart point before current_
|
|
200
73
|
const uint32_t original = current_;
|
|
74
|
+
const auto prev_entry_idx = cur_entry_idx_ - 1;
|
|
201
75
|
while (GetRestartPoint(restart_index_) >= original) {
|
|
202
76
|
if (restart_index_ == 0) {
|
|
203
77
|
// No more entries
|
|
204
|
-
current_ =
|
|
78
|
+
current_ = GetKeysEndOffset();
|
|
205
79
|
restart_index_ = num_restarts_;
|
|
206
80
|
return;
|
|
207
81
|
}
|
|
@@ -210,19 +84,19 @@ void MetaBlockIter::PrevImpl() {
|
|
|
210
84
|
SeekToRestartPoint(restart_index_);
|
|
211
85
|
bool is_shared = false;
|
|
212
86
|
// Loop until end of current entry hits the start of original entry
|
|
213
|
-
while (ParseNextKey<
|
|
87
|
+
while (ParseNextKey<DecodeEntry, true>(&is_shared) &&
|
|
214
88
|
NextEntryOffset() < original) {
|
|
215
89
|
}
|
|
216
|
-
|
|
90
|
+
cur_entry_idx_ = prev_entry_idx;
|
|
217
91
|
}
|
|
218
92
|
|
|
219
93
|
// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
|
|
220
94
|
void DataBlockIter::PrevImpl() {
|
|
221
95
|
assert(Valid());
|
|
222
96
|
|
|
97
|
+
const auto prev_entry_idx = cur_entry_idx_ - 1;
|
|
223
98
|
assert(prev_entries_idx_ == -1 ||
|
|
224
99
|
static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
|
|
225
|
-
--cur_entry_idx_;
|
|
226
100
|
// Check if we can use cached prev_entries_
|
|
227
101
|
if (prev_entries_idx_ > 0 &&
|
|
228
102
|
prev_entries_[prev_entries_idx_].offset == current_) {
|
|
@@ -252,7 +126,9 @@ void DataBlockIter::PrevImpl() {
|
|
|
252
126
|
// (i.e., keys in it are not actually pinned).
|
|
253
127
|
raw_key_.SetKey(current_key, raw_key_cached /* copy */);
|
|
254
128
|
value_ = current_prev_entry.value;
|
|
255
|
-
|
|
129
|
+
// Set entry_ using stored entry_size for NextEntryOffset() to work
|
|
130
|
+
entry_ = Slice(data_ + current_, current_prev_entry.entry_size);
|
|
131
|
+
cur_entry_idx_ = prev_entry_idx;
|
|
256
132
|
return;
|
|
257
133
|
}
|
|
258
134
|
|
|
@@ -266,15 +142,15 @@ void DataBlockIter::PrevImpl() {
|
|
|
266
142
|
while (GetRestartPoint(restart_index_) >= original) {
|
|
267
143
|
if (restart_index_ == 0) {
|
|
268
144
|
// No more entries
|
|
269
|
-
current_ =
|
|
145
|
+
current_ = GetKeysEndOffset();
|
|
270
146
|
restart_index_ = num_restarts_;
|
|
147
|
+
cur_entry_idx_ = prev_entry_idx;
|
|
271
148
|
return;
|
|
272
149
|
}
|
|
273
150
|
restart_index_--;
|
|
274
151
|
}
|
|
275
152
|
|
|
276
153
|
SeekToRestartPoint(restart_index_);
|
|
277
|
-
|
|
278
154
|
do {
|
|
279
155
|
bool is_shared = false;
|
|
280
156
|
if (!ParseNextDataKey(&is_shared)) {
|
|
@@ -284,19 +160,22 @@ void DataBlockIter::PrevImpl() {
|
|
|
284
160
|
|
|
285
161
|
if (raw_key_.IsKeyPinned()) {
|
|
286
162
|
// The key is not delta encoded
|
|
287
|
-
prev_entries_.emplace_back(current_,
|
|
288
|
-
current_key.
|
|
163
|
+
prev_entries_.emplace_back(current_, static_cast<uint32_t>(entry_.size()),
|
|
164
|
+
current_key.data(), 0, current_key.size(),
|
|
165
|
+
value());
|
|
289
166
|
} else {
|
|
290
167
|
// The key is delta encoded, cache decoded key in buffer
|
|
291
168
|
size_t new_key_offset = prev_entries_keys_buff_.size();
|
|
292
169
|
prev_entries_keys_buff_.append(current_key.data(), current_key.size());
|
|
293
170
|
|
|
294
|
-
prev_entries_.emplace_back(current_,
|
|
295
|
-
current_key.size(),
|
|
171
|
+
prev_entries_.emplace_back(current_, static_cast<uint32_t>(entry_.size()),
|
|
172
|
+
nullptr, new_key_offset, current_key.size(),
|
|
173
|
+
value());
|
|
296
174
|
}
|
|
297
175
|
// Loop until end of current entry hits the start of original entry
|
|
298
176
|
} while (NextEntryOffset() < original);
|
|
299
177
|
prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
|
|
178
|
+
cur_entry_idx_ = prev_entry_idx;
|
|
300
179
|
}
|
|
301
180
|
|
|
302
181
|
void DataBlockIter::SeekImpl(const Slice& target) {
|
|
@@ -307,7 +186,8 @@ void DataBlockIter::SeekImpl(const Slice& target) {
|
|
|
307
186
|
}
|
|
308
187
|
uint32_t index = 0;
|
|
309
188
|
bool skip_linear_scan = false;
|
|
310
|
-
bool ok =
|
|
189
|
+
bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
|
|
190
|
+
&skip_linear_scan);
|
|
311
191
|
|
|
312
192
|
if (!ok) {
|
|
313
193
|
return;
|
|
@@ -323,7 +203,8 @@ void MetaBlockIter::SeekImpl(const Slice& target) {
|
|
|
323
203
|
}
|
|
324
204
|
uint32_t index = 0;
|
|
325
205
|
bool skip_linear_scan = false;
|
|
326
|
-
bool ok =
|
|
206
|
+
bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
|
|
207
|
+
&skip_linear_scan);
|
|
327
208
|
|
|
328
209
|
if (!ok) {
|
|
329
210
|
return;
|
|
@@ -393,15 +274,12 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
|
|
|
393
274
|
assert(restart_index < num_restarts_);
|
|
394
275
|
SeekToRestartPoint(restart_index);
|
|
395
276
|
current_ = GetRestartPoint(restart_index);
|
|
396
|
-
cur_entry_idx_ =
|
|
397
|
-
static_cast<int32_t>(restart_index * block_restart_interval_) - 1;
|
|
398
277
|
|
|
399
|
-
uint32_t limit =
|
|
278
|
+
uint32_t limit = GetKeysEndOffset();
|
|
400
279
|
if (restart_index + 1 < num_restarts_) {
|
|
401
280
|
limit = GetRestartPoint(restart_index + 1);
|
|
402
281
|
}
|
|
403
282
|
while (current_ < limit) {
|
|
404
|
-
++cur_entry_idx_;
|
|
405
283
|
bool shared;
|
|
406
284
|
// Here we only linear seek the target key inside the restart interval.
|
|
407
285
|
// If a key does not exist inside a restart interval, we avoid
|
|
@@ -440,8 +318,8 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
|
|
|
440
318
|
return true;
|
|
441
319
|
}
|
|
442
320
|
|
|
443
|
-
if (icmp_
|
|
444
|
-
|
|
321
|
+
if (icmp_.user_comparator()->Compare(raw_key_.GetUserKey(),
|
|
322
|
+
target_user_key) != 0) {
|
|
445
323
|
// the key is not in this block and cannot be at the next block either.
|
|
446
324
|
return false;
|
|
447
325
|
}
|
|
@@ -488,16 +366,20 @@ void IndexBlockIter::SeekImpl(const Slice& target) {
|
|
|
488
366
|
// This is to let the caller distinguish between non-existing prefix,
|
|
489
367
|
// and when key is larger than the last key, which both set Valid() to
|
|
490
368
|
// false.
|
|
491
|
-
current_ =
|
|
369
|
+
current_ = GetKeysEndOffset();
|
|
492
370
|
status_ = Status::NotFound();
|
|
493
371
|
}
|
|
494
372
|
// restart interval must be one when hash search is enabled so the binary
|
|
495
373
|
// search simply lands at the right place.
|
|
496
374
|
skip_linear_scan = true;
|
|
497
|
-
} else if (value_delta_encoded_) {
|
|
498
|
-
ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
|
|
499
375
|
} else {
|
|
500
|
-
|
|
376
|
+
if (value_delta_encoded_) {
|
|
377
|
+
ok = FindRestartPointForSeek<DecodeKeyV4>(seek_key, &index,
|
|
378
|
+
&skip_linear_scan);
|
|
379
|
+
} else {
|
|
380
|
+
ok = FindRestartPointForSeek<DecodeKey>(seek_key, &index,
|
|
381
|
+
&skip_linear_scan);
|
|
382
|
+
}
|
|
501
383
|
}
|
|
502
384
|
|
|
503
385
|
if (!ok) {
|
|
@@ -506,6 +388,18 @@ void IndexBlockIter::SeekImpl(const Slice& target) {
|
|
|
506
388
|
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
|
|
507
389
|
}
|
|
508
390
|
|
|
391
|
+
template <typename DecodeKeyFunc>
|
|
392
|
+
bool IndexBlockIter::FindRestartPointForSeek(const Slice& seek_key,
|
|
393
|
+
uint32_t* index,
|
|
394
|
+
bool* skip_linear_scan) {
|
|
395
|
+
if (index_search_type_ == BlockBasedTableOptions::kBinary) {
|
|
396
|
+
return BinarySeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
|
|
397
|
+
skip_linear_scan);
|
|
398
|
+
}
|
|
399
|
+
return InterpolationSeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
|
|
400
|
+
skip_linear_scan);
|
|
401
|
+
}
|
|
402
|
+
|
|
509
403
|
void DataBlockIter::SeekForPrevImpl(const Slice& target) {
|
|
510
404
|
PERF_TIMER_GUARD(block_seek_nanos);
|
|
511
405
|
Slice seek_key = target;
|
|
@@ -514,13 +408,13 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) {
|
|
|
514
408
|
}
|
|
515
409
|
uint32_t index = 0;
|
|
516
410
|
bool skip_linear_scan = false;
|
|
517
|
-
bool ok =
|
|
411
|
+
bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
|
|
412
|
+
&skip_linear_scan);
|
|
518
413
|
|
|
519
414
|
if (!ok) {
|
|
520
415
|
return;
|
|
521
416
|
}
|
|
522
417
|
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
|
|
523
|
-
|
|
524
418
|
if (!Valid()) {
|
|
525
419
|
if (status_.ok()) {
|
|
526
420
|
SeekToLastImpl();
|
|
@@ -540,7 +434,8 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
|
|
|
540
434
|
}
|
|
541
435
|
uint32_t index = 0;
|
|
542
436
|
bool skip_linear_scan = false;
|
|
543
|
-
bool ok =
|
|
437
|
+
bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
|
|
438
|
+
&skip_linear_scan);
|
|
544
439
|
|
|
545
440
|
if (!ok) {
|
|
546
441
|
return;
|
|
@@ -565,7 +460,6 @@ void DataBlockIter::SeekToFirstImpl() {
|
|
|
565
460
|
SeekToRestartPoint(0);
|
|
566
461
|
bool is_shared = false;
|
|
567
462
|
ParseNextDataKey(&is_shared);
|
|
568
|
-
cur_entry_idx_ = 0;
|
|
569
463
|
}
|
|
570
464
|
|
|
571
465
|
void MetaBlockIter::SeekToFirstImpl() {
|
|
@@ -574,8 +468,7 @@ void MetaBlockIter::SeekToFirstImpl() {
|
|
|
574
468
|
}
|
|
575
469
|
SeekToRestartPoint(0);
|
|
576
470
|
bool is_shared = false;
|
|
577
|
-
ParseNextKey<
|
|
578
|
-
cur_entry_idx_ = 0;
|
|
471
|
+
ParseNextKey<DecodeEntry, true>(&is_shared);
|
|
579
472
|
}
|
|
580
473
|
|
|
581
474
|
void IndexBlockIter::SeekToFirstImpl() {
|
|
@@ -590,7 +483,6 @@ void IndexBlockIter::SeekToFirstImpl() {
|
|
|
590
483
|
status_ = Status::OK();
|
|
591
484
|
SeekToRestartPoint(0);
|
|
592
485
|
ParseNextIndexKey();
|
|
593
|
-
cur_entry_idx_ = 0;
|
|
594
486
|
}
|
|
595
487
|
|
|
596
488
|
void DataBlockIter::SeekToLastImpl() {
|
|
@@ -599,10 +491,10 @@ void DataBlockIter::SeekToLastImpl() {
|
|
|
599
491
|
}
|
|
600
492
|
SeekToRestartPoint(num_restarts_ - 1);
|
|
601
493
|
bool is_shared = false;
|
|
602
|
-
|
|
603
|
-
while (ParseNextDataKey(&is_shared) &&
|
|
494
|
+
|
|
495
|
+
while (ParseNextDataKey(&is_shared) &&
|
|
496
|
+
NextEntryOffset() < GetKeysEndOffset()) {
|
|
604
497
|
// Keep skipping
|
|
605
|
-
++cur_entry_idx_;
|
|
606
498
|
}
|
|
607
499
|
}
|
|
608
500
|
|
|
@@ -613,12 +505,9 @@ void MetaBlockIter::SeekToLastImpl() {
|
|
|
613
505
|
SeekToRestartPoint(num_restarts_ - 1);
|
|
614
506
|
bool is_shared = false;
|
|
615
507
|
assert(num_restarts_ >= 1);
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
|
|
619
|
-
NextEntryOffset() < restarts_) {
|
|
508
|
+
while (ParseNextKey<DecodeEntry, true>(&is_shared) &&
|
|
509
|
+
NextEntryOffset() < GetKeysEndOffset()) {
|
|
620
510
|
// Will probably never reach here since restart_interval is always 1
|
|
621
|
-
++cur_entry_idx_;
|
|
622
511
|
}
|
|
623
512
|
}
|
|
624
513
|
|
|
@@ -628,32 +517,54 @@ void IndexBlockIter::SeekToLastImpl() {
|
|
|
628
517
|
}
|
|
629
518
|
status_ = Status::OK();
|
|
630
519
|
SeekToRestartPoint(num_restarts_ - 1);
|
|
631
|
-
|
|
632
|
-
while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
|
|
633
|
-
++cur_entry_idx_;
|
|
520
|
+
while (ParseNextIndexKey() && NextEntryOffset() < GetKeysEndOffset()) {
|
|
634
521
|
}
|
|
635
522
|
}
|
|
636
523
|
|
|
637
524
|
template <class TValue>
|
|
638
|
-
template <typename DecodeEntryFunc>
|
|
525
|
+
template <typename DecodeEntryFunc, bool StrictCheck>
|
|
639
526
|
bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
|
|
640
527
|
current_ = NextEntryOffset();
|
|
528
|
+
++cur_entry_idx_;
|
|
641
529
|
const char* p = data_ + current_;
|
|
642
|
-
const char*
|
|
530
|
+
const char* key_limit = data_ + GetKeysEndOffset();
|
|
643
531
|
|
|
644
|
-
if (p >=
|
|
532
|
+
if (p >= key_limit) {
|
|
645
533
|
// No more entries to return. Mark as invalid.
|
|
646
|
-
current_ =
|
|
534
|
+
current_ = GetKeysEndOffset();
|
|
647
535
|
restart_index_ = num_restarts_;
|
|
648
536
|
return false;
|
|
649
537
|
}
|
|
538
|
+
|
|
650
539
|
// Decode next entry
|
|
651
540
|
uint32_t shared, non_shared, value_length;
|
|
652
|
-
|
|
541
|
+
uint32_t value_offset = 0;
|
|
542
|
+
|
|
543
|
+
assert(cur_entry_idx_ >= 0);
|
|
544
|
+
assert(values_section_ == nullptr || block_restart_interval_ > 0);
|
|
545
|
+
bool value_offset_encoded =
|
|
546
|
+
values_section_ && cur_entry_idx_ % block_restart_interval_ == 0;
|
|
547
|
+
|
|
548
|
+
auto p_old = p;
|
|
549
|
+
p = DecodeEntryFunc()(p, key_limit, &shared, &non_shared, &value_length,
|
|
550
|
+
value_offset_encoded ? &value_offset : nullptr);
|
|
551
|
+
|
|
653
552
|
if (p == nullptr || raw_key_.Size() < shared) {
|
|
654
553
|
CorruptionError();
|
|
655
554
|
return false;
|
|
656
555
|
} else {
|
|
556
|
+
if constexpr (StrictCheck) {
|
|
557
|
+
auto entry_length =
|
|
558
|
+
non_shared + (values_section_ == nullptr ? value_length : 0);
|
|
559
|
+
if (static_cast<uint32_t>(key_limit - p) < entry_length) {
|
|
560
|
+
CorruptionError();
|
|
561
|
+
return false;
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
assert(values_section_ == nullptr ||
|
|
566
|
+
cur_entry_idx_ % block_restart_interval_ != 0 || shared == 0);
|
|
567
|
+
entry_ = Slice(p_old, p - p_old + non_shared);
|
|
657
568
|
if (shared == 0) {
|
|
658
569
|
*is_shared = false;
|
|
659
570
|
// If this key doesn't share any bytes with prev key, and no min timestamp
|
|
@@ -673,15 +584,36 @@ bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
|
|
|
673
584
|
raw_key_.TrimAppend(shared, p, non_shared);
|
|
674
585
|
}
|
|
675
586
|
}
|
|
676
|
-
|
|
587
|
+
|
|
677
588
|
if (shared == 0) {
|
|
678
589
|
while (restart_index_ + 1 < num_restarts_ &&
|
|
679
590
|
GetRestartPoint(restart_index_ + 1) < current_) {
|
|
680
591
|
++restart_index_;
|
|
681
592
|
}
|
|
682
593
|
}
|
|
683
|
-
|
|
684
|
-
|
|
594
|
+
|
|
595
|
+
if (values_section_) {
|
|
596
|
+
if (value_offset_encoded) {
|
|
597
|
+
// Restart point, derive from offset
|
|
598
|
+
value_ = Slice(values_section_ + value_offset, value_length);
|
|
599
|
+
} else {
|
|
600
|
+
// Non-restart point, derive from previous value
|
|
601
|
+
assert(value_.data() >= values_section_);
|
|
602
|
+
value_ = Slice(value_.data() + value_.size(), value_length);
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
if constexpr (StrictCheck) {
|
|
606
|
+
if ((value_.data() + value_.size()) > data_ + restarts_) {
|
|
607
|
+
CorruptionError();
|
|
608
|
+
return false;
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
} else {
|
|
612
|
+
value_ = Slice(entry_.data() + entry_.size(), value_length);
|
|
613
|
+
// extend entry slice to contain value as well
|
|
614
|
+
entry_ = Slice(entry_.data(), entry_.size() + value_.size());
|
|
615
|
+
}
|
|
616
|
+
assert((value_.data() + value_.size()) <= data_ + restarts_);
|
|
685
617
|
return true;
|
|
686
618
|
}
|
|
687
619
|
}
|
|
@@ -741,11 +673,17 @@ bool IndexBlockIter::ParseNextIndexKey() {
|
|
|
741
673
|
void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
|
|
742
674
|
Slice v(value_.data(), data_ + restarts_ - value_.data());
|
|
743
675
|
// Delta encoding is used if `shared` != 0.
|
|
676
|
+
assert(!value_delta_encoded_ || value_.size() == 0);
|
|
744
677
|
Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
|
|
745
678
|
&v, have_first_key_,
|
|
746
679
|
(value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr);
|
|
747
680
|
assert(decode_s.ok());
|
|
748
681
|
value_ = Slice(value_.data(), v.data() - value_.data());
|
|
682
|
+
if (!values_section_ && value_delta_encoded_) {
|
|
683
|
+
assert(entry_.data() + entry_.size() == value_.data());
|
|
684
|
+
// values are inlined in the entry, so need to set next offset accordingly
|
|
685
|
+
entry_ = Slice(entry_.data(), entry_.size() + value_.size());
|
|
686
|
+
}
|
|
749
687
|
|
|
750
688
|
if (global_seqno_state_ != nullptr) {
|
|
751
689
|
// Overwrite sequence number the same way as in DataBlockIter.
|
|
@@ -783,8 +721,8 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
|
|
|
783
721
|
// to follow it up with NextImpl() to position the iterator at the restart
|
|
784
722
|
// key.
|
|
785
723
|
SeekToRestartPoint(index);
|
|
786
|
-
cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
|
|
787
724
|
NextImpl();
|
|
725
|
+
assert(cur_entry_idx_ >= 0);
|
|
788
726
|
|
|
789
727
|
if (!skip_linear_scan) {
|
|
790
728
|
// Linear search (within restart block) for first key >= target
|
|
@@ -816,9 +754,28 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
|
|
|
816
754
|
}
|
|
817
755
|
}
|
|
818
756
|
|
|
819
|
-
//
|
|
820
|
-
|
|
821
|
-
|
|
757
|
+
// Get the key slice at a given restart point index.
|
|
758
|
+
template <class TValue>
|
|
759
|
+
template <typename DecodeKeyFunc>
|
|
760
|
+
bool BlockIter<TValue>::GetRestartKey(uint32_t index, Slice* key) {
|
|
761
|
+
uint32_t region_offset = GetRestartPoint(index);
|
|
762
|
+
uint32_t shared, non_shared, value_offset;
|
|
763
|
+
const char* key_ptr =
|
|
764
|
+
DecodeKeyFunc()(data_ + region_offset, data_ + restarts_, &shared,
|
|
765
|
+
&non_shared, values_section_ ? &value_offset : nullptr);
|
|
766
|
+
if (key_ptr == nullptr || (shared != 0)) {
|
|
767
|
+
CorruptionError();
|
|
768
|
+
return false;
|
|
769
|
+
}
|
|
770
|
+
*key = Slice(key_ptr, non_shared);
|
|
771
|
+
return true;
|
|
772
|
+
}
|
|
773
|
+
|
|
774
|
+
// Searches in restart array using binary search to find the starting restart
|
|
775
|
+
// point for the linear scan, and stores it in `*index`. Assumes restart array
|
|
776
|
+
// does not contain duplicate keys.
|
|
777
|
+
//
|
|
778
|
+
// It is guaranteed that the restart key at `*index + 1`
|
|
822
779
|
// is strictly greater than `target` or does not exist (this can be used to
|
|
823
780
|
// elide a comparison when linear scan reaches all the way to the next restart
|
|
824
781
|
// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
|
|
@@ -826,15 +783,15 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
|
|
|
826
783
|
// compared again later.
|
|
827
784
|
template <class TValue>
|
|
828
785
|
template <typename DecodeKeyFunc>
|
|
829
|
-
bool BlockIter<TValue>::
|
|
830
|
-
|
|
786
|
+
bool BlockIter<TValue>::BinarySeekRestartPointIndex(const Slice& target,
|
|
787
|
+
uint32_t* index,
|
|
788
|
+
bool* skip_linear_scan) {
|
|
831
789
|
if (restarts_ == 0) {
|
|
832
790
|
// SST files dedicated to range tombstones are written with index blocks
|
|
833
791
|
// that have no keys while also having `num_restarts_ == 1`. This would
|
|
834
|
-
// cause a problem
|
|
835
|
-
//
|
|
836
|
-
//
|
|
837
|
-
// key accesses.
|
|
792
|
+
// cause a problem as we'd try to access the first key which does not exist.
|
|
793
|
+
// We identify such blocks by the offset at which their restarts are stored,
|
|
794
|
+
// and return false to prevent any attempted key accesses.
|
|
838
795
|
return false;
|
|
839
796
|
}
|
|
840
797
|
|
|
@@ -842,23 +799,24 @@ bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
|
|
|
842
799
|
// Loop invariants:
|
|
843
800
|
// - Restart key at index `left` is less than or equal to the target key. The
|
|
844
801
|
// sentinel index `-1` is considered to have a key that is less than all
|
|
845
|
-
// keys.
|
|
802
|
+
// keys. Doing this allows us to avoid a bounds check on left.
|
|
846
803
|
// - Any restart keys after index `right` are strictly greater than the target
|
|
847
804
|
// key.
|
|
848
|
-
int64_t left = -1
|
|
805
|
+
int64_t left = -1;
|
|
806
|
+
int64_t right = num_restarts_ - 1;
|
|
807
|
+
|
|
849
808
|
while (left != right) {
|
|
850
809
|
// The `mid` is computed by rounding up so it lands in (`left`, `right`].
|
|
851
810
|
int64_t mid = left + (right - left + 1) / 2;
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
if (key_ptr == nullptr || (shared != 0)) {
|
|
857
|
-
CorruptionError();
|
|
811
|
+
assert(left < mid && mid <= right);
|
|
812
|
+
|
|
813
|
+
Slice mid_key;
|
|
814
|
+
if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
|
|
858
815
|
return false;
|
|
859
816
|
}
|
|
860
|
-
|
|
817
|
+
|
|
861
818
|
UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
|
|
819
|
+
|
|
862
820
|
int cmp = CompareCurrentKey(target);
|
|
863
821
|
if (cmp < 0) {
|
|
864
822
|
// Key at "mid" is smaller than "target". Therefore all
|
|
@@ -885,22 +843,317 @@ bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
|
|
|
885
843
|
return true;
|
|
886
844
|
}
|
|
887
845
|
|
|
846
|
+
// Similar effects to BinarySeekRestartPointIndex, except it uses a different
|
|
847
|
+
// algorithm to search for the restart point index (i.e. interpolation search).
|
|
848
|
+
// Interpolation search is typically more efficient for uniformly distributed
|
|
849
|
+
// datasets.
|
|
850
|
+
//
|
|
851
|
+
// Typically, interpolation search requires an integer "value". But because we
|
|
852
|
+
// are searching through variable length binary slices, we must estimate an
|
|
853
|
+
// integer value for each key. Currently, the value is set to be the first 8
|
|
854
|
+
// bytes (read big-endian) that do not share a prefix with the start and end
|
|
855
|
+
// key. As a side effect, this can really only be used with the
|
|
856
|
+
// BytewiseComparator().
|
|
857
|
+
template <class TValue>
|
|
858
|
+
template <typename DecodeKeyFunc>
|
|
859
|
+
bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
|
|
860
|
+
const Slice& target, uint32_t* index, bool* skip_linear_scan) {
|
|
861
|
+
static constexpr int64_t kGuardLen = 8;
|
|
862
|
+
static constexpr uint64_t kMaxPoorSearches = 8;
|
|
863
|
+
|
|
864
|
+
if (restarts_ == 0) {
|
|
865
|
+
return false;
|
|
866
|
+
}
|
|
867
|
+
|
|
868
|
+
*skip_linear_scan = false;
|
|
869
|
+
// Currently it is assumed that comparator is always bytewise comparator, but
|
|
870
|
+
// it may also be useful to generalize to reverse bytewise in the future.
|
|
871
|
+
assert(icmp_.user_comparator() == BytewiseComparator());
|
|
872
|
+
|
|
873
|
+
int64_t left = -1;
|
|
874
|
+
int64_t right = num_restarts_ - 1;
|
|
875
|
+
size_t shared_user_prefix_len = 0;
|
|
876
|
+
|
|
877
|
+
Slice left_key;
|
|
878
|
+
Slice right_key;
|
|
879
|
+
Slice left_key_suffix;
|
|
880
|
+
Slice right_key_suffix;
|
|
881
|
+
Slice target_suffix = target;
|
|
882
|
+
bool seek_failed = false;
|
|
883
|
+
bool first_iter = true;
|
|
884
|
+
uint64_t left_val = 0;
|
|
885
|
+
uint64_t right_val = 0;
|
|
886
|
+
uint64_t target_val = 0;
|
|
887
|
+
|
|
888
|
+
// A poor search is when less than half the search space is reduced, because
|
|
889
|
+
// binary search would do better. When there are kMaxPoorSearches in a row,
|
|
890
|
+
// then fall back to binary search. This helps bound worst-case performance.
|
|
891
|
+
uint64_t continuous_poor_searches = 0;
|
|
892
|
+
|
|
893
|
+
// Loop invariants while not first iteration AND seek has not failed:
|
|
894
|
+
// - arr[usable_left] = left_key, arr[right] = right_key
|
|
895
|
+
// - left < mid <= right, and arr[left] < target < arr[right + 1]
|
|
896
|
+
//
|
|
897
|
+
// The first iteration is used as an early optimization to determine initial
|
|
898
|
+
// bounds, and whether target is within those bounds.
|
|
899
|
+
const bool is_user_key = raw_key_.IsUserKey();
|
|
900
|
+
const Slice target_user_key = is_user_key ? target : ExtractUserKey(target);
|
|
901
|
+
while (left != right) {
|
|
902
|
+
int64_t mid = 0;
|
|
903
|
+
|
|
904
|
+
// If either the search window is small or we've had numerous bad guesses, then
|
|
905
|
+
// fall back to binary search
|
|
906
|
+
seek_failed = (right - left <= kGuardLen) ||
|
|
907
|
+
continuous_poor_searches >= kMaxPoorSearches;
|
|
908
|
+
|
|
909
|
+
if (!seek_failed) {
|
|
910
|
+
// Interpolation seek reads left and right boundaries anyways, so we can
|
|
911
|
+
// set left = 0. The invariant that left <= target is still held because
|
|
912
|
+
// we early exit if left > target for the first iteration.
|
|
913
|
+
const uint32_t usable_left =
|
|
914
|
+
static_cast<uint32_t>(std::max<int64_t>(left, 0));
|
|
915
|
+
|
|
916
|
+
// First iteration: decode both boundary keys and compute shared prefix.
|
|
917
|
+
if (first_iter) {
|
|
918
|
+
if (!GetRestartKey<DecodeKeyFunc>(usable_left, &left_key)) {
|
|
919
|
+
return false;
|
|
920
|
+
}
|
|
921
|
+
|
|
922
|
+
if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
|
|
923
|
+
&right_key)) {
|
|
924
|
+
return false;
|
|
925
|
+
}
|
|
926
|
+
|
|
927
|
+
// Compute the shared prefix length between the user key portions of
|
|
928
|
+
// the boundary keys. This is used to "normalize" the values calculated
|
|
929
|
+
// during interpolation search.
|
|
930
|
+
shared_user_prefix_len = left_key.difference_offset(right_key);
|
|
931
|
+
if (!is_user_key) {
|
|
932
|
+
// Ensure shared_user_prefix_len is only limited to user key. Suppose
|
|
933
|
+
// that the shared prefix of both keys is extended into the internal
|
|
934
|
+
// footer. If they are not the same user keys, then it is guaranteed
|
|
935
|
+
// left is the shorter one due to bytewise comparator. For reverse
|
|
936
|
+
// bytewise, this would be flipped.
|
|
937
|
+
shared_user_prefix_len = std::min<size_t>(
|
|
938
|
+
shared_user_prefix_len, left_key.size() - kNumInternalBytes);
|
|
939
|
+
assert(shared_user_prefix_len <=
|
|
940
|
+
right_key.size() - kNumInternalBytes);
|
|
941
|
+
}
|
|
942
|
+
|
|
943
|
+
left_val =
|
|
944
|
+
ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
|
|
945
|
+
right_val =
|
|
946
|
+
ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
|
|
947
|
+
target_val =
|
|
948
|
+
ReadBe64FromKey(target, is_user_key, shared_user_prefix_len);
|
|
949
|
+
}
|
|
950
|
+
|
|
951
|
+
assert(shared_user_prefix_len <= left_key.size() &&
|
|
952
|
+
shared_user_prefix_len <= right_key.size());
|
|
953
|
+
|
|
954
|
+
if (first_iter && shared_user_prefix_len > 0) {
|
|
955
|
+
// It is not guaranteed that the shared_prefix of the left and right
|
|
956
|
+
// boundaries is a valid prefix of the target. If it is not, then we can
|
|
957
|
+
// early exit.
|
|
958
|
+
size_t cmp_len =
|
|
959
|
+
std::min(target_user_key.size(), shared_user_prefix_len);
|
|
960
|
+
int cmp = memcmp(target_user_key.data(), left_key.data(), cmp_len);
|
|
961
|
+
if (cmp < 0 || (cmp == 0 && cmp_len < shared_user_prefix_len)) {
|
|
962
|
+
#ifndef NDEBUG
|
|
963
|
+
IterKey tmp_key;
|
|
964
|
+
tmp_key.SetIsUserKey(is_user_key);
|
|
965
|
+
UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
|
|
966
|
+
assert(CompareKey(tmp_key, target) >= 0);
|
|
967
|
+
#endif
|
|
968
|
+
// if target size is less than shared_prefix length, and cmp == 0,
|
|
969
|
+
// then it is guaranteed <= left
|
|
970
|
+
*skip_linear_scan = true;
|
|
971
|
+
*index = usable_left;
|
|
972
|
+
return true;
|
|
973
|
+
} else if (cmp > 0) {
|
|
974
|
+
#ifndef NDEBUG
|
|
975
|
+
IterKey tmp_key;
|
|
976
|
+
tmp_key.SetIsUserKey(is_user_key);
|
|
977
|
+
UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
|
|
978
|
+
assert(CompareKey(tmp_key, target) < 0);
|
|
979
|
+
#endif
|
|
980
|
+
*index = static_cast<uint32_t>(right);
|
|
981
|
+
return true;
|
|
982
|
+
}
|
|
983
|
+
}
|
|
984
|
+
|
|
985
|
+
assert(shared_user_prefix_len <= target_user_key.size());
|
|
986
|
+
assert(memcmp(left_key.data(), target_user_key.data(),
|
|
987
|
+
shared_user_prefix_len) == 0);
|
|
988
|
+
assert(memcmp(right_key.data(), target_user_key.data(),
|
|
989
|
+
shared_user_prefix_len) == 0);
|
|
990
|
+
|
|
991
|
+
if (first_iter) {
|
|
992
|
+
left_key_suffix = Slice(left_key.data() + shared_user_prefix_len,
|
|
993
|
+
left_key.size() - shared_user_prefix_len);
|
|
994
|
+
right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
|
|
995
|
+
right_key.size() - shared_user_prefix_len);
|
|
996
|
+
target_suffix = Slice(target.data() + shared_user_prefix_len,
|
|
997
|
+
target.size() - shared_user_prefix_len);
|
|
998
|
+
}
|
|
999
|
+
|
|
1000
|
+
if (left_val > right_val) {
|
|
1001
|
+
CorruptionError("left key is greater than right key");
|
|
1002
|
+
return false;
|
|
1003
|
+
}
|
|
1004
|
+
|
|
1005
|
+
bool lte_left = false;
|
|
1006
|
+
bool gt_right = false;
|
|
1007
|
+
|
|
1008
|
+
if (target_val < left_val) {
|
|
1009
|
+
assert(first_iter);
|
|
1010
|
+
assert(CompareKey(left_key_suffix, target_suffix) > 0);
|
|
1011
|
+
lte_left = true;
|
|
1012
|
+
} else if (target_val == left_val) {
|
|
1013
|
+
// target_val == left_val doesn't imply target == left_key
|
|
1014
|
+
// because ReadBe64FromKey only reads 8 bytes and skips sequence
|
|
1015
|
+
// numbers. We need to check actual key order.
|
|
1016
|
+
if (CompareKey(left_key_suffix, target_suffix) >= 0) {
|
|
1017
|
+
assert(first_iter);
|
|
1018
|
+
lte_left = true;
|
|
1019
|
+
}
|
|
1020
|
+
}
|
|
1021
|
+
|
|
1022
|
+
if (!lte_left && !seek_failed) {
|
|
1023
|
+
if (target_val > right_val) {
|
|
1024
|
+
// note that we only ever guarantee arr[target] < arr[right + 1], so
|
|
1025
|
+
// it is possible to end up here even on non-first iteration
|
|
1026
|
+
assert(CompareKey(right_key_suffix, target_suffix) < 0);
|
|
1027
|
+
gt_right = true;
|
|
1028
|
+
} else if (right_val == left_val) {
|
|
1029
|
+
// cannot divide by 0
|
|
1030
|
+
seek_failed = true;
|
|
1031
|
+
}
|
|
1032
|
+
}
|
|
1033
|
+
|
|
1034
|
+
// early exit if key is not within bounds
|
|
1035
|
+
if (lte_left) {
|
|
1036
|
+
#ifndef NDEBUG
|
|
1037
|
+
assert(!seek_failed);
|
|
1038
|
+
IterKey tmp_key;
|
|
1039
|
+
tmp_key.SetIsUserKey(is_user_key);
|
|
1040
|
+
UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
|
|
1041
|
+
assert(CompareKey(tmp_key, target) >= 0);
|
|
1042
|
+
#endif
|
|
1043
|
+
*skip_linear_scan = true;
|
|
1044
|
+
*index = usable_left;
|
|
1045
|
+
return true;
|
|
1046
|
+
}
|
|
1047
|
+
if (gt_right) {
|
|
1048
|
+
#ifndef NDEBUG
|
|
1049
|
+
assert(!seek_failed);
|
|
1050
|
+
IterKey tmp_key;
|
|
1051
|
+
tmp_key.SetIsUserKey(is_user_key);
|
|
1052
|
+
UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
|
|
1053
|
+
assert(CompareKey(tmp_key, target) < 0);
|
|
1054
|
+
#endif
|
|
1055
|
+
*index = static_cast<uint32_t>(right);
|
|
1056
|
+
return true;
|
|
1057
|
+
}
|
|
1058
|
+
|
|
1059
|
+
if (!seek_failed) {
|
|
1060
|
+
#ifdef HAVE_UINT128_EXTENSION
|
|
1061
|
+
__uint128_t range = right - usable_left;
|
|
1062
|
+
__uint128_t target_delta = target_val - left_val;
|
|
1063
|
+
uint64_t range_delta = right_val - left_val;
|
|
1064
|
+
int64_t offset =
|
|
1065
|
+
static_cast<int64_t>(range * target_delta / range_delta);
|
|
1066
|
+
#else
|
|
1067
|
+
double ratio = static_cast<double>(target_val - left_val) /
|
|
1068
|
+
static_cast<double>(right_val - left_val);
|
|
1069
|
+
assert(0 <= ratio && ratio <= 1);
|
|
1070
|
+
int64_t range = right - usable_left;
|
|
1071
|
+
int64_t offset = static_cast<int64_t>(range * ratio);
|
|
1072
|
+
#endif
|
|
1073
|
+
left = usable_left; // can reduce search space by 1
|
|
1074
|
+
mid = usable_left + offset;
|
|
1075
|
+
assert(mid <= right);
|
|
1076
|
+
if (mid == usable_left) {
|
|
1077
|
+
// this is to guarantee progress and avoid infinite loop
|
|
1078
|
+
++mid;
|
|
1079
|
+
}
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
if (seek_failed) {
|
|
1084
|
+
// Fallback to binary seek
|
|
1085
|
+
mid = left + (right - left + 1) / 2;
|
|
1086
|
+
}
|
|
1087
|
+
|
|
1088
|
+
assert(left < mid && mid <= right);
|
|
1089
|
+
|
|
1090
|
+
Slice mid_key;
|
|
1091
|
+
if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
|
|
1092
|
+
return false;
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1095
|
+
Slice mid_key_suffix(mid_key.data() + shared_user_prefix_len,
|
|
1096
|
+
mid_key.size() - shared_user_prefix_len);
|
|
1097
|
+
|
|
1098
|
+
UpdateRawKeyAndMaybePadMinTimestamp(mid_key_suffix);
|
|
1099
|
+
int cmp = CompareCurrentKey(target_suffix);
|
|
1100
|
+
|
|
1101
|
+
int64_t previous_search_space = right - left;
|
|
1102
|
+
if (cmp < 0) {
|
|
1103
|
+
left = mid;
|
|
1104
|
+
left_key = mid_key;
|
|
1105
|
+
left_key_suffix = mid_key_suffix;
|
|
1106
|
+
left_val = ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
|
|
1107
|
+
} else if (cmp > 0) {
|
|
1108
|
+
right = mid - 1;
|
|
1109
|
+
if (!seek_failed && left != right) {
|
|
1110
|
+
if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
|
|
1111
|
+
&right_key)) {
|
|
1112
|
+
return false;
|
|
1113
|
+
}
|
|
1114
|
+
right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
|
|
1115
|
+
right_key.size() - shared_user_prefix_len);
|
|
1116
|
+
right_val =
|
|
1117
|
+
ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
|
|
1118
|
+
}
|
|
1119
|
+
} else {
|
|
1120
|
+
*skip_linear_scan = true;
|
|
1121
|
+
left = right = mid;
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
// If seach space is not reduced by at least half, good chance this data is
|
|
1125
|
+
// not uniform.
|
|
1126
|
+
int64_t new_search_space = right - left;
|
|
1127
|
+
if (new_search_space > previous_search_space / 2) {
|
|
1128
|
+
++continuous_poor_searches;
|
|
1129
|
+
} else {
|
|
1130
|
+
continuous_poor_searches = 0;
|
|
1131
|
+
}
|
|
1132
|
+
|
|
1133
|
+
first_iter = false;
|
|
1134
|
+
}
|
|
1135
|
+
|
|
1136
|
+
if (left == -1) {
|
|
1137
|
+
// All keys in the block were strictly greater than `target`. So the very
|
|
1138
|
+
// first key in the block is the final seek result.
|
|
1139
|
+
*skip_linear_scan = true;
|
|
1140
|
+
*index = 0;
|
|
1141
|
+
} else {
|
|
1142
|
+
*index = static_cast<uint32_t>(left);
|
|
1143
|
+
}
|
|
1144
|
+
return true;
|
|
1145
|
+
}
|
|
1146
|
+
|
|
888
1147
|
// Compare target key and the block key of the block of `block_index`.
|
|
889
1148
|
// Return -1 if error.
|
|
890
1149
|
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
&non_shared)
|
|
897
|
-
: DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
|
|
898
|
-
&non_shared);
|
|
899
|
-
if (key_ptr == nullptr || (shared != 0)) {
|
|
900
|
-
CorruptionError();
|
|
1150
|
+
Slice block_key;
|
|
1151
|
+
bool ok = value_delta_encoded_
|
|
1152
|
+
? GetRestartKey<DecodeKeyV4>(block_index, &block_key)
|
|
1153
|
+
: GetRestartKey<DecodeKey>(block_index, &block_key);
|
|
1154
|
+
if (!ok) {
|
|
901
1155
|
return 1; // Return target is smaller
|
|
902
1156
|
}
|
|
903
|
-
Slice block_key(key_ptr, non_shared);
|
|
904
1157
|
UpdateRawKeyAndMaybePadMinTimestamp(block_key);
|
|
905
1158
|
return CompareCurrentKey(target);
|
|
906
1159
|
}
|
|
@@ -949,7 +1202,7 @@ bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
|
|
|
949
1202
|
if (block_ids[left] > 0 &&
|
|
950
1203
|
(left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
|
|
951
1204
|
CompareBlockKey(block_ids[left] - 1, target) > 0) {
|
|
952
|
-
current_ =
|
|
1205
|
+
current_ = GetKeysEndOffset();
|
|
953
1206
|
*prefix_may_exist = false;
|
|
954
1207
|
return false;
|
|
955
1208
|
}
|
|
@@ -986,7 +1239,7 @@ bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
|
|
|
986
1239
|
}
|
|
987
1240
|
|
|
988
1241
|
// Mark iterator invalid
|
|
989
|
-
current_ =
|
|
1242
|
+
current_ = GetKeysEndOffset();
|
|
990
1243
|
return false;
|
|
991
1244
|
}
|
|
992
1245
|
}
|
|
@@ -1005,7 +1258,7 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
|
|
|
1005
1258
|
uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
|
|
1006
1259
|
|
|
1007
1260
|
if (num_blocks == 0) {
|
|
1008
|
-
current_ =
|
|
1261
|
+
current_ = GetKeysEndOffset();
|
|
1009
1262
|
*prefix_may_exist = false;
|
|
1010
1263
|
return false;
|
|
1011
1264
|
} else {
|
|
@@ -1015,39 +1268,12 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
|
|
|
1015
1268
|
}
|
|
1016
1269
|
}
|
|
1017
1270
|
|
|
1018
|
-
uint32_t Block::NumRestarts() const {
|
|
1019
|
-
assert(size() >= 2 * sizeof(uint32_t));
|
|
1020
|
-
uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
|
|
1021
|
-
uint32_t num_restarts = block_footer;
|
|
1022
|
-
if (size() > kMaxBlockSizeSupportedByHashIndex) {
|
|
1023
|
-
// In BlockBuilder, we have ensured a block with HashIndex is less than
|
|
1024
|
-
// kMaxBlockSizeSupportedByHashIndex (64KiB).
|
|
1025
|
-
//
|
|
1026
|
-
// Therefore, if we encounter a block with a size > 64KiB, the block
|
|
1027
|
-
// cannot have HashIndex. So the footer will directly interpreted as
|
|
1028
|
-
// num_restarts.
|
|
1029
|
-
//
|
|
1030
|
-
// Such check is for backward compatibility. We can ensure legacy block
|
|
1031
|
-
// with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
|
|
1032
|
-
// correctly as no HashIndex even if the MSB of num_restarts is set.
|
|
1033
|
-
return num_restarts;
|
|
1034
|
-
}
|
|
1035
|
-
BlockBasedTableOptions::DataBlockIndexType index_type;
|
|
1036
|
-
UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
|
|
1037
|
-
return num_restarts;
|
|
1038
|
-
}
|
|
1039
|
-
|
|
1040
1271
|
BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
|
|
1041
|
-
assert(size() >=
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
|
|
1047
|
-
uint32_t num_restarts = block_footer;
|
|
1048
|
-
BlockBasedTableOptions::DataBlockIndexType index_type;
|
|
1049
|
-
UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
|
|
1050
|
-
return index_type;
|
|
1272
|
+
assert(size() >= DataBlockFooter::kMinEncodedLength);
|
|
1273
|
+
Slice input(data(), size());
|
|
1274
|
+
DataBlockFooter footer;
|
|
1275
|
+
footer.DecodeFrom(&input).PermitUncheckedError();
|
|
1276
|
+
return footer.index_type;
|
|
1051
1277
|
}
|
|
1052
1278
|
|
|
1053
1279
|
Block::~Block() {
|
|
@@ -1057,51 +1283,83 @@ Block::~Block() {
|
|
|
1057
1283
|
delete[] kv_checksum_;
|
|
1058
1284
|
}
|
|
1059
1285
|
|
|
1286
|
+
Status Block::GetCorruptionStatus() const {
|
|
1287
|
+
// Re-process the footer to get a detailed error status.
|
|
1288
|
+
// This should only be called when size() == 0 (error marker).
|
|
1289
|
+
assert(size() == 0);
|
|
1290
|
+
// When size() == 0 and restart_offset_ != 0, restart_offset_ stores the
|
|
1291
|
+
// original data size for re-decoding the footer to get detailed error.
|
|
1292
|
+
if (restart_offset_ == 0) {
|
|
1293
|
+
return Status::Corruption("bad block contents");
|
|
1294
|
+
}
|
|
1295
|
+
Slice input(contents_.data.data(), restart_offset_);
|
|
1296
|
+
DataBlockFooter footer;
|
|
1297
|
+
Status s = footer.DecodeFrom(&input);
|
|
1298
|
+
if (!s.ok()) {
|
|
1299
|
+
return s; // Return the detailed error from DecodeFrom
|
|
1300
|
+
}
|
|
1301
|
+
// Footer decoded OK, so error was in later processing (shouldn't happen)
|
|
1302
|
+
DEBUG_FAIL("ok status on presumed bad block contents");
|
|
1303
|
+
return Status::Corruption("presumed bad block contents");
|
|
1304
|
+
}
|
|
1305
|
+
|
|
1060
1306
|
Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
|
|
1061
|
-
Statistics* statistics)
|
|
1062
|
-
: contents_(std::move(contents)),
|
|
1307
|
+
Statistics* statistics, uint32_t restart_interval)
|
|
1308
|
+
: contents_(std::move(contents)),
|
|
1309
|
+
restart_offset_(0),
|
|
1310
|
+
num_restarts_(0),
|
|
1311
|
+
block_restart_interval_(restart_interval) {
|
|
1063
1312
|
TEST_SYNC_POINT("Block::Block:0");
|
|
1064
1313
|
auto& size = contents_.data.size_;
|
|
1065
|
-
|
|
1314
|
+
// `contents` is assumed to be uncompressed in the proper format
|
|
1315
|
+
Slice input(contents_.data.data(), size);
|
|
1316
|
+
DataBlockFooter footer;
|
|
1317
|
+
Status s = footer.DecodeFrom(&input);
|
|
1318
|
+
if (!s.ok()) {
|
|
1319
|
+
// Save original size for GetCorruptionStatus() to re-decode footer
|
|
1320
|
+
restart_offset_ = static_cast<uint32_t>(size);
|
|
1066
1321
|
size = 0; // Error marker
|
|
1067
1322
|
} else {
|
|
1068
|
-
//
|
|
1069
|
-
|
|
1070
|
-
|
|
1323
|
+
// After DecodeFrom, input has the footer (and values_section_offset if
|
|
1324
|
+
// separated_kv) removed. Each case below may strip additional suffix
|
|
1325
|
+
// (e.g., hash index) so that input ends with just the restart array.
|
|
1326
|
+
num_restarts_ = footer.num_restarts;
|
|
1327
|
+
is_uniform_ = footer.is_uniform;
|
|
1328
|
+
switch (footer.index_type) {
|
|
1071
1329
|
case BlockBasedTableOptions::kDataBlockBinarySearch:
|
|
1072
|
-
restart_offset_ = static_cast<uint32_t>(size) -
|
|
1073
|
-
(1 + num_restarts_) * sizeof(uint32_t);
|
|
1074
|
-
if (restart_offset_ > size - sizeof(uint32_t)) {
|
|
1075
|
-
// The size is too small for NumRestarts() and therefore
|
|
1076
|
-
// restart_offset_ wrapped around.
|
|
1077
|
-
size = 0;
|
|
1078
|
-
}
|
|
1079
1330
|
break;
|
|
1080
1331
|
case BlockBasedTableOptions::kDataBlockBinaryAndHash:
|
|
1081
|
-
if (size < sizeof(
|
|
1082
|
-
sizeof(uint16_t) /* NUM_BUCK */) {
|
|
1332
|
+
if (input.size() < sizeof(uint16_t) /* NUM_BUCK */) {
|
|
1083
1333
|
size = 0;
|
|
1084
1334
|
break;
|
|
1085
1335
|
}
|
|
1086
|
-
|
|
1087
1336
|
uint16_t map_offset;
|
|
1088
|
-
data_block_hash_index_.Initialize(
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
|
|
1094
|
-
|
|
1095
|
-
if (restart_offset_ > map_offset) {
|
|
1096
|
-
// map_offset is too small for NumRestarts() and
|
|
1097
|
-
// therefore restart_offset_ wrapped around.
|
|
1098
|
-
size = 0;
|
|
1099
|
-
break;
|
|
1100
|
-
}
|
|
1337
|
+
data_block_hash_index_.Initialize(contents_.data.data(),
|
|
1338
|
+
static_cast<uint16_t>(input.size()),
|
|
1339
|
+
&map_offset);
|
|
1340
|
+
// Strip the hash index, leaving just data + restarts
|
|
1341
|
+
input.remove_suffix(input.size() - map_offset);
|
|
1101
1342
|
break;
|
|
1102
1343
|
default:
|
|
1103
1344
|
size = 0; // Error marker
|
|
1104
1345
|
}
|
|
1346
|
+
// After the switch, input should end with restarts[num_restarts_]
|
|
1347
|
+
if (size != 0) {
|
|
1348
|
+
if (input.size() < num_restarts_ * sizeof(uint32_t)) {
|
|
1349
|
+
size = 0; // Block too small for the declared number of restarts
|
|
1350
|
+
} else {
|
|
1351
|
+
restart_offset_ = static_cast<uint32_t>(input.size()) -
|
|
1352
|
+
num_restarts_ * sizeof(uint32_t);
|
|
1353
|
+
}
|
|
1354
|
+
}
|
|
1355
|
+
// Set up values_section_ from footer if separated KV storage is used
|
|
1356
|
+
if (size != 0 && footer.separated_kv) {
|
|
1357
|
+
if (footer.values_section_offset > restart_offset_) {
|
|
1358
|
+
size = 0; // Error marker
|
|
1359
|
+
} else {
|
|
1360
|
+
values_section_ = data() + footer.values_section_offset;
|
|
1361
|
+
}
|
|
1362
|
+
}
|
|
1105
1363
|
}
|
|
1106
1364
|
if (read_amp_bytes_per_bit != 0 && statistics && size != 0) {
|
|
1107
1365
|
read_amp_bitmap_.reset(new BlockReadAmpBitmap(
|
|
@@ -1125,7 +1383,10 @@ void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
|
|
|
1125
1383
|
nullptr /* stats */, true /* block_contents_pinned */,
|
|
1126
1384
|
true /* user_defined_timestamps_persisted */)};
|
|
1127
1385
|
if (iter->status().ok()) {
|
|
1128
|
-
|
|
1386
|
+
// Only calculate restart interval if not already set via table properties
|
|
1387
|
+
if (block_restart_interval_ == 0) {
|
|
1388
|
+
block_restart_interval_ = iter->GetRestartInterval();
|
|
1389
|
+
}
|
|
1129
1390
|
}
|
|
1130
1391
|
uint32_t num_keys = 0;
|
|
1131
1392
|
if (iter->status().ok()) {
|
|
@@ -1158,12 +1419,12 @@ void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
|
|
|
1158
1419
|
bool index_has_first_key) {
|
|
1159
1420
|
protection_bytes_per_key_ = 0;
|
|
1160
1421
|
if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
|
|
1161
|
-
// Note that `global_seqno` and `key_includes_seq` are hardcoded here.
|
|
1162
|
-
// do not impact how the index block is parsed. During checksum
|
|
1422
|
+
// Note that `global_seqno` and `key_includes_seq` are hardcoded here.
|
|
1423
|
+
// They do not impact how the index block is parsed. During checksum
|
|
1163
1424
|
// construction/verification, we use the entire key buffer from
|
|
1164
|
-
// raw_key_.GetKey() returned by iter->key() as the `key` part of
|
|
1165
|
-
// checksum, and the content of this buffer do not change for
|
|
1166
|
-
// values of `global_seqno` or `key_includes_seq`.
|
|
1425
|
+
// raw_key_.GetKey() returned by iter->key() as the `key` part of
|
|
1426
|
+
// key-value checksum, and the content of this buffer do not change for
|
|
1427
|
+
// different values of `global_seqno` or `key_includes_seq`.
|
|
1167
1428
|
// TODO(yuzhangyu): handle the implication of padding timestamp for kv
|
|
1168
1429
|
// protection.
|
|
1169
1430
|
std::unique_ptr<IndexBlockIter> iter{NewIndexIterator(
|
|
@@ -1174,7 +1435,10 @@ void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
|
|
|
1174
1435
|
true /* user_defined_timestamps_persisted*/,
|
|
1175
1436
|
nullptr /* prefix_index */)};
|
|
1176
1437
|
if (iter->status().ok()) {
|
|
1177
|
-
|
|
1438
|
+
// Only calculate restart interval if not already set via table properties
|
|
1439
|
+
if (block_restart_interval_ == 0) {
|
|
1440
|
+
block_restart_interval_ = iter->GetRestartInterval();
|
|
1441
|
+
}
|
|
1178
1442
|
}
|
|
1179
1443
|
uint32_t num_keys = 0;
|
|
1180
1444
|
if (iter->status().ok()) {
|
|
@@ -1238,7 +1502,7 @@ void Block::InitializeMetaIndexBlockProtectionInfo(
|
|
|
1238
1502
|
MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
|
|
1239
1503
|
MetaBlockIter* iter = new MetaBlockIter();
|
|
1240
1504
|
if (size() < 2 * sizeof(uint32_t)) {
|
|
1241
|
-
iter->Invalidate(
|
|
1505
|
+
iter->Invalidate(GetCorruptionStatus());
|
|
1242
1506
|
return iter;
|
|
1243
1507
|
} else if (num_restarts_ == 0) {
|
|
1244
1508
|
// Empty block.
|
|
@@ -1246,7 +1510,7 @@ MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
|
|
|
1246
1510
|
} else {
|
|
1247
1511
|
iter->Initialize(data(), restart_offset_, num_restarts_,
|
|
1248
1512
|
block_contents_pinned, protection_bytes_per_key_,
|
|
1249
|
-
kv_checksum_, block_restart_interval_);
|
|
1513
|
+
kv_checksum_, block_restart_interval_, values_section_);
|
|
1250
1514
|
}
|
|
1251
1515
|
return iter;
|
|
1252
1516
|
}
|
|
@@ -1263,7 +1527,7 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
|
|
|
1263
1527
|
ret_iter = new DataBlockIter;
|
|
1264
1528
|
}
|
|
1265
1529
|
if (size() < 2 * sizeof(uint32_t)) {
|
|
1266
|
-
ret_iter->Invalidate(
|
|
1530
|
+
ret_iter->Invalidate(GetCorruptionStatus());
|
|
1267
1531
|
return ret_iter;
|
|
1268
1532
|
}
|
|
1269
1533
|
if (num_restarts_ == 0) {
|
|
@@ -1276,10 +1540,12 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
|
|
|
1276
1540
|
read_amp_bitmap_.get(), block_contents_pinned,
|
|
1277
1541
|
user_defined_timestamps_persisted,
|
|
1278
1542
|
data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
|
|
1279
|
-
protection_bytes_per_key_, kv_checksum_, block_restart_interval_
|
|
1543
|
+
protection_bytes_per_key_, kv_checksum_, block_restart_interval_,
|
|
1544
|
+
values_section_);
|
|
1280
1545
|
if (read_amp_bitmap_) {
|
|
1281
1546
|
if (read_amp_bitmap_->GetStatistics() != stats) {
|
|
1282
|
-
// DB changed the Statistics pointer, we need to notify
|
|
1547
|
+
// DB changed the Statistics pointer, we need to notify
|
|
1548
|
+
// read_amp_bitmap_
|
|
1283
1549
|
read_amp_bitmap_->SetStatistics(stats);
|
|
1284
1550
|
}
|
|
1285
1551
|
}
|
|
@@ -1293,7 +1559,8 @@ IndexBlockIter* Block::NewIndexIterator(
|
|
|
1293
1559
|
IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
|
|
1294
1560
|
bool have_first_key, bool key_includes_seq, bool value_is_full,
|
|
1295
1561
|
bool block_contents_pinned, bool user_defined_timestamps_persisted,
|
|
1296
|
-
BlockPrefixIndex* prefix_index
|
|
1562
|
+
BlockPrefixIndex* prefix_index,
|
|
1563
|
+
BlockBasedTableOptions::BlockSearchType index_block_search_type) {
|
|
1297
1564
|
IndexBlockIter* ret_iter;
|
|
1298
1565
|
if (iter != nullptr) {
|
|
1299
1566
|
ret_iter = iter;
|
|
@@ -1301,7 +1568,7 @@ IndexBlockIter* Block::NewIndexIterator(
|
|
|
1301
1568
|
ret_iter = new IndexBlockIter;
|
|
1302
1569
|
}
|
|
1303
1570
|
if (size() < 2 * sizeof(uint32_t)) {
|
|
1304
|
-
ret_iter->Invalidate(
|
|
1571
|
+
ret_iter->Invalidate(GetCorruptionStatus());
|
|
1305
1572
|
return ret_iter;
|
|
1306
1573
|
}
|
|
1307
1574
|
if (num_restarts_ == 0) {
|
|
@@ -1311,11 +1578,23 @@ IndexBlockIter* Block::NewIndexIterator(
|
|
|
1311
1578
|
} else {
|
|
1312
1579
|
BlockPrefixIndex* prefix_index_ptr =
|
|
1313
1580
|
total_order_seek ? nullptr : prefix_index;
|
|
1581
|
+
|
|
1582
|
+
// Resolve kAuto to a concrete search type based on the block's
|
|
1583
|
+
// uniformity flag. Interpolation search requires bytewise comparator;
|
|
1584
|
+
// fall back to binary search otherwise.
|
|
1585
|
+
auto resolved_search_type = index_block_search_type;
|
|
1586
|
+
if (resolved_search_type == BlockBasedTableOptions::kAuto) {
|
|
1587
|
+
resolved_search_type = (is_uniform_ && raw_ucmp == BytewiseComparator())
|
|
1588
|
+
? BlockBasedTableOptions::kInterpolation
|
|
1589
|
+
: BlockBasedTableOptions::kBinary;
|
|
1590
|
+
}
|
|
1591
|
+
|
|
1314
1592
|
ret_iter->Initialize(
|
|
1315
1593
|
raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
|
|
1316
1594
|
prefix_index_ptr, have_first_key, key_includes_seq, value_is_full,
|
|
1317
1595
|
block_contents_pinned, user_defined_timestamps_persisted,
|
|
1318
|
-
protection_bytes_per_key_, kv_checksum_, block_restart_interval_
|
|
1596
|
+
protection_bytes_per_key_, kv_checksum_, block_restart_interval_,
|
|
1597
|
+
values_section_, resolved_search_type);
|
|
1319
1598
|
}
|
|
1320
1599
|
|
|
1321
1600
|
return ret_iter;
|