@nxtedition/rocksdb 15.4.1 → 15.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +24 -15
- package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
- package/deps/rocksdb/rocksdb/BUCK +42 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
- package/deps/rocksdb/rocksdb/Makefile +59 -32
- package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
- package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
- package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
- package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
- package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
- package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
- package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
- package/deps/rocksdb/rocksdb/db/builder.h +7 -0
- package/deps/rocksdb/rocksdb/db/c.cc +373 -57
- package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
- package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
- package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
- package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
- package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
- package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
- package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
- package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
- package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
- package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
- package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
- package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
- package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
- package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
- package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
- package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
- package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
- package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
- package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
- package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
- package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
- package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
- package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
- package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
- package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
- package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
- package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
- package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
- package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
- package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
- package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
- package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
- package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
- package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
- package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
- package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
- package/deps/rocksdb/rocksdb/env/env.cc +1 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
- package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
- package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
- package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
- package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
- package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
- package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
- package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
- package/deps/rocksdb/rocksdb/folly.mk +22 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
- package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
- package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
- package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
- package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
- package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
- package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
- package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
- package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
- package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
- package/deps/rocksdb/rocksdb/options/options.cc +5 -1
- package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
- package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
- package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
- package/deps/rocksdb/rocksdb/port/lang.h +4 -0
- package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
- package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
- package/deps/rocksdb/rocksdb/src.mk +12 -0
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
- package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
- package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
- package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
- package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
- package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
- package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
- package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
- package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
- package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
- package/deps/rocksdb/rocksdb/table/format.cc +27 -15
- package/deps/rocksdb/rocksdb/table/format.h +41 -15
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
- package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
- package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
- package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
- package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
- package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
- package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
- package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
- package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
- package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
- package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
- package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
- package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
- package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
- package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
- package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
- package/deps/rocksdb/rocksdb/util/coding.h +14 -27
- package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
- package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
- package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
- package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
- package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
- package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
- package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
- package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
- package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
- package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
- package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
- package/deps/rocksdb/rocksdb/util/math.h +3 -1
- package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
- package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
- package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
- package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
- package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
- package/deps/rocksdb/rocksdb/util/status.cc +3 -1
- package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
- package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
- package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
- package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
- package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
- package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
- package/deps/rocksdb/rocksdb.gyp +7 -0
- package/iterator.js +2 -2
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
|
@@ -0,0 +1,2575 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
3
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
4
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
5
|
+
|
|
6
|
+
#include "utilities/trie_index/louds_trie.h"
|
|
7
|
+
|
|
8
|
+
#include <algorithm>
|
|
9
|
+
#include <cassert>
|
|
10
|
+
#include <cstring>
|
|
11
|
+
#include <limits>
|
|
12
|
+
#include <utility>
|
|
13
|
+
|
|
14
|
+
#include "util/coding.h"
|
|
15
|
+
|
|
16
|
+
namespace ROCKSDB_NAMESPACE {
|
|
17
|
+
namespace trie_index {
|
|
18
|
+
|
|
19
|
+
// Version number of the trie format.
static constexpr uint32_t kTrieFormatVersion{1};
// Magic number; its four bytes spell "TRIE" in ASCII
// ('T' = 0x54, 'R' = 0x52, 'I' = 0x49, 'E' = 0x45).
static constexpr uint32_t kTrieMagic{0x54524945};

// Header flags. The flags field is a 4-byte bitmask stored after
// dense_child_count in the serialized header.
// Bit 0: seqno encoding.
static constexpr uint32_t kFlagSeqnoEncoding{1u << 0};
|
|
25
|
+
|
|
26
|
+
// ============================================================================
|
|
27
|
+
// LoudsTrieBuilder implementation
|
|
28
|
+
// ============================================================================
|
|
29
|
+
|
|
30
|
+
LoudsTrieBuilder::LoudsTrieBuilder()
|
|
31
|
+
: cutoff_level_(0),
|
|
32
|
+
max_depth_(0),
|
|
33
|
+
dense_leaf_count_(0),
|
|
34
|
+
dense_node_count_(0),
|
|
35
|
+
dense_child_count_(0),
|
|
36
|
+
has_seqno_encoding_(false) {}
|
|
37
|
+
|
|
38
|
+
// Records one (key, handle) pair. The key bytes are copied into keys_ (the
// Slice is not retained) and the handle is appended to handles_, so the two
// containers grow in lock-step.
void LoudsTrieBuilder::AddKey(const Slice& key, const TrieBlockHandle& handle) {
  const char* const key_bytes = key.data();
  const size_t key_size = key.size();
  keys_.emplace_back(key_bytes, key_size);
  handles_.emplace_back(handle);
}
|
|
42
|
+
|
|
43
|
+
void LoudsTrieBuilder::AddKeyWithSeqno(const Slice& key,
|
|
44
|
+
const TrieBlockHandle& handle,
|
|
45
|
+
uint64_t seqno, uint32_t block_count) {
|
|
46
|
+
keys_.emplace_back(key.data(), key.size());
|
|
47
|
+
handles_.push_back(handle);
|
|
48
|
+
seqnos_.push_back(seqno);
|
|
49
|
+
block_counts_.push_back(block_count);
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
// Appends an overflow block: `handle` goes to overflow_handles_ and `seqno`
// to overflow_seqnos_.
//
// A seqno of 0 is accepted. It occurs when bottommost compaction zeroes all
// sequence numbers, in which case every block of the same-key run carries
// seqno=0. The reader's post-seek correction copes with that: a primary leaf
// with seqno=0 hits the "never advance" guard (the leaf_seqno != 0 check), so
// the seek returns the primary block. Next() walks overflow blocks by index
// rather than by seqno, so all blocks are still visited in order.
void LoudsTrieBuilder::AddOverflowBlock(const TrieBlockHandle& handle,
                                        uint64_t seqno) {
  overflow_handles_.emplace_back(handle);
  overflow_seqnos_.emplace_back(seqno);
}
|
|
63
|
+
|
|
64
|
+
// Compute the optimal cutoff level between dense and sparse encoding.
|
|
65
|
+
// Merges the node-per-level and label-per-level computations into a single
|
|
66
|
+
// pass over the key set, avoiding the redundant LCP computation that would
|
|
67
|
+
// occur if they were computed separately.
|
|
68
|
+
uint32_t LoudsTrieBuilder::ComputeCutoffLevel() const {
|
|
69
|
+
if (keys_.empty()) {
|
|
70
|
+
return 0;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
std::vector<uint64_t> nodes_per_level(max_depth_ + 1, 0);
|
|
74
|
+
std::vector<uint64_t> labels_per_level(max_depth_ + 1, 0);
|
|
75
|
+
nodes_per_level[0] = 1;
|
|
76
|
+
|
|
77
|
+
for (size_t i = 0; i < keys_.size(); i++) {
|
|
78
|
+
// Compute LCP with previous key (once per key pair).
|
|
79
|
+
uint32_t lcp = 0;
|
|
80
|
+
if (i > 0) {
|
|
81
|
+
const auto& prev = keys_[i - 1];
|
|
82
|
+
const auto& curr = keys_[i];
|
|
83
|
+
uint32_t min_len =
|
|
84
|
+
static_cast<uint32_t>(std::min(prev.size(), curr.size()));
|
|
85
|
+
while (lcp < min_len && prev[lcp] == curr[lcp]) {
|
|
86
|
+
lcp++;
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
uint32_t key_len = static_cast<uint32_t>(keys_[i].size());
|
|
91
|
+
|
|
92
|
+
// New nodes: levels (lcp+1) through key_len (same as before).
|
|
93
|
+
for (uint32_t l = lcp + 1; l <= key_len && l <= max_depth_; l++) {
|
|
94
|
+
nodes_per_level[l]++;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
// Labels: a label exists at level l if the key has a character at l
|
|
98
|
+
// and this label is distinct from the previous key's label (lcp <= l).
|
|
99
|
+
// For the first key (i==0), all levels contribute a label.
|
|
100
|
+
// Labels: count distinct labels per level. For sorted keys, a label at
|
|
101
|
+
// level l is "new" only if l >= lcp (the shared prefix with the previous
|
|
102
|
+
// key). For the first key, all levels contribute a new label.
|
|
103
|
+
uint32_t label_start = (i == 0) ? 0 : lcp;
|
|
104
|
+
for (uint32_t l = label_start; l < key_len && l <= max_depth_; l++) {
|
|
105
|
+
labels_per_level[l]++;
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// Find first level where sparse encoding is cheaper than dense.
|
|
110
|
+
for (uint32_t l = 0; l <= max_depth_; l++) {
|
|
111
|
+
uint64_t n = nodes_per_level[l];
|
|
112
|
+
uint64_t labels = labels_per_level[l];
|
|
113
|
+
if (n == 0) {
|
|
114
|
+
return l;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// Dense: 256 bits for d_labels_ + 1 bit for d_is_prefix_key_ per node,
|
|
118
|
+
// plus 1 bit for d_has_child_ per label.
|
|
119
|
+
uint64_t dense_cost = n * 257 + labels;
|
|
120
|
+
// Sparse: 8 bits (label byte) + 1 bit (s_has_child_) + 1 bit (s_louds_)
|
|
121
|
+
// per label, plus 1 bit for s_is_prefix_key_ per node.
|
|
122
|
+
uint64_t sparse_cost = labels * 10 + n;
|
|
123
|
+
|
|
124
|
+
if (sparse_cost < dense_cost) {
|
|
125
|
+
return l;
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
return max_depth_ + 1;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
void LoudsTrieBuilder::Finish() {
|
|
132
|
+
if (keys_.empty()) {
|
|
133
|
+
cutoff_level_ = 0;
|
|
134
|
+
max_depth_ = 0;
|
|
135
|
+
dense_leaf_count_ = 0;
|
|
136
|
+
dense_node_count_ = 0;
|
|
137
|
+
dense_child_count_ = 0;
|
|
138
|
+
SerializeAll();
|
|
139
|
+
return;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Compute max depth.
|
|
143
|
+
max_depth_ = 0;
|
|
144
|
+
for (const auto& key : keys_) {
|
|
145
|
+
max_depth_ = std::max(max_depth_, static_cast<uint32_t>(key.size()));
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// Determine cutoff level.
|
|
149
|
+
cutoff_level_ = ComputeCutoffLevel();
|
|
150
|
+
cutoff_level_ = std::min(cutoff_level_, max_depth_ + 1);
|
|
151
|
+
|
|
152
|
+
// =========================================================================
|
|
153
|
+
// Build per-level trie data directly from sorted keys (streaming approach).
|
|
154
|
+
//
|
|
155
|
+
// Instead of constructing an explicit trie with per-node heap-allocated
|
|
156
|
+
// children vectors and then BFS-encoding it, we infer the trie structure
|
|
157
|
+
// directly from the sorted key sequence using LCP (longest common prefix)
|
|
158
|
+
// analysis and build flat per-level arrays. This is the approach used by
|
|
159
|
+
// the SuRF reference implementation (Zhang et al., SIGMOD 2018).
|
|
160
|
+
//
|
|
161
|
+
// Memory advantage: flat per-level vectors avoid the O(total_nodes) per-node
|
|
162
|
+
// allocation overhead. For N keys with average depth D:
|
|
163
|
+
// - Explicit tree: ~48 bytes per node x N*D nodes = ~48*N*D bytes
|
|
164
|
+
// - Per-level: ~10 bytes per label + ~17 bytes per node, all in flat
|
|
165
|
+
// vectors with a single allocation per level.
|
|
166
|
+
//
|
|
167
|
+
// Algorithm: For each key, compute LCP with the previous key. The LCP
|
|
168
|
+
// determines which levels share the trie path (existing nodes) and where
|
|
169
|
+
// new branches start. At each level l >= lcp, the key contributes a new
|
|
170
|
+
// label to the current (last) node at that level.
|
|
171
|
+
//
|
|
172
|
+
// Three key mechanisms handle deferred information:
|
|
173
|
+
// 1. Deferred internal marking: when a label is first added, we mark it
|
|
174
|
+
// as a leaf (has_child=false). If a subsequent key continues the path
|
|
175
|
+
// through this label, we retroactively mark it as internal.
|
|
176
|
+
// 2. Handle migration: when a leaf becomes internal (i.e., a prefix key
|
|
177
|
+
// emerges), the handle moves from the label's leaf_handle to the
|
|
178
|
+
// child node's prefix_handle.
|
|
179
|
+
// 3. Lazy node creation: terminal nodes at depth K (where a key of
|
|
180
|
+
// length K ends) are created on-demand when retroactive marking
|
|
181
|
+
// needs them.
|
|
182
|
+
// =========================================================================
|
|
183
|
+
|
|
184
|
+
// Per-level data: flat arrays for labels, has_child, and handle tracking.
|
|
185
|
+
// All labels within a level are concatenated; node boundaries are tracked
|
|
186
|
+
// by node_label_start (the index where each node's labels begin).
|
|
187
|
+
struct PerLevelData {
|
|
188
|
+
// ---- Per-label data (all nodes concatenated) ----
|
|
189
|
+
std::vector<uint8_t> labels;
|
|
190
|
+
// Using uint8_t instead of bool to avoid std::vector<bool> proxy issues.
|
|
191
|
+
std::vector<uint8_t> has_child;
|
|
192
|
+
// Handle index for leaf labels (-1 if internal). When has_child[i] is
|
|
193
|
+
// false, leaf_handle[i] is the key index whose handle should be emitted
|
|
194
|
+
// for this leaf child.
|
|
195
|
+
std::vector<int64_t> leaf_handle;
|
|
196
|
+
|
|
197
|
+
// ---- Per-node data ----
|
|
198
|
+
std::vector<uint8_t> is_prefix;
|
|
199
|
+
// Handle index for prefix key nodes (-1 if not a prefix key).
|
|
200
|
+
std::vector<int64_t> prefix_handle;
|
|
201
|
+
// Index in labels[] where each node's labels start. Used to derive
|
|
202
|
+
// louds bits and to iterate over nodes during LOUDS encoding.
|
|
203
|
+
std::vector<uint64_t> node_label_start;
|
|
204
|
+
|
|
205
|
+
uint64_t node_count() const {
|
|
206
|
+
return static_cast<uint64_t>(is_prefix.size());
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
// Start a new node. Call this before adding labels to the node.
|
|
210
|
+
void StartNode() {
|
|
211
|
+
node_label_start.push_back(static_cast<uint64_t>(labels.size()));
|
|
212
|
+
is_prefix.push_back(false);
|
|
213
|
+
prefix_handle.push_back(-1);
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
// Add a label to the current (last) node.
|
|
217
|
+
void AddLabel(uint8_t label, bool is_internal, int64_t handle) {
|
|
218
|
+
labels.push_back(label);
|
|
219
|
+
has_child.push_back(is_internal);
|
|
220
|
+
leaf_handle.push_back(is_internal ? -1 : handle);
|
|
221
|
+
}
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
std::vector<PerLevelData> levels(max_depth_ + 1);
|
|
225
|
+
|
|
226
|
+
// Initialize root node at level 0.
|
|
227
|
+
levels[0].StartNode();
|
|
228
|
+
|
|
229
|
+
for (size_t ki = 0; ki < keys_.size(); ki++) {
|
|
230
|
+
const auto& key = keys_[ki];
|
|
231
|
+
uint32_t K = static_cast<uint32_t>(key.size());
|
|
232
|
+
|
|
233
|
+
// Compute LCP with previous key.
|
|
234
|
+
uint32_t lcp = 0;
|
|
235
|
+
if (ki > 0) {
|
|
236
|
+
const auto& prev = keys_[ki - 1];
|
|
237
|
+
uint32_t min_len =
|
|
238
|
+
static_cast<uint32_t>(std::min(prev.size(), key.size()));
|
|
239
|
+
while (lcp < min_len && prev[lcp] == key[lcp]) {
|
|
240
|
+
lcp++;
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
// Step 1: Retroactive internal marking.
|
|
245
|
+
//
|
|
246
|
+
// If lcp > 0, the label at level lcp-1 (the deepest shared label) may
|
|
247
|
+
// need to transition from leaf to internal. This happens when:
|
|
248
|
+
// - The previous key's path ended at or before level lcp (the label
|
|
249
|
+
// was originally a leaf), AND
|
|
250
|
+
// - The current key continues deeper through the same child.
|
|
251
|
+
//
|
|
252
|
+
// Since keys are sorted and lcp is the shared prefix length, the label
|
|
253
|
+
// at level lcp-1 is always the LAST label in the current (last) node at
|
|
254
|
+
// that level. We only need to mark this one label.
|
|
255
|
+
if (lcp > 0) {
|
|
256
|
+
auto& pl = levels[lcp - 1];
|
|
257
|
+
if (!pl.has_child.empty() && !pl.has_child.back()) {
|
|
258
|
+
// Leaf → internal transition.
|
|
259
|
+
pl.has_child.back() = true;
|
|
260
|
+
|
|
261
|
+
// Handle migration: move the leaf's handle to a NEW child node at
|
|
262
|
+
// level lcp as a prefix key. The child node is always new because
|
|
263
|
+
// this label just transitioned from leaf to internal — no child
|
|
264
|
+
// node existed before for this branch.
|
|
265
|
+
//
|
|
266
|
+
// The leaf_handle must be valid (>= 0) because the label was marked
|
|
267
|
+
// as a leaf in Step 2, which always sets the handle for leaf labels.
|
|
268
|
+
assert(pl.leaf_handle.back() >= 0);
|
|
269
|
+
int64_t h = pl.leaf_handle.back();
|
|
270
|
+
pl.leaf_handle.back() = -1;
|
|
271
|
+
|
|
272
|
+
if (lcp <= max_depth_) {
|
|
273
|
+
levels[lcp].StartNode();
|
|
274
|
+
levels[lcp].is_prefix.back() = true;
|
|
275
|
+
levels[lcp].prefix_handle.back() = h;
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// Step 2: Add labels for levels lcp..K-1.
|
|
281
|
+
//
|
|
282
|
+
// At level lcp: add a label to the existing (last) node.
|
|
283
|
+
// At levels > lcp: create a new node and add its first label.
|
|
284
|
+
for (uint32_t l = lcp; l < K; l++) {
|
|
285
|
+
uint8_t label = static_cast<uint8_t>(key[l]);
|
|
286
|
+
bool is_internal = (l + 1 < K); // Key continues to next level.
|
|
287
|
+
|
|
288
|
+
if (l == lcp) {
|
|
289
|
+
// Adding to existing node. Verify it exists.
|
|
290
|
+
assert(levels[l].node_count() > 0);
|
|
291
|
+
} else {
|
|
292
|
+
// New child node at this level.
|
|
293
|
+
levels[l].StartNode();
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
// Associate handle with the last label if this is a leaf (key ends).
|
|
297
|
+
int64_t handle = is_internal ? -1 : static_cast<int64_t>(ki);
|
|
298
|
+
levels[l].AddLabel(label, is_internal, handle);
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
// =========================================================================
|
|
303
|
+
// Phase 2: Build LOUDS bitvectors and reorder handles from per-level data.
|
|
304
|
+
//
|
|
305
|
+
// The per-level data is already in BFS order (nodes at each level are
|
|
306
|
+
// created left-to-right as sorted keys are processed). We iterate by
|
|
307
|
+
// level and by node within each level:
|
|
308
|
+
// - For each node: emit prefix key handle (if any), then for each label:
|
|
309
|
+
// if leaf → emit handle; if internal → skip (child visited at next
|
|
310
|
+
// level).
|
|
311
|
+
// - Build dense/sparse bitvectors from labels and has_child flags.
|
|
312
|
+
// =========================================================================
|
|
313
|
+
|
|
314
|
+
d_labels_ = BitvectorBuilder();
|
|
315
|
+
d_has_child_ = BitvectorBuilder();
|
|
316
|
+
d_is_prefix_key_ = BitvectorBuilder();
|
|
317
|
+
dense_node_count_ = 0;
|
|
318
|
+
dense_child_count_ = 0;
|
|
319
|
+
|
|
320
|
+
s_labels_.clear();
|
|
321
|
+
s_has_child_ = BitvectorBuilder();
|
|
322
|
+
s_louds_ = BitvectorBuilder();
|
|
323
|
+
s_is_prefix_key_ = BitvectorBuilder();
|
|
324
|
+
|
|
325
|
+
dense_leaf_count_ = 0;
|
|
326
|
+
|
|
327
|
+
std::vector<TrieBlockHandle> bfs_ordered_handles;
|
|
328
|
+
bfs_ordered_handles.reserve(keys_.size());
|
|
329
|
+
// BFS-ordered seqno side-table data (same reordering as handles).
|
|
330
|
+
// Only used when has_seqno_encoding_ is true.
|
|
331
|
+
std::vector<uint64_t> bfs_ordered_seqnos;
|
|
332
|
+
std::vector<uint32_t> bfs_ordered_block_counts;
|
|
333
|
+
// Overflow arrays must also be BFS-reordered. Without this, the
|
|
334
|
+
// overflow_base_ prefix sum (computed from BFS-ordered block_counts)
|
|
335
|
+
// would index into key-sorted overflow arrays, mapping overflow blocks
|
|
336
|
+
// to the wrong leaves when BFS order differs from key-sorted order
|
|
337
|
+
// (which happens whenever separator keys have different lengths).
|
|
338
|
+
std::vector<uint32_t> key_sorted_overflow_base;
|
|
339
|
+
std::vector<TrieBlockHandle> bfs_ordered_overflow_handles;
|
|
340
|
+
std::vector<uint64_t> bfs_ordered_overflow_seqnos;
|
|
341
|
+
if (has_seqno_encoding_) {
|
|
342
|
+
bfs_ordered_seqnos.reserve(keys_.size());
|
|
343
|
+
bfs_ordered_block_counts.reserve(keys_.size());
|
|
344
|
+
if (!overflow_handles_.empty()) {
|
|
345
|
+
key_sorted_overflow_base.resize(keys_.size());
|
|
346
|
+
uint32_t sum = 0;
|
|
347
|
+
for (size_t i = 0; i < keys_.size(); i++) {
|
|
348
|
+
key_sorted_overflow_base[i] = sum;
|
|
349
|
+
assert(block_counts_[i] >= 1);
|
|
350
|
+
sum += block_counts_[i] - 1;
|
|
351
|
+
}
|
|
352
|
+
assert(sum == static_cast<uint32_t>(overflow_handles_.size()));
|
|
353
|
+
bfs_ordered_overflow_handles.reserve(overflow_handles_.size());
|
|
354
|
+
bfs_ordered_overflow_seqnos.reserve(overflow_seqnos_.size());
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
// Emit BFS-ordered data for a leaf with key index ki: primary handle,
|
|
359
|
+
// seqno side-table fields, and overflow blocks (if any).
|
|
360
|
+
auto emit_leaf = [&](size_t ki) {
|
|
361
|
+
bfs_ordered_handles.push_back(handles_[ki]);
|
|
362
|
+
if (has_seqno_encoding_) {
|
|
363
|
+
bfs_ordered_seqnos.push_back(seqnos_[ki]);
|
|
364
|
+
bfs_ordered_block_counts.push_back(block_counts_[ki]);
|
|
365
|
+
if (!key_sorted_overflow_base.empty() && block_counts_[ki] > 1) {
|
|
366
|
+
uint32_t base = key_sorted_overflow_base[ki];
|
|
367
|
+
uint32_t count = block_counts_[ki] - 1;
|
|
368
|
+
for (uint32_t j = 0; j < count; j++) {
|
|
369
|
+
bfs_ordered_overflow_handles.push_back(overflow_handles_[base + j]);
|
|
370
|
+
bfs_ordered_overflow_seqnos.push_back(overflow_seqnos_[base + j]);
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
};
|
|
375
|
+
|
|
376
|
+
for (uint32_t level = 0; level <= max_depth_; level++) {
|
|
377
|
+
const auto& ld = levels[level];
|
|
378
|
+
if (ld.node_count() == 0) {
|
|
379
|
+
continue;
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
bool is_dense = (level < cutoff_level_);
|
|
383
|
+
|
|
384
|
+
for (uint64_t ni = 0; ni < ld.node_count(); ni++) {
|
|
385
|
+
// Determine label range for this node.
|
|
386
|
+
uint64_t label_start = ld.node_label_start[ni];
|
|
387
|
+
uint64_t label_end = (ni + 1 < ld.node_count())
|
|
388
|
+
? ld.node_label_start[ni + 1]
|
|
389
|
+
: static_cast<uint64_t>(ld.labels.size());
|
|
390
|
+
|
|
391
|
+
// ---- Handle reordering: emit prefix key handle ----
|
|
392
|
+
if (ld.is_prefix[ni] && ld.prefix_handle[ni] >= 0) {
|
|
393
|
+
emit_leaf(static_cast<size_t>(ld.prefix_handle[ni]));
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// Skip pure leaf nodes (no labels) — they are accounted for by
|
|
397
|
+
// has_child=0 in their parent. They don't produce LOUDS entries.
|
|
398
|
+
if (label_start == label_end) {
|
|
399
|
+
if (ld.is_prefix[ni]) {
|
|
400
|
+
// Prefix key node with no children at THIS level (but it was
|
|
401
|
+
// already counted above). This can happen for nodes created by
|
|
402
|
+
// lazy creation that never gained children.
|
|
403
|
+
if (is_dense) {
|
|
404
|
+
dense_leaf_count_++;
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
continue;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
if (is_dense) {
|
|
411
|
+
// ---- Dense encoding ----
|
|
412
|
+
dense_node_count_++;
|
|
413
|
+
d_is_prefix_key_.Append(ld.is_prefix[ni]);
|
|
414
|
+
|
|
415
|
+
// Build 256-bit label bitmap as 4 x 64-bit words.
|
|
416
|
+
uint64_t bitmap[4] = {};
|
|
417
|
+
for (uint64_t li = label_start; li < label_end; li++) {
|
|
418
|
+
uint8_t label = ld.labels[li];
|
|
419
|
+
bitmap[label / 64] |= uint64_t(1) << (label % 64);
|
|
420
|
+
}
|
|
421
|
+
for (int w = 0; w < 4; w++) {
|
|
422
|
+
d_labels_.AppendWord(bitmap[w], 64);
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
// Emit has_child bits, count leaves, emit leaf handles.
|
|
426
|
+
for (uint64_t li = label_start; li < label_end; li++) {
|
|
427
|
+
bool is_internal = ld.has_child[li];
|
|
428
|
+
d_has_child_.Append(is_internal);
|
|
429
|
+
|
|
430
|
+
if (!is_internal) {
|
|
431
|
+
if (ld.leaf_handle[li] >= 0) {
|
|
432
|
+
emit_leaf(static_cast<size_t>(ld.leaf_handle[li]));
|
|
433
|
+
}
|
|
434
|
+
dense_leaf_count_++;
|
|
435
|
+
} else if (level == cutoff_level_ - 1) {
|
|
436
|
+
dense_child_count_++;
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
if (ld.is_prefix[ni]) {
|
|
441
|
+
dense_leaf_count_++;
|
|
442
|
+
}
|
|
443
|
+
} else {
|
|
444
|
+
// ---- Sparse encoding ----
|
|
445
|
+
s_is_prefix_key_.Append(ld.is_prefix[ni]);
|
|
446
|
+
bool first_label = true;
|
|
447
|
+
for (uint64_t li = label_start; li < label_end; li++) {
|
|
448
|
+
bool is_internal = ld.has_child[li];
|
|
449
|
+
|
|
450
|
+
s_labels_.push_back(ld.labels[li]);
|
|
451
|
+
s_has_child_.Append(is_internal);
|
|
452
|
+
s_louds_.Append(first_label);
|
|
453
|
+
first_label = false;
|
|
454
|
+
|
|
455
|
+
if (!is_internal) {
|
|
456
|
+
if (ld.leaf_handle[li] >= 0) {
|
|
457
|
+
emit_leaf(static_cast<size_t>(ld.leaf_handle[li]));
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
// When cutoff_level_ = 0, all nodes are sparse and the root is the sole
|
|
466
|
+
// "root sparse node" (not a child of any dense level).
|
|
467
|
+
if (cutoff_level_ == 0) {
|
|
468
|
+
dense_child_count_ = 1;
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
assert(bfs_ordered_handles.size() == keys_.size());
|
|
472
|
+
handles_ = std::move(bfs_ordered_handles);
|
|
473
|
+
if (has_seqno_encoding_) {
|
|
474
|
+
assert(bfs_ordered_seqnos.size() == keys_.size());
|
|
475
|
+
assert(bfs_ordered_block_counts.size() == keys_.size());
|
|
476
|
+
seqnos_ = std::move(bfs_ordered_seqnos);
|
|
477
|
+
block_counts_ = std::move(bfs_ordered_block_counts);
|
|
478
|
+
if (!overflow_handles_.empty()) {
|
|
479
|
+
assert(bfs_ordered_overflow_handles.size() == overflow_handles_.size());
|
|
480
|
+
assert(bfs_ordered_overflow_seqnos.size() == overflow_seqnos_.size());
|
|
481
|
+
overflow_handles_ = std::move(bfs_ordered_overflow_handles);
|
|
482
|
+
overflow_seqnos_ = std::move(bfs_ordered_overflow_seqnos);
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
SerializeAll();
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
void LoudsTrieBuilder::SerializeAll() {
|
|
490
|
+
serialized_data_.clear();
|
|
491
|
+
|
|
492
|
+
PutFixed32(&serialized_data_, kTrieMagic);
|
|
493
|
+
PutFixed32(&serialized_data_, kTrieFormatVersion);
|
|
494
|
+
PutFixed64(&serialized_data_, keys_.size());
|
|
495
|
+
PutFixed32(&serialized_data_, cutoff_level_);
|
|
496
|
+
PutFixed32(&serialized_data_, max_depth_);
|
|
497
|
+
PutFixed64(&serialized_data_, dense_leaf_count_);
|
|
498
|
+
PutFixed64(&serialized_data_, dense_node_count_);
|
|
499
|
+
PutFixed64(&serialized_data_, dense_child_count_);
|
|
500
|
+
|
|
501
|
+
// Flags field. Contains feature bits that inform the reader about
|
|
502
|
+
// encoding decisions made during building.
|
|
503
|
+
uint32_t flags = 0;
|
|
504
|
+
if (has_seqno_encoding_) {
|
|
505
|
+
flags |= kFlagSeqnoEncoding;
|
|
506
|
+
}
|
|
507
|
+
PutFixed32(&serialized_data_, flags);
|
|
508
|
+
// 4 bytes of reserved padding to maintain 8-byte alignment (56-byte header).
|
|
509
|
+
PutFixed32(&serialized_data_, 0);
|
|
510
|
+
|
|
511
|
+
// Dense section.
|
|
512
|
+
{
|
|
513
|
+
Bitvector bv;
|
|
514
|
+
bv.BuildFrom(d_labels_);
|
|
515
|
+
bv.EncodeTo(&serialized_data_);
|
|
516
|
+
}
|
|
517
|
+
{
|
|
518
|
+
Bitvector bv;
|
|
519
|
+
bv.BuildFrom(d_has_child_);
|
|
520
|
+
bv.EncodeTo(&serialized_data_);
|
|
521
|
+
}
|
|
522
|
+
{
|
|
523
|
+
Bitvector bv;
|
|
524
|
+
bv.BuildFrom(d_is_prefix_key_);
|
|
525
|
+
bv.EncodeTo(&serialized_data_);
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
// Sparse section.
|
|
529
|
+
uint64_t sl_size = s_labels_.size();
|
|
530
|
+
PutFixed64(&serialized_data_, sl_size);
|
|
531
|
+
if (sl_size > 0) {
|
|
532
|
+
serialized_data_.append(reinterpret_cast<const char*>(s_labels_.data()),
|
|
533
|
+
sl_size);
|
|
534
|
+
size_t padding = (8 - (sl_size % 8)) % 8;
|
|
535
|
+
serialized_data_.append(padding, '\0');
|
|
536
|
+
}
|
|
537
|
+
Bitvector bv_s_has_child;
|
|
538
|
+
bv_s_has_child.BuildFrom(s_has_child_);
|
|
539
|
+
bv_s_has_child.EncodeTo(&serialized_data_);
|
|
540
|
+
|
|
541
|
+
Bitvector bv_s_louds;
|
|
542
|
+
bv_s_louds.BuildFrom(s_louds_);
|
|
543
|
+
bv_s_louds.EncodeTo(&serialized_data_);
|
|
544
|
+
|
|
545
|
+
{
|
|
546
|
+
Bitvector bv;
|
|
547
|
+
bv.BuildFrom(s_is_prefix_key_);
|
|
548
|
+
bv.EncodeTo(&serialized_data_);
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
// Child position lookup tables for Select-free sparse traversal.
|
|
552
|
+
// Compute and serialize: s_child_start_pos_[k] and s_child_end_pos_[k]
|
|
553
|
+
// for each internal label (has_child=1).
|
|
554
|
+
uint64_t num_internal = bv_s_has_child.NumOnes();
|
|
555
|
+
{
|
|
556
|
+
PutFixed64(&serialized_data_, num_internal);
|
|
557
|
+
|
|
558
|
+
if (num_internal > 0 && sl_size <= UINT32_MAX) {
|
|
559
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
560
|
+
uint64_t child_node = dense_child_count_ + k;
|
|
561
|
+
uint64_t child_start = bv_s_louds.FindNthOneBit(child_node);
|
|
562
|
+
PutFixed32(&serialized_data_, static_cast<uint32_t>(child_start));
|
|
563
|
+
}
|
|
564
|
+
// Pad to 8-byte alignment
|
|
565
|
+
size_t bytes_written = num_internal * sizeof(uint32_t);
|
|
566
|
+
size_t padding = (8 - (bytes_written % 8)) % 8;
|
|
567
|
+
serialized_data_.append(padding, '\0');
|
|
568
|
+
|
|
569
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
570
|
+
uint64_t child_node = dense_child_count_ + k;
|
|
571
|
+
uint64_t child_start = bv_s_louds.FindNthOneBit(child_node);
|
|
572
|
+
uint64_t child_end = bv_s_louds.NextSetBit(child_start + 1);
|
|
573
|
+
if (child_end > sl_size) {
|
|
574
|
+
child_end = sl_size;
|
|
575
|
+
}
|
|
576
|
+
PutFixed32(&serialized_data_, static_cast<uint32_t>(child_end));
|
|
577
|
+
}
|
|
578
|
+
// Pad to 8-byte alignment
|
|
579
|
+
bytes_written = num_internal * sizeof(uint32_t);
|
|
580
|
+
padding = (8 - (bytes_written % 8)) % 8;
|
|
581
|
+
serialized_data_.append(padding, '\0');
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
// Path compression: detect and serialize fanout-1 chains.
|
|
586
|
+
//
|
|
587
|
+
// A chain starts at the child of internal label k and consists of >= 2
|
|
588
|
+
// consecutive fanout-1 nodes (all internal except possibly the last,
|
|
589
|
+
// which may be a leaf). For each chain, we store:
|
|
590
|
+
// - suffix bytes (the label at each chain node)
|
|
591
|
+
// - chain length
|
|
592
|
+
// - the child_idx at the end of the chain (UINT32_MAX if ends at leaf)
|
|
593
|
+
//
|
|
594
|
+
// This lets Seek() skip entire chains with a single memcmp instead of
|
|
595
|
+
// traversing level by level with Rank1 at each step.
|
|
596
|
+
{
|
|
597
|
+
// Build child start/end arrays in memory for chain detection.
|
|
598
|
+
// These are the same values we just serialized above.
|
|
599
|
+
std::vector<uint32_t> child_starts(num_internal);
|
|
600
|
+
std::vector<uint32_t> child_ends(num_internal);
|
|
601
|
+
if (num_internal > 0 && sl_size <= UINT32_MAX) {
|
|
602
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
603
|
+
uint64_t child_node = dense_child_count_ + k;
|
|
604
|
+
uint64_t cs = bv_s_louds.FindNthOneBit(child_node);
|
|
605
|
+
child_starts[k] = static_cast<uint32_t>(cs);
|
|
606
|
+
uint64_t ce = bv_s_louds.NextSetBit(cs + 1);
|
|
607
|
+
if (ce > sl_size) {
|
|
608
|
+
ce = sl_size;
|
|
609
|
+
}
|
|
610
|
+
child_ends[k] = static_cast<uint32_t>(ce);
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
// For each internal label k, detect if its child starts a chain.
|
|
615
|
+
std::string chain_suffix_data;
|
|
616
|
+
// Offsets into chain_suffix_data for each internal label.
|
|
617
|
+
// UINT32_MAX means no chain.
|
|
618
|
+
std::vector<uint32_t> chain_offsets(num_internal, UINT32_MAX);
|
|
619
|
+
std::vector<uint16_t> chain_lens(num_internal, 0);
|
|
620
|
+
// child_idx at chain end (UINT32_MAX = leaf).
|
|
621
|
+
std::vector<uint32_t> chain_end_child_idx(num_internal, UINT32_MAX);
|
|
622
|
+
|
|
623
|
+
// Build s_is_prefix_key bitvector for prefix key checks during chain
|
|
624
|
+
// detection. Chains must not contain prefix key nodes because the chain
|
|
625
|
+
// skip logic bypasses prefix key checks.
|
|
626
|
+
Bitvector bv_s_is_prefix_key;
|
|
627
|
+
bv_s_is_prefix_key.BuildFrom(s_is_prefix_key_);
|
|
628
|
+
|
|
629
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
630
|
+
uint32_t cs = child_starts[k];
|
|
631
|
+
uint32_t ce = child_ends[k];
|
|
632
|
+
|
|
633
|
+
// Child must be fanout-1 (single label).
|
|
634
|
+
if (ce - cs != 1) {
|
|
635
|
+
continue;
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
// Check if the child's label is internal (has_child = 1).
|
|
639
|
+
if (!bv_s_has_child.GetBit(cs)) {
|
|
640
|
+
continue;
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
// Check if the child node is a prefix key — if so, cannot skip it.
|
|
644
|
+
{
|
|
645
|
+
uint64_t child_sparse_node = bv_s_louds.Rank1(cs + 1) - 1;
|
|
646
|
+
if (child_sparse_node < bv_s_is_prefix_key.NumBits() &&
|
|
647
|
+
bv_s_is_prefix_key.GetBit(child_sparse_node)) {
|
|
648
|
+
continue;
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
// Found a fanout-1 internal node. Follow the chain.
|
|
653
|
+
std::vector<uint8_t> suffix;
|
|
654
|
+
suffix.push_back(s_labels_[cs]);
|
|
655
|
+
|
|
656
|
+
// Get the child_idx of this chain node.
|
|
657
|
+
uint64_t cur_child_idx = bv_s_has_child.Rank1(cs + 1) - 1;
|
|
658
|
+
uint32_t last_child_idx =
|
|
659
|
+
UINT32_MAX; // Will be set to chain end's child.
|
|
660
|
+
|
|
661
|
+
while (true) {
|
|
662
|
+
// cur_child_idx is the index of the current chain node's internal
|
|
663
|
+
// label.
|
|
664
|
+
if (cur_child_idx >= num_internal) {
|
|
665
|
+
break;
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
uint32_t next_cs = child_starts[cur_child_idx];
|
|
669
|
+
uint32_t next_ce = child_ends[cur_child_idx];
|
|
670
|
+
|
|
671
|
+
if (next_ce - next_cs != 1) {
|
|
672
|
+
// Child has fanout > 1. Chain ends here at an internal node.
|
|
673
|
+
last_child_idx = static_cast<uint32_t>(cur_child_idx);
|
|
674
|
+
break;
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
// Check if next node is a prefix key — stop chain here.
|
|
678
|
+
{
|
|
679
|
+
uint64_t next_sparse_node = bv_s_louds.Rank1(next_cs + 1) - 1;
|
|
680
|
+
if (next_sparse_node < bv_s_is_prefix_key.NumBits() &&
|
|
681
|
+
bv_s_is_prefix_key.GetBit(next_sparse_node)) {
|
|
682
|
+
// End chain before this prefix key node.
|
|
683
|
+
last_child_idx = static_cast<uint32_t>(cur_child_idx);
|
|
684
|
+
break;
|
|
685
|
+
}
|
|
686
|
+
}
|
|
687
|
+
|
|
688
|
+
// Next child is also fanout-1. Check if internal or leaf.
|
|
689
|
+
suffix.push_back(s_labels_[next_cs]);
|
|
690
|
+
|
|
691
|
+
if (!bv_s_has_child.GetBit(next_cs)) {
|
|
692
|
+
// Chain ends at a leaf.
|
|
693
|
+
last_child_idx = UINT32_MAX;
|
|
694
|
+
break;
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
// Continue chaining.
|
|
698
|
+
cur_child_idx = bv_s_has_child.Rank1(next_cs + 1) - 1;
|
|
699
|
+
}
|
|
700
|
+
|
|
701
|
+
// Only store chains of meaningful length. Short chains don't save
|
|
702
|
+
// enough Rank1 calls to justify the metadata overhead (10 bytes per
|
|
703
|
+
// chain + suffix bytes).
|
|
704
|
+
static constexpr size_t kMinChainLength = 8;
|
|
705
|
+
if (suffix.size() >= kMinChainLength && suffix.size() <= UINT16_MAX) {
|
|
706
|
+
chain_offsets[k] = static_cast<uint32_t>(chain_suffix_data.size());
|
|
707
|
+
chain_lens[k] = static_cast<uint16_t>(suffix.size());
|
|
708
|
+
chain_end_child_idx[k] = last_child_idx;
|
|
709
|
+
chain_suffix_data.append(reinterpret_cast<const char*>(suffix.data()),
|
|
710
|
+
suffix.size());
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
// Chain cost/benefit filter: only emit chains when they provide a net
|
|
715
|
+
// speed benefit.
|
|
716
|
+
//
|
|
717
|
+
// Key insight: chains help when a Seek is *likely* to hit a chain.
|
|
718
|
+
// For tries with few chains relative to keys (e.g., numeric keys with
|
|
719
|
+
// a single long shared-prefix chain), every Seek benefits. For tries
|
|
720
|
+
// with many chains relative to keys (e.g., random hex with thousands
|
|
721
|
+
// of short chains), each chain is rarely hit and the per-level bitmap
|
|
722
|
+
// check overhead outweighs the occasional hit.
|
|
723
|
+
//
|
|
724
|
+
// Metric: emit chains only when num_chains <= num_keys. When there are
|
|
725
|
+
// more chains than keys, most chains won't be hit by any Seek, making
|
|
726
|
+
// the bitmap overhead a net loss. Additionally, apply a space budget
|
|
727
|
+
// (10% of base trie size) to prevent excessive metadata.
|
|
728
|
+
{
|
|
729
|
+
static constexpr double kChainBudgetPct = 0.10;
|
|
730
|
+
static constexpr size_t kPerChainMetaBytes = 10; // 4 + 2 + 4
|
|
731
|
+
|
|
732
|
+
uint64_t candidate_count = 0;
|
|
733
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
734
|
+
if (chain_lens[k] > 0) {
|
|
735
|
+
candidate_count++;
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
|
|
739
|
+
uint64_t num_keys = handles_.size();
|
|
740
|
+
bool too_many_chains = (candidate_count > num_keys);
|
|
741
|
+
|
|
742
|
+
if (candidate_count == 0 || too_many_chains) {
|
|
743
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
744
|
+
chain_lens[k] = 0;
|
|
745
|
+
}
|
|
746
|
+
} else {
|
|
747
|
+
// Apply space budget to prevent excessive metadata even when
|
|
748
|
+
// the chain count is reasonable.
|
|
749
|
+
size_t base_trie_size = serialized_data_.size();
|
|
750
|
+
size_t budget = static_cast<size_t>(base_trie_size * kChainBudgetPct);
|
|
751
|
+
size_t bitmap_fixed_cost =
|
|
752
|
+
(num_internal + 7) / 8 + (num_internal / 512 + 1) * 8;
|
|
753
|
+
|
|
754
|
+
if (budget <= bitmap_fixed_cost) {
|
|
755
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
756
|
+
chain_lens[k] = 0;
|
|
757
|
+
}
|
|
758
|
+
} else {
|
|
759
|
+
size_t available = budget - bitmap_fixed_cost;
|
|
760
|
+
size_t total_cost = 0;
|
|
761
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
762
|
+
if (chain_lens[k] > 0) {
|
|
763
|
+
total_cost += kPerChainMetaBytes + chain_lens[k];
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
|
|
767
|
+
if (total_cost > available) {
|
|
768
|
+
// Over budget. Keep longest chains first.
|
|
769
|
+
struct ChainCandidate {
|
|
770
|
+
uint64_t idx;
|
|
771
|
+
size_t cost;
|
|
772
|
+
uint16_t length;
|
|
773
|
+
};
|
|
774
|
+
std::vector<ChainCandidate> candidates;
|
|
775
|
+
candidates.reserve(candidate_count);
|
|
776
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
777
|
+
if (chain_lens[k] > 0) {
|
|
778
|
+
candidates.push_back(
|
|
779
|
+
{k, kPerChainMetaBytes + chain_lens[k], chain_lens[k]});
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
std::sort(candidates.begin(), candidates.end(),
|
|
783
|
+
[](const ChainCandidate& a, const ChainCandidate& b) {
|
|
784
|
+
return a.length > b.length;
|
|
785
|
+
});
|
|
786
|
+
|
|
787
|
+
std::vector<bool> keep(num_internal, false);
|
|
788
|
+
size_t used = 0;
|
|
789
|
+
for (const auto& c : candidates) {
|
|
790
|
+
if (used + c.cost <= available) {
|
|
791
|
+
keep[c.idx] = true;
|
|
792
|
+
used += c.cost;
|
|
793
|
+
}
|
|
794
|
+
}
|
|
795
|
+
|
|
796
|
+
std::string new_suffix_data;
|
|
797
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
798
|
+
if (chain_lens[k] > 0 && !keep[k]) {
|
|
799
|
+
chain_lens[k] = 0;
|
|
800
|
+
chain_offsets[k] = UINT32_MAX;
|
|
801
|
+
chain_end_child_idx[k] = UINT32_MAX;
|
|
802
|
+
} else if (chain_lens[k] > 0) {
|
|
803
|
+
uint32_t old_off = chain_offsets[k];
|
|
804
|
+
chain_offsets[k] =
|
|
805
|
+
static_cast<uint32_t>(new_suffix_data.size());
|
|
806
|
+
new_suffix_data.append(chain_suffix_data.data() + old_off,
|
|
807
|
+
chain_lens[k]);
|
|
808
|
+
}
|
|
809
|
+
}
|
|
810
|
+
chain_suffix_data = std::move(new_suffix_data);
|
|
811
|
+
}
|
|
812
|
+
}
|
|
813
|
+
}
|
|
814
|
+
}
|
|
815
|
+
|
|
816
|
+
// Count actual chains and build bitmap + compact arrays.
|
|
817
|
+
uint64_t num_chains = 0;
|
|
818
|
+
BitvectorBuilder chain_bitmap_builder;
|
|
819
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
820
|
+
bool has_chain = (chain_lens[k] > 0);
|
|
821
|
+
chain_bitmap_builder.Append(has_chain);
|
|
822
|
+
if (has_chain) {
|
|
823
|
+
num_chains++;
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
// Serialize: num_chains, then bitmap + compact arrays if any.
|
|
828
|
+
PutFixed64(&serialized_data_, num_chains);
|
|
829
|
+
|
|
830
|
+
if (num_chains > 0) {
|
|
831
|
+
// Write chain bitmap (1 bit per internal label).
|
|
832
|
+
Bitvector chain_bitmap_bv;
|
|
833
|
+
chain_bitmap_bv.BuildFrom(chain_bitmap_builder);
|
|
834
|
+
chain_bitmap_bv.EncodeTo(&serialized_data_);
|
|
835
|
+
|
|
836
|
+
// Write compact chain_offsets (uint32_t per chain).
|
|
837
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
838
|
+
if (chain_lens[k] > 0) {
|
|
839
|
+
PutFixed32(&serialized_data_, chain_offsets[k]);
|
|
840
|
+
}
|
|
841
|
+
}
|
|
842
|
+
size_t bytes_written = num_chains * sizeof(uint32_t);
|
|
843
|
+
size_t padding = (8 - (bytes_written % 8)) % 8;
|
|
844
|
+
serialized_data_.append(padding, '\0');
|
|
845
|
+
|
|
846
|
+
// Write compact chain_lens (uint16_t per chain).
|
|
847
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
848
|
+
if (chain_lens[k] > 0) {
|
|
849
|
+
serialized_data_.append(reinterpret_cast<const char*>(&chain_lens[k]),
|
|
850
|
+
sizeof(uint16_t));
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
bytes_written = num_chains * sizeof(uint16_t);
|
|
854
|
+
padding = (8 - (bytes_written % 8)) % 8;
|
|
855
|
+
serialized_data_.append(padding, '\0');
|
|
856
|
+
|
|
857
|
+
// Write compact chain_end_child_idx (uint32_t per chain).
|
|
858
|
+
for (uint64_t k = 0; k < num_internal; k++) {
|
|
859
|
+
if (chain_lens[k] > 0) {
|
|
860
|
+
PutFixed32(&serialized_data_, chain_end_child_idx[k]);
|
|
861
|
+
}
|
|
862
|
+
}
|
|
863
|
+
bytes_written = num_chains * sizeof(uint32_t);
|
|
864
|
+
padding = (8 - (bytes_written % 8)) % 8;
|
|
865
|
+
serialized_data_.append(padding, '\0');
|
|
866
|
+
|
|
867
|
+
// Write suffix data blob.
|
|
868
|
+
uint64_t chain_data_size = chain_suffix_data.size();
|
|
869
|
+
PutFixed64(&serialized_data_, chain_data_size);
|
|
870
|
+
serialized_data_.append(chain_suffix_data);
|
|
871
|
+
padding = (8 - (chain_data_size % 8)) % 8;
|
|
872
|
+
serialized_data_.append(padding, '\0');
|
|
873
|
+
}
|
|
874
|
+
}
|
|
875
|
+
|
|
876
|
+
// Block handles: packed uint32_t arrays for offsets and sizes.
|
|
877
|
+
// BFS leaf order does not match key-sorted order for keys of different
|
|
878
|
+
// lengths, so offsets are NOT monotone and cannot use Elias-Fano.
|
|
879
|
+
{
|
|
880
|
+
if (!handles_.empty()) {
|
|
881
|
+
size_t n = handles_.size();
|
|
882
|
+
// Write offsets array (uint32_t, padded to 8-byte alignment).
|
|
883
|
+
for (size_t i = 0; i < n; i++) {
|
|
884
|
+
assert(handles_[i].offset <= UINT32_MAX);
|
|
885
|
+
PutFixed32(&serialized_data_,
|
|
886
|
+
static_cast<uint32_t>(handles_[i].offset));
|
|
887
|
+
}
|
|
888
|
+
size_t bytes_written = n * sizeof(uint32_t);
|
|
889
|
+
size_t padding = (8 - (bytes_written % 8)) % 8;
|
|
890
|
+
serialized_data_.append(padding, '\0');
|
|
891
|
+
|
|
892
|
+
// Write sizes array (uint32_t, padded to 8-byte alignment).
|
|
893
|
+
for (size_t i = 0; i < n; i++) {
|
|
894
|
+
assert(handles_[i].size <= UINT32_MAX);
|
|
895
|
+
PutFixed32(&serialized_data_, static_cast<uint32_t>(handles_[i].size));
|
|
896
|
+
}
|
|
897
|
+
bytes_written = n * sizeof(uint32_t);
|
|
898
|
+
padding = (8 - (bytes_written % 8)) % 8;
|
|
899
|
+
serialized_data_.append(padding, '\0');
|
|
900
|
+
}
|
|
901
|
+
}
|
|
902
|
+
|
|
903
|
+
// =========================================================================
|
|
904
|
+
// Seqno side-table: serialized after handle arrays when kFlagSeqnoEncoding
|
|
905
|
+
// is set. Contains per-leaf seqno/block_count data in BFS leaf order, plus
|
|
906
|
+
// overflow block handles/seqnos for same-user-key runs.
|
|
907
|
+
//
|
|
908
|
+
// The trie always stores user-key-only separators. When same-user-key
|
|
909
|
+
// boundaries exist, duplicate separator keys are de-duplicated (only the
|
|
910
|
+
// first occurrence goes in the trie). Overflow blocks (the 2nd, 3rd, ...
|
|
911
|
+
// blocks in a same-key run) are stored here in the side-table. After a
|
|
912
|
+
// trie Seek lands on a leaf, the iterator uses the seqno data to determine
|
|
913
|
+
// whether to advance through overflow blocks.
|
|
914
|
+
//
|
|
915
|
+
// Format:
|
|
916
|
+
// num_overflow_blocks: uint32_t
|
|
917
|
+
// padding: uint32_t (0, for 8-byte alignment)
|
|
918
|
+
// leaf_seqnos: uint64_t[num_keys] (BFS order)
|
|
919
|
+
// leaf_block_counts: uint32_t[num_keys] (BFS order, padded to 8-byte)
|
|
920
|
+
// overflow_offsets: uint32_t[num_overflow] (padded to 8-byte)
|
|
921
|
+
// overflow_sizes: uint32_t[num_overflow] (padded to 8-byte)
|
|
922
|
+
// overflow_seqnos: uint64_t[num_overflow]
|
|
923
|
+
// =========================================================================
|
|
924
|
+
if (has_seqno_encoding_) {
|
|
925
|
+
uint32_t num_overflow = static_cast<uint32_t>(overflow_handles_.size());
|
|
926
|
+
PutFixed32(&serialized_data_, num_overflow);
|
|
927
|
+
PutFixed32(&serialized_data_, 0); // padding for 8-byte alignment
|
|
928
|
+
|
|
929
|
+
size_t n = handles_.size();
|
|
930
|
+
|
|
931
|
+
// leaf_seqnos: uint64_t per leaf (BFS order)
|
|
932
|
+
for (size_t i = 0; i < n; i++) {
|
|
933
|
+
PutFixed64(&serialized_data_, seqnos_[i]);
|
|
934
|
+
}
|
|
935
|
+
|
|
936
|
+
// leaf_block_counts: uint32_t per leaf (BFS order, padded)
|
|
937
|
+
for (size_t i = 0; i < n; i++) {
|
|
938
|
+
PutFixed32(&serialized_data_, block_counts_[i]);
|
|
939
|
+
}
|
|
940
|
+
{
|
|
941
|
+
size_t bytes_written = n * sizeof(uint32_t);
|
|
942
|
+
size_t padding = (8 - (bytes_written % 8)) % 8;
|
|
943
|
+
serialized_data_.append(padding, '\0');
|
|
944
|
+
}
|
|
945
|
+
|
|
946
|
+
// overflow_offsets: uint32_t per overflow (padded)
|
|
947
|
+
for (uint32_t i = 0; i < num_overflow; i++) {
|
|
948
|
+
assert(overflow_handles_[i].offset <= UINT32_MAX);
|
|
949
|
+
PutFixed32(&serialized_data_,
|
|
950
|
+
static_cast<uint32_t>(overflow_handles_[i].offset));
|
|
951
|
+
}
|
|
952
|
+
if (num_overflow > 0) {
|
|
953
|
+
size_t bytes_written = num_overflow * sizeof(uint32_t);
|
|
954
|
+
size_t padding = (8 - (bytes_written % 8)) % 8;
|
|
955
|
+
serialized_data_.append(padding, '\0');
|
|
956
|
+
}
|
|
957
|
+
|
|
958
|
+
// overflow_sizes: uint32_t per overflow (padded)
|
|
959
|
+
for (uint32_t i = 0; i < num_overflow; i++) {
|
|
960
|
+
assert(overflow_handles_[i].size <= UINT32_MAX);
|
|
961
|
+
PutFixed32(&serialized_data_,
|
|
962
|
+
static_cast<uint32_t>(overflow_handles_[i].size));
|
|
963
|
+
}
|
|
964
|
+
if (num_overflow > 0) {
|
|
965
|
+
size_t bytes_written = num_overflow * sizeof(uint32_t);
|
|
966
|
+
size_t padding = (8 - (bytes_written % 8)) % 8;
|
|
967
|
+
serialized_data_.append(padding, '\0');
|
|
968
|
+
}
|
|
969
|
+
|
|
970
|
+
// overflow_seqnos: uint64_t per overflow (naturally 8-byte aligned)
|
|
971
|
+
for (uint32_t i = 0; i < num_overflow; i++) {
|
|
972
|
+
PutFixed64(&serialized_data_, overflow_seqnos_[i]);
|
|
973
|
+
}
|
|
974
|
+
}
|
|
975
|
+
}
|
|
976
|
+
|
|
977
|
+
// ============================================================================
|
|
978
|
+
// LoudsTrie implementation
|
|
979
|
+
// ============================================================================
|
|
980
|
+
|
|
981
|
+
LoudsTrie::LoudsTrie()
|
|
982
|
+
: num_keys_(0),
|
|
983
|
+
cutoff_level_(0),
|
|
984
|
+
max_depth_(0),
|
|
985
|
+
has_seqno_encoding_(false),
|
|
986
|
+
dense_leaf_count_(0),
|
|
987
|
+
dense_node_count_(0),
|
|
988
|
+
dense_child_count_(0),
|
|
989
|
+
s_labels_data_(nullptr),
|
|
990
|
+
s_labels_size_(0),
|
|
991
|
+
s_chain_suffix_data_(nullptr),
|
|
992
|
+
s_chain_suffix_size_(0),
|
|
993
|
+
handle_offsets_(nullptr),
|
|
994
|
+
handle_sizes_(nullptr),
|
|
995
|
+
leaf_seqnos_(nullptr),
|
|
996
|
+
leaf_block_counts_(nullptr),
|
|
997
|
+
overflow_offsets_(nullptr),
|
|
998
|
+
overflow_sizes_(nullptr),
|
|
999
|
+
overflow_seqnos_(nullptr),
|
|
1000
|
+
num_overflow_blocks_(0) {}
|
|
1001
|
+
|
|
1002
|
+
// Parses a serialized trie index from `data` and validates it.
//
// The input is treated as untrusted: every count and offset read from it is
// range-checked before it is used for arithmetic, allocation, or indexing.
//
// Several members (sparse labels, chain suffix blob, handle arrays, seqno
// side-table arrays) are set to point directly into the parsed buffer rather
// than being copied. That buffer is `data` itself when it is 8-byte aligned,
// or the internally owned `aligned_copy_` otherwise, so in the aligned case
// the memory behind `data` must stay valid for as long as this object is used.
//
// Returns Status::OK() on success. Returns Status::Corruption() when the data
// is truncated, misaligned, has a bad magic/version, or is internally
// inconsistent; errors from the bitvector parsers are propagated unchanged.
Status LoudsTrie::InitFromData(const Slice& data) {
  const char* p = data.data();
  size_t remaining = data.size();

  // The trie data contains bitvectors with uint64_t arrays and handle arrays
  // with uint32_t entries, all accessed via reinterpret_cast pointers that
  // require proper alignment. Block buffers from heap/cache allocations are
  // typically aligned, but mmap'd data or other sources may not be. If the
  // data is not 8-byte aligned, copy it into an owned aligned buffer.
  // std::string::data() returns memory from new[]/malloc, which is aligned
  // to at least alignof(max_align_t) (>= 8 on all supported platforms).
  if (reinterpret_cast<uintptr_t>(p) % alignof(uint64_t) != 0) {
    aligned_copy_.assign(p, remaining);
    p = aligned_copy_.data();
  }

  // Header: magic(4) + version(4) + num_keys(8) + cutoff_level(4) +
  // max_depth(4) + dense_leaf_count(8) + dense_node_count(8) +
  // dense_child_count(8) + flags(4) + reserved(4) = 56.
  if (remaining < 56) {
    return Status::Corruption("Trie index: data too short for header");
  }

  uint32_t magic;
  memcpy(&magic, p, 4);
  p += 4;
  remaining -= 4;
  if (magic != kTrieMagic) {
    return Status::Corruption("Trie index: bad magic number");
  }

  uint32_t version;
  memcpy(&version, p, 4);
  p += 4;
  remaining -= 4;
  if (version != kTrieFormatVersion) {
    return Status::Corruption("Trie index: unsupported format version");
  }

  memcpy(&num_keys_, p, 8);
  p += 8;
  remaining -= 8;

  // Validate num_keys_ early, before any arithmetic that multiplies it by
  // sizeof(uint32_t) or sizeof(uint64_t). Without this check, a crafted
  // num_keys_ near SIZE_MAX / 4 causes silent size_t overflow in the handle
  // array and seqno side-table parsing below, leading to buffer overreads.
  // A trie with > 2^30 leaves is unrealistic (would require terabytes of
  // data blocks).
  static constexpr uint64_t kMaxReasonableKeys = uint64_t(1) << 30;
  if (num_keys_ > kMaxReasonableKeys) {
    return Status::Corruption("Trie index: num_keys exceeds reasonable limit");
  }

  memcpy(&cutoff_level_, p, 4);
  p += 4;
  remaining -= 4;
  memcpy(&max_depth_, p, 4);
  p += 4;
  remaining -= 4;

  // Validate max_depth_ from untrusted data. The iterator allocates a
  // key_buf_ of size MaxDepth()+1; if max_depth_ == UINT32_MAX, the +1
  // overflows uint32_t to 0, causing a zero-length allocation and subsequent
  // buffer overflow. A key longer than 64 KB is unrealistic for a block
  // index separator (RocksDB keys are typically < 1 KB).
  static constexpr uint32_t kMaxReasonableDepth = 65536;
  if (max_depth_ > kMaxReasonableDepth) {
    return Status::Corruption("Trie index: max_depth exceeds reasonable limit");
  }

  memcpy(&dense_leaf_count_, p, 8);
  p += 8;
  remaining -= 8;
  memcpy(&dense_node_count_, p, 8);
  p += 8;
  remaining -= 8;
  memcpy(&dense_child_count_, p, 8);
  p += 8;
  remaining -= 8;

  // Read flags field.
  uint32_t flags;
  memcpy(&flags, p, 4);
  p += 4;
  remaining -= 4;
  has_seqno_encoding_ = (flags & kFlagSeqnoEncoding) != 0;
  // Skip 4-byte reserved padding.
  p += 4;
  remaining -= 4;

  // Dense bitvectors.
  size_t consumed;
  Status s;

  s = d_labels_.InitFromData(p, remaining, &consumed);
  if (!s.ok()) {
    return s;
  }
  p += consumed;
  remaining -= consumed;

  s = d_has_child_.InitFromData(p, remaining, &consumed);
  if (!s.ok()) {
    return s;
  }
  p += consumed;
  remaining -= consumed;

  s = d_is_prefix_key_.InitFromData(p, remaining, &consumed);
  if (!s.ok()) {
    return s;
  }
  p += consumed;
  remaining -= consumed;

  // Validate dense section counts against bitvector sizes. These counts
  // were read from the header (untrusted data) and will be used for leaf
  // ordinal computation during traversal. Inconsistent values would cause
  // incorrect rank arithmetic leading to wrong leaf indices -> wrong block
  // handles.
  if (dense_node_count_ > 0) {
    // Each dense node uses 256 bits in d_labels_.
    if (d_labels_.NumBits() != dense_node_count_ * 256) {
      return Status::Corruption(
          "Trie index: d_labels size inconsistent with dense_node_count");
    }
    // d_has_child_ has one bit per set bit in d_labels_ (one per label).
    if (d_has_child_.NumBits() != d_labels_.NumOnes()) {
      return Status::Corruption(
          "Trie index: d_has_child size inconsistent with d_labels");
    }
    // d_is_prefix_key_ has one bit per dense node.
    if (d_is_prefix_key_.NumBits() != dense_node_count_) {
      return Status::Corruption(
          "Trie index: d_is_prefix_key size inconsistent with "
          "dense_node_count");
    }
  }
  if (dense_leaf_count_ > num_keys_) {
    return Status::Corruption("Trie index: dense_leaf_count exceeds num_keys");
  }

  // Sparse section.
  if (remaining < 8) {
    return Status::Corruption("Trie index: truncated sparse labels size");
  }
  memcpy(&s_labels_size_, p, 8);
  p += 8;
  remaining -= 8;

  if (s_labels_size_ > 0) {
    if (remaining < s_labels_size_) {
      return Status::Corruption("Trie index: truncated sparse labels");
    }
    s_labels_data_ = reinterpret_cast<const uint8_t*>(p);
    p += s_labels_size_;
    remaining -= s_labels_size_;
    size_t padding = (8 - (s_labels_size_ % 8)) % 8;
    if (remaining < padding) {
      return Status::Corruption("Trie index: truncated sparse label padding");
    }
    p += padding;
    remaining -= padding;
  }

  s = s_has_child_.InitFromData(p, remaining, &consumed);
  if (!s.ok()) {
    return s;
  }
  p += consumed;
  remaining -= consumed;

  s = s_louds_.InitFromData(p, remaining, &consumed);
  if (!s.ok()) {
    return s;
  }
  p += consumed;
  remaining -= consumed;

  s = s_is_prefix_key_.InitFromData(p, remaining, &consumed);
  if (!s.ok()) {
    return s;
  }
  p += consumed;
  remaining -= consumed;

  // Validate sparse bitvector sizes match s_labels_size_. Each sparse
  // label has exactly one bit in s_has_child_ and one bit in s_louds_.
  // A mismatch means the serialized data is inconsistent, which would cause
  // GetBit/Rank1 OOB reads during traversal.
  if (s_labels_size_ > 0) {
    if (s_has_child_.NumBits() != s_labels_size_) {
      return Status::Corruption(
          "Trie index: s_has_child size inconsistent with s_labels_size");
    }
    if (s_louds_.NumBits() != s_labels_size_) {
      return Status::Corruption(
          "Trie index: s_louds size inconsistent with s_labels_size");
    }
  }

  // Child position lookup tables for Select-free sparse traversal.
  uint64_t num_internal = 0;
  {
    if (remaining < 8) {
      return Status::Corruption(
          "Trie index: truncated child position table count");
    }
    memcpy(&num_internal, p, 8);
    p += 8;
    remaining -= 8;

    // Guard against integer overflow in subsequent size computations.
    // Same limit as kMaxReasonableKeys used for num_keys_ validation.
    static constexpr uint64_t kMaxReasonableInternal = 1ULL << 30;
    if (num_internal > kMaxReasonableInternal) {
      return Status::Corruption(
          "Trie index: num_internal exceeds reasonable limit");
    }

    if (num_internal > 0) {
      // Read s_child_start_pos_
      size_t start_bytes = num_internal * sizeof(uint32_t);
      size_t start_padded = (start_bytes + 7) & ~size_t(7);
      if (remaining < start_padded) {
        return Status::Corruption(
            "Trie index: truncated child start position table");
      }
      s_child_start_pos_.resize(num_internal);
      memcpy(s_child_start_pos_.data(), p, start_bytes);
      p += start_padded;
      remaining -= start_padded;

      // Read s_child_end_pos_
      size_t end_bytes = num_internal * sizeof(uint32_t);
      size_t end_padded = (end_bytes + 7) & ~size_t(7);
      if (remaining < end_padded) {
        return Status::Corruption(
            "Trie index: truncated child end position table");
      }
      s_child_end_pos_.resize(num_internal);
      memcpy(s_child_end_pos_.data(), p, end_bytes);
      p += end_padded;
      remaining -= end_padded;

      // Validate that child position values are within the sparse label array.
      // A crafted input could have positions beyond s_labels_size_, causing
      // OOB reads during traversal when labels are accessed at these positions.
      for (uint64_t k = 0; k < num_internal; k++) {
        if (s_child_start_pos_[k] >= s_labels_size_ ||
            s_child_end_pos_[k] > s_labels_size_ ||
            s_child_end_pos_[k] < s_child_start_pos_[k]) {
          return Status::Corruption(
              "Trie index: child position out of range for sparse labels");
        }
      }
    }
  }

  // Path compression: chain metadata for fanout-1 chains.
  // Format: num_chains (uint64), then if > 0: bitmap + compact arrays.
  {
    if (remaining < 8) {
      return Status::Corruption("Trie index: truncated chain count");
    }
    uint64_t num_chains;
    memcpy(&num_chains, p, 8);
    p += 8;
    remaining -= 8;

    // The serializer emits at most one chain per internal label, so a larger
    // count is corrupt. Bounding num_chains here (num_internal is already
    // capped at 2^30) also keeps the num_chains * sizeof(...) products below
    // from wrapping and prevents resize() from attempting an enormous
    // allocation driven by untrusted data.
    if (num_chains > num_internal) {
      return Status::Corruption("Trie index: num_chains exceeds num_internal");
    }

    if (num_chains > 0) {
      // Read chain bitmap.
      s = s_chain_bitmap_.InitFromData(p, remaining, &consumed);
      if (!s.ok()) {
        return s;
      }
      p += consumed;
      remaining -= consumed;

      // The bitmap has one bit per internal label and exactly one set bit
      // per chain. Presumably lookups rank into the compact arrays via this
      // bitmap, so a mismatch could index past their num_chains entries.
      if (s_chain_bitmap_.NumBits() != num_internal ||
          s_chain_bitmap_.NumOnes() != num_chains) {
        return Status::Corruption(
            "Trie index: chain bitmap inconsistent with chain count");
      }

      // Read compact chain_offsets (uint32_t per chain).
      size_t offsets_bytes = num_chains * sizeof(uint32_t);
      size_t offsets_padded = (offsets_bytes + 7) & ~size_t(7);
      if (remaining < offsets_padded) {
        return Status::Corruption("Trie index: truncated chain offsets");
      }
      s_chain_suffix_offsets_.resize(num_chains);
      memcpy(s_chain_suffix_offsets_.data(), p, offsets_bytes);
      p += offsets_padded;
      remaining -= offsets_padded;

      // Read compact chain_lens (uint16_t per chain).
      size_t lens_bytes = num_chains * sizeof(uint16_t);
      size_t lens_padded = (lens_bytes + 7) & ~size_t(7);
      if (remaining < lens_padded) {
        return Status::Corruption("Trie index: truncated chain lengths");
      }
      s_chain_lens_.resize(num_chains);
      memcpy(s_chain_lens_.data(), p, lens_bytes);
      p += lens_padded;
      remaining -= lens_padded;

      // Read compact chain_end_child_idx (uint32_t per chain).
      size_t end_bytes = num_chains * sizeof(uint32_t);
      size_t end_padded = (end_bytes + 7) & ~size_t(7);
      if (remaining < end_padded) {
        return Status::Corruption("Trie index: truncated chain end indices");
      }
      s_chain_end_child_idx_.resize(num_chains);
      memcpy(s_chain_end_child_idx_.data(), p, end_bytes);
      p += end_padded;
      remaining -= end_padded;

      // Validate chain end child indices. Each value is either
      // UINT32_MAX (chain ends at a leaf) or an index into the child
      // position tables (< num_internal). An out-of-bounds index would
      // cause s_child_start_pos_[idx] OOB reads during chain traversal.
      for (size_t ci = 0; ci < num_chains; ci++) {
        uint32_t idx = s_chain_end_child_idx_[ci];
        if (idx != UINT32_MAX && idx >= num_internal) {
          return Status::Corruption(
              "Trie index: chain end child index out of range");
        }
      }

      // Read suffix data blob.
      if (remaining < 8) {
        return Status::Corruption("Trie index: truncated chain suffix size");
      }
      memcpy(&s_chain_suffix_size_, p, 8);
      p += 8;
      remaining -= 8;

      // Compare the raw size first: for values near UINT64_MAX the "+ 7"
      // rounding below would wrap to a tiny number and slip past the
      // truncation check, leaving a bogus huge blob size for the per-chain
      // range validation further down.
      if (s_chain_suffix_size_ > remaining) {
        return Status::Corruption("Trie index: truncated chain suffix data");
      }
      size_t suffix_padded =
          (static_cast<size_t>(s_chain_suffix_size_) + 7) & ~size_t(7);
      if (remaining < suffix_padded) {
        return Status::Corruption("Trie index: truncated chain suffix data");
      }
      s_chain_suffix_data_ = reinterpret_cast<const uint8_t*>(p);
      p += suffix_padded;
      remaining -= suffix_padded;

      // Validate that each chain's suffix range fits within the suffix blob.
      // A crafted input could have offsets beyond the blob, causing OOB reads
      // during SeekImpl when chain suffixes are compared via memcmp.
      for (size_t ci = 0; ci < num_chains; ci++) {
        uint64_t end = static_cast<uint64_t>(s_chain_suffix_offsets_[ci]) +
                       s_chain_lens_[ci];
        if (end > s_chain_suffix_size_) {
          return Status::Corruption(
              "Trie index: chain suffix offset + length exceeds suffix data");
        }
      }
    }
  }

  // Block handles: packed uint32_t arrays for offsets and sizes.
  if (num_keys_ > 0) {
    size_t arr_bytes = num_keys_ * sizeof(uint32_t);
    size_t arr_padded = (arr_bytes + 7) & ~size_t(7);

    // Read offsets array.
    if (remaining < arr_padded) {
      return Status::Corruption("Trie index: truncated handle offsets");
    }
    if (reinterpret_cast<uintptr_t>(p) % alignof(uint32_t) != 0) {
      return Status::Corruption("Trie index: handle offsets not aligned");
    }
    handle_offsets_ = reinterpret_cast<const uint32_t*>(p);
    p += arr_padded;
    remaining -= arr_padded;

    // Read sizes array.
    if (remaining < arr_padded) {
      return Status::Corruption("Trie index: truncated handle sizes");
    }
    if (reinterpret_cast<uintptr_t>(p) % alignof(uint32_t) != 0) {
      return Status::Corruption("Trie index: handle sizes not aligned");
    }
    handle_sizes_ = reinterpret_cast<const uint32_t*>(p);
    p += arr_padded;
    remaining -= arr_padded;
  }

  // =========================================================================
  // Seqno side-table: deserialized when kFlagSeqnoEncoding is set.
  // See SerializeAll() for the format description.
  // =========================================================================
  if (has_seqno_encoding_ && num_keys_ > 0) {
    // num_keys_ was already validated against kMaxReasonableKeys above.

    // Read num_overflow_blocks (uint32_t) + padding (uint32_t).
    if (remaining < 8) {
      return Status::Corruption(
          "Trie index: truncated seqno side-table header");
    }
    memcpy(&num_overflow_blocks_, p, 4);
    p += 8;  // skip num_overflow + padding
    remaining -= 8;

    // Read leaf_seqnos: uint64_t[num_keys_]
    {
      size_t bytes = num_keys_ * sizeof(uint64_t);
      if (remaining < bytes) {
        return Status::Corruption("Trie index: truncated leaf seqnos");
      }
      if (reinterpret_cast<uintptr_t>(p) % alignof(uint64_t) != 0) {
        return Status::Corruption("Trie index: leaf seqnos not aligned");
      }
      leaf_seqnos_ = reinterpret_cast<const uint64_t*>(p);
      p += bytes;
      remaining -= bytes;
    }

    // Read leaf_block_counts: uint32_t[num_keys_] (padded to 8-byte)
    {
      size_t bytes = num_keys_ * sizeof(uint32_t);
      size_t padded = (bytes + 7) & ~size_t(7);
      if (remaining < padded) {
        return Status::Corruption("Trie index: truncated leaf block counts");
      }
      if (reinterpret_cast<uintptr_t>(p) % alignof(uint32_t) != 0) {
        return Status::Corruption("Trie index: leaf block counts not aligned");
      }
      leaf_block_counts_ = reinterpret_cast<const uint32_t*>(p);
      p += padded;
      remaining -= padded;
    }

    if (num_overflow_blocks_ > 0) {
      // Sanity-check num_overflow_blocks_ to prevent size_t overflow in
      // subsequent arithmetic, same rationale as the num_keys_ check above.
      if (num_overflow_blocks_ > kMaxReasonableKeys) {
        return Status::Corruption(
            "Trie index: num_overflow_blocks exceeds reasonable limit");
      }

      // Read overflow_offsets: uint32_t[num_overflow_blocks_] (padded)
      {
        size_t bytes = num_overflow_blocks_ * sizeof(uint32_t);
        size_t padded = (bytes + 7) & ~size_t(7);
        if (remaining < padded) {
          return Status::Corruption("Trie index: truncated overflow offsets");
        }
        if (reinterpret_cast<uintptr_t>(p) % alignof(uint32_t) != 0) {
          return Status::Corruption("Trie index: overflow offsets not aligned");
        }
        overflow_offsets_ = reinterpret_cast<const uint32_t*>(p);
        p += padded;
        remaining -= padded;
      }

      // Read overflow_sizes: uint32_t[num_overflow_blocks_] (padded)
      {
        size_t bytes = num_overflow_blocks_ * sizeof(uint32_t);
        size_t padded = (bytes + 7) & ~size_t(7);
        if (remaining < padded) {
          return Status::Corruption("Trie index: truncated overflow sizes");
        }
        if (reinterpret_cast<uintptr_t>(p) % alignof(uint32_t) != 0) {
          return Status::Corruption("Trie index: overflow sizes not aligned");
        }
        overflow_sizes_ = reinterpret_cast<const uint32_t*>(p);
        p += padded;
        remaining -= padded;
      }

      // Read overflow_seqnos: uint64_t[num_overflow_blocks_]
      // This is the last section in the side-table, so we intentionally do
      // not advance p/remaining after reading (no more data to parse).
      {
        size_t bytes = num_overflow_blocks_ * sizeof(uint64_t);
        if (remaining < bytes) {
          return Status::Corruption("Trie index: truncated overflow seqnos");
        }
        if (reinterpret_cast<uintptr_t>(p) % alignof(uint64_t) != 0) {
          return Status::Corruption("Trie index: overflow seqnos not aligned");
        }
        overflow_seqnos_ = reinterpret_cast<const uint64_t*>(p);
      }
    }

    // Compute overflow_base_ prefix sum for O(1) access to overflow arrays.
    // overflow_base_[i] = sum of (block_count-1) for leaves [0, i).
    //
    // The sum is accumulated in 64 bits and checked after every step. A
    // 32-bit accumulator could wrap modulo 2^32 on crafted counts and still
    // equal num_overflow_blocks_ at the end, leaving overflow_base_ entries
    // that point past the overflow arrays.
    overflow_base_.resize(num_keys_);
    uint64_t running_sum = 0;
    for (uint64_t i = 0; i < num_keys_; i++) {
      const uint32_t block_count = leaf_block_counts_[i];
      if (block_count == 0) {
        return Status::Corruption(
            "Trie index: leaf block count is zero for leaf " +
            std::to_string(i));
      }
      // running_sum <= num_overflow_blocks_ here, so it fits in uint32_t.
      overflow_base_[i] = static_cast<uint32_t>(running_sum);
      running_sum += block_count - 1;
      if (running_sum > num_overflow_blocks_) {
        return Status::Corruption(
            "Trie index: overflow count mismatch with leaf block counts");
      }
    }
    // Verify the prefix sum matches the declared overflow count.
    if (running_sum != num_overflow_blocks_) {
      return Status::Corruption(
          "Trie index: overflow count mismatch with leaf block counts");
    }

    return Status::OK();
  }

  return Status::OK();
}
|
|
1507
|
+
|
|
1508
|
+
// Returns the block handle (offset, size) stored for leaf `leaf_index`.
// `leaf_index` must be less than num_keys_; this is only enforced by an
// assert, so release builds perform no bounds check.
TrieBlockHandle LoudsTrie::GetHandle(uint64_t leaf_index) const {
  assert(leaf_index < num_keys_);
  const auto block_offset = handle_offsets_[leaf_index];
  const auto block_size = handle_sizes_[leaf_index];
  return TrieBlockHandle{block_offset, block_size};
}
|
|
1513
|
+
|
|
1514
|
+
// ============================================================================
|
|
1515
|
+
// LoudsTrieIterator implementation
|
|
1516
|
+
//
|
|
1517
|
+
// The iterator maintains a stack of positions (path_) from the root to the
|
|
1518
|
+
// current leaf. Each entry records the position in the bitvector/label array
|
|
1519
|
+
// at that depth, enabling backtracking for Next() and key reconstruction.
|
|
1520
|
+
//
|
|
1521
|
+
// Dense positions: pos = node_num * 256 + label_byte. The node_num is the
|
|
1522
|
+
// global node number across all dense levels (concatenated). The label_byte
|
|
1523
|
+
// is pos % 256.
|
|
1524
|
+
//
|
|
1525
|
+
// Sparse positions: pos = index into s_labels_ array.
|
|
1526
|
+
//
|
|
1527
|
+
// Leaf ordering: BFS leaf order matches sorted key order. For each node in
|
|
1528
|
+
// BFS order, first its prefix key leaf (if any), then its child leaves
|
|
1529
|
+
// (has_child = 0) in label order.
|
|
1530
|
+
// ============================================================================
|
|
1531
|
+
|
|
1532
|
+
// Creates an invalid (unpositioned) iterator over `trie`.
//
// The key buffer is allocated once here with MaxDepth() + 1 bytes. MaxDepth()
// is the length of the longest key, so no traversal needs more than that.
LoudsTrieIterator::LoudsTrieIterator(const LoudsTrie* trie)
    : has_chains_(trie->HasChains()),
      trie_(trie),
      valid_(false),
      leaf_index_(0),
      key_len_(0),
      key_cap_(trie->MaxDepth() + 1),
      is_at_prefix_key_(false) {
  key_buf_ = std::make_unique<char[]>(key_cap_);
}
|
|
1546
|
+
|
|
1547
|
+
// --- Dense helpers ---
|
|
1548
|
+
|
|
1549
|
+
bool LoudsTrieIterator::DenseSeekLabel(uint64_t node_num, uint8_t target_byte,
|
|
1550
|
+
uint64_t* out_pos) {
|
|
1551
|
+
uint64_t pos = node_num * 256 + target_byte;
|
|
1552
|
+
if (pos < trie_->d_labels_.NumBits() && trie_->d_labels_.GetBit(pos)) {
|
|
1553
|
+
*out_pos = pos;
|
|
1554
|
+
return true;
|
|
1555
|
+
}
|
|
1556
|
+
uint64_t node_end = (node_num + 1) * 256;
|
|
1557
|
+
uint64_t next = trie_->d_labels_.NextSetBit(pos + 1);
|
|
1558
|
+
if (next < node_end && next < trie_->d_labels_.NumBits()) {
|
|
1559
|
+
*out_pos = next;
|
|
1560
|
+
return false;
|
|
1561
|
+
}
|
|
1562
|
+
*out_pos = node_end;
|
|
1563
|
+
return false;
|
|
1564
|
+
}
|
|
1565
|
+
|
|
1566
|
+
uint64_t LoudsTrieIterator::DenseChildNodeNumFromRank(
|
|
1567
|
+
uint64_t label_rank) const {
|
|
1568
|
+
// In the concatenated dense model, node 0 is the root (no parent).
|
|
1569
|
+
// Each internal child (d_has_child_ bit = 1) creates a new node numbered
|
|
1570
|
+
// 1, 2, 3, ... in BFS order. The child's global node number =
|
|
1571
|
+
// rank1(d_has_child_, label_rank + 1).
|
|
1572
|
+
return trie_->d_has_child_.Rank1(label_rank + 1);
|
|
1573
|
+
}
|
|
1574
|
+
|
|
1575
|
+
uint64_t LoudsTrieIterator::DenseLeafIndexFromRank(uint64_t pos,
|
|
1576
|
+
uint64_t label_rank) const {
|
|
1577
|
+
uint64_t has_child_rank = trie_->d_has_child_.Rank1(label_rank + 1);
|
|
1578
|
+
return DenseLeafIndexFromRankAndHasChildRank(pos, label_rank, has_child_rank);
|
|
1579
|
+
}
|
|
1580
|
+
|
|
1581
|
+
uint64_t LoudsTrieIterator::DenseLeafIndexFromRankAndHasChildRank(
|
|
1582
|
+
uint64_t pos, uint64_t label_rank, uint64_t has_child_rank) const {
|
|
1583
|
+
// Leaf ordinal for a dense leaf at position `pos` in d_labels_.
|
|
1584
|
+
//
|
|
1585
|
+
// BFS leaf order: for each node 0..N, prefix key first, then non-internal
|
|
1586
|
+
// children. So leaf index =
|
|
1587
|
+
// (prefix keys at nodes [0, node_num]) +
|
|
1588
|
+
// (non-internal labels at positions [0, label_rank]) - 1 (0-indexed).
|
|
1589
|
+
//
|
|
1590
|
+
// When there are no prefix keys, the prefix_keys term is zero and we
|
|
1591
|
+
// can skip the d_is_prefix_key_.Rank1 call entirely.
|
|
1592
|
+
uint64_t leaf_labels = (label_rank + 1) - has_child_rank;
|
|
1593
|
+
if (trie_->d_is_prefix_key_.NumOnes() == 0) {
|
|
1594
|
+
return leaf_labels - 1;
|
|
1595
|
+
}
|
|
1596
|
+
uint64_t node_num = pos / 256;
|
|
1597
|
+
uint64_t prefix_keys = trie_->d_is_prefix_key_.Rank1(node_num + 1);
|
|
1598
|
+
return prefix_keys + leaf_labels - 1;
|
|
1599
|
+
}
|
|
1600
|
+
|
|
1601
|
+
uint64_t LoudsTrieIterator::DensePrefixKeyLeafIndex(uint64_t node_num) const {
|
|
1602
|
+
// Leaf ordinal for a prefix key at dense node N. The prefix key comes
|
|
1603
|
+
// after all leaves at nodes [0, N-1] and before leaf labels at node N.
|
|
1604
|
+
// Index = (prefix keys before N) + (non-internal labels before N).
|
|
1605
|
+
uint64_t labels_before_node = trie_->d_labels_.Rank1(node_num * 256);
|
|
1606
|
+
uint64_t internal_before = trie_->d_has_child_.Rank1(labels_before_node);
|
|
1607
|
+
uint64_t leaf_labels_before = labels_before_node - internal_before;
|
|
1608
|
+
uint64_t prefix_keys_before = trie_->d_is_prefix_key_.Rank1(node_num);
|
|
1609
|
+
return prefix_keys_before + leaf_labels_before;
|
|
1610
|
+
}
|
|
1611
|
+
|
|
1612
|
+
// --- Sparse helpers ---
|
|
1613
|
+
|
|
1614
|
+
bool LoudsTrieIterator::SparseSeekLabel(uint64_t node_start_pos,
|
|
1615
|
+
uint64_t node_end_pos,
|
|
1616
|
+
uint8_t target_byte,
|
|
1617
|
+
uint64_t* out_pos) {
|
|
1618
|
+
// Labels within a sparse node are stored in sorted order. Find the first
|
|
1619
|
+
// label >= target_byte.
|
|
1620
|
+
//
|
|
1621
|
+
// Strategy: use linear scan for small nodes (<=16 labels) and binary
|
|
1622
|
+
// search for larger nodes. Profiling shows that the vast majority of
|
|
1623
|
+
// sparse nodes have small fanout (often 1-10 children), where linear
|
|
1624
|
+
// scan is faster than std::lower_bound due to sequential memory access
|
|
1625
|
+
// and predictable branches. The threshold of 16 was chosen because
|
|
1626
|
+
// 16 bytes fits in a single cache line and binary search gains an
|
|
1627
|
+
// advantage only around log2(16)=4 comparisons with ~50% branch
|
|
1628
|
+
// misprediction rate.
|
|
1629
|
+
static constexpr uint64_t kLinearScanThreshold = 16;
|
|
1630
|
+
|
|
1631
|
+
const uint8_t* base = trie_->s_labels_data_;
|
|
1632
|
+
const uint8_t* begin = base + node_start_pos;
|
|
1633
|
+
const uint8_t* end = base + node_end_pos;
|
|
1634
|
+
uint64_t size = node_end_pos - node_start_pos;
|
|
1635
|
+
|
|
1636
|
+
const uint8_t* it;
|
|
1637
|
+
if (size <= kLinearScanThreshold) {
|
|
1638
|
+
// Linear scan: fastest for small nodes.
|
|
1639
|
+
it = begin;
|
|
1640
|
+
while (it < end && *it < target_byte) {
|
|
1641
|
+
++it;
|
|
1642
|
+
}
|
|
1643
|
+
} else {
|
|
1644
|
+
it = std::lower_bound(begin, end, target_byte);
|
|
1645
|
+
}
|
|
1646
|
+
|
|
1647
|
+
if (it == end) {
|
|
1648
|
+
*out_pos = node_end_pos;
|
|
1649
|
+
return false;
|
|
1650
|
+
}
|
|
1651
|
+
*out_pos = static_cast<uint64_t>(it - base);
|
|
1652
|
+
return (*it == target_byte);
|
|
1653
|
+
}
|
|
1654
|
+
|
|
1655
|
+
uint64_t LoudsTrieIterator::SparseChildNodeNum(uint64_t pos) const {
|
|
1656
|
+
// The first dense_child_count_ sparse nodes are children of the last
|
|
1657
|
+
// dense level. Additional sparse internal children add nodes after that.
|
|
1658
|
+
// child_node = dense_child_count_ + rank1(s_has_child_, pos+1) - 1
|
|
1659
|
+
return trie_->dense_child_count_ + trie_->s_has_child_.Rank1(pos + 1) - 1;
|
|
1660
|
+
}
|
|
1661
|
+
|
|
1662
|
+
uint64_t LoudsTrieIterator::SparseLeafIndex(uint64_t pos) const {
|
|
1663
|
+
uint64_t has_child_rank = trie_->s_has_child_.Rank1(pos + 1);
|
|
1664
|
+
return SparseLeafIndexFromHasChildRank(pos, has_child_rank);
|
|
1665
|
+
}
|
|
1666
|
+
|
|
1667
|
+
uint64_t LoudsTrieIterator::SparseLeafIndexFromHasChildRank(
|
|
1668
|
+
uint64_t pos, uint64_t has_child_rank) const {
|
|
1669
|
+
// Leaf ordinal for a sparse leaf at position pos.
|
|
1670
|
+
// = dense_leaf_count + prefix_keys_at_nodes[0..N] +
|
|
1671
|
+
// non-internal_labels[0..pos] - 1
|
|
1672
|
+
//
|
|
1673
|
+
// When there are no prefix keys (the common case), the prefix_keys term
|
|
1674
|
+
// is zero and we can skip both SparseNodeNum (1 Rank1 on s_louds_) and
|
|
1675
|
+
// s_is_prefix_key_.Rank1, leaving only the precomputed has_child rank.
|
|
1676
|
+
uint64_t leaf_labels = (pos + 1) - has_child_rank;
|
|
1677
|
+
|
|
1678
|
+
if (trie_->s_is_prefix_key_.NumOnes() == 0) {
|
|
1679
|
+
return trie_->dense_leaf_count_ + leaf_labels - 1;
|
|
1680
|
+
}
|
|
1681
|
+
uint64_t sparse_node = SparseNodeNum(pos);
|
|
1682
|
+
uint64_t prefix_keys = trie_->s_is_prefix_key_.Rank1(sparse_node + 1);
|
|
1683
|
+
return trie_->dense_leaf_count_ + prefix_keys + leaf_labels - 1;
|
|
1684
|
+
}
|
|
1685
|
+
|
|
1686
|
+
uint64_t LoudsTrieIterator::SparsePrefixKeyLeafIndex(
|
|
1687
|
+
uint64_t sparse_node_num) const {
|
|
1688
|
+
// Leaf ordinal for a sparse prefix key. Same logic as
|
|
1689
|
+
// DensePrefixKeyLeafIndex but offset by dense_leaf_count_.
|
|
1690
|
+
uint64_t start_pos = SparseNodeStartPos(sparse_node_num);
|
|
1691
|
+
uint64_t prefix_keys_before = trie_->s_is_prefix_key_.Rank1(sparse_node_num);
|
|
1692
|
+
uint64_t internal_before = trie_->s_has_child_.Rank1(start_pos);
|
|
1693
|
+
uint64_t leaf_labels_before = start_pos - internal_before;
|
|
1694
|
+
return trie_->dense_leaf_count_ + prefix_keys_before + leaf_labels_before;
|
|
1695
|
+
}
|
|
1696
|
+
|
|
1697
|
+
uint64_t LoudsTrieIterator::SparseNodeNum(uint64_t pos) const {
|
|
1698
|
+
return trie_->s_louds_.Rank1(pos + 1) - 1;
|
|
1699
|
+
}
|
|
1700
|
+
|
|
1701
|
+
uint64_t LoudsTrieIterator::SparseNodeStartPos(uint64_t sparse_node_num) const {
|
|
1702
|
+
if (sparse_node_num == 0) {
|
|
1703
|
+
return 0;
|
|
1704
|
+
}
|
|
1705
|
+
|
|
1706
|
+
// Use the precomputed lookup table for nodes that are children of sparse
|
|
1707
|
+
// internal labels. These are the most common case during traversal.
|
|
1708
|
+
if (sparse_node_num >= trie_->dense_child_count_ &&
|
|
1709
|
+
!trie_->s_child_start_pos_.empty()) {
|
|
1710
|
+
uint64_t internal_idx = sparse_node_num - trie_->dense_child_count_;
|
|
1711
|
+
if (internal_idx < trie_->s_child_start_pos_.size()) {
|
|
1712
|
+
return trie_->s_child_start_pos_[internal_idx];
|
|
1713
|
+
}
|
|
1714
|
+
}
|
|
1715
|
+
|
|
1716
|
+
// Fallback to FindNthOneBit for:
|
|
1717
|
+
// - Root sparse nodes (children of dense nodes)
|
|
1718
|
+
// - Very large tries where lookup table wasn't built
|
|
1719
|
+
return trie_->s_louds_.FindNthOneBit(sparse_node_num);
|
|
1720
|
+
}
|
|
1721
|
+
|
|
1722
|
+
uint64_t LoudsTrieIterator::SparseNodeEndPos(uint64_t start_pos) const {
|
|
1723
|
+
uint64_t next = trie_->s_louds_.NextSetBit(start_pos + 1);
|
|
1724
|
+
if (next >= trie_->s_louds_.NumBits()) {
|
|
1725
|
+
return trie_->s_labels_size_;
|
|
1726
|
+
}
|
|
1727
|
+
return next;
|
|
1728
|
+
}
|
|
1729
|
+
|
|
1730
|
+
// Helper: descend from (in_dense, node_num) to leftmost leaf, pushing
|
|
1731
|
+
// path entries and building key_buf_. Returns true if a leaf was found.
|
|
1732
|
+
bool LoudsTrieIterator::DescendToLeftmostLeaf(bool in_dense,
|
|
1733
|
+
uint64_t node_num) {
|
|
1734
|
+
while (true) {
|
|
1735
|
+
if (in_dense) {
|
|
1736
|
+
// Check prefix key first.
|
|
1737
|
+
if (trie_->d_is_prefix_key_.NumBits() > 0 &&
|
|
1738
|
+
node_num < trie_->d_is_prefix_key_.NumBits() &&
|
|
1739
|
+
trie_->d_is_prefix_key_.GetBit(node_num)) {
|
|
1740
|
+
is_at_prefix_key_ = true;
|
|
1741
|
+
leaf_index_ = DensePrefixKeyLeafIndex(node_num);
|
|
1742
|
+
valid_ = true;
|
|
1743
|
+
return true;
|
|
1744
|
+
}
|
|
1745
|
+
|
|
1746
|
+
uint64_t base = node_num * 256;
|
|
1747
|
+
if (base >= trie_->d_labels_.NumBits()) {
|
|
1748
|
+
valid_ = false;
|
|
1749
|
+
return false;
|
|
1750
|
+
}
|
|
1751
|
+
uint64_t first = trie_->d_labels_.NextSetBit(base);
|
|
1752
|
+
if (first >= base + 256 || first >= trie_->d_labels_.NumBits()) {
|
|
1753
|
+
valid_ = false;
|
|
1754
|
+
return false;
|
|
1755
|
+
}
|
|
1756
|
+
|
|
1757
|
+
path_.push_back(LevelPos::MakeDense(first));
|
|
1758
|
+
AppendKeySlot() = static_cast<char>(first % 256);
|
|
1759
|
+
|
|
1760
|
+
uint64_t label_rank = trie_->d_labels_.Rank1(first + 1) - 1;
|
|
1761
|
+
if (!trie_->d_has_child_.GetBit(label_rank)) {
|
|
1762
|
+
leaf_index_ = DenseLeafIndexFromRank(first, label_rank);
|
|
1763
|
+
valid_ = true;
|
|
1764
|
+
return true;
|
|
1765
|
+
}
|
|
1766
|
+
|
|
1767
|
+
uint64_t child = DenseChildNodeNumFromRank(label_rank);
|
|
1768
|
+
if (child < trie_->dense_node_count_) {
|
|
1769
|
+
node_num = child;
|
|
1770
|
+
in_dense = true;
|
|
1771
|
+
} else {
|
|
1772
|
+
node_num = child - trie_->dense_node_count_;
|
|
1773
|
+
in_dense = false;
|
|
1774
|
+
}
|
|
1775
|
+
} else {
|
|
1776
|
+
// Check prefix key first.
|
|
1777
|
+
if (trie_->s_is_prefix_key_.NumBits() > 0 &&
|
|
1778
|
+
node_num < trie_->s_is_prefix_key_.NumBits() &&
|
|
1779
|
+
trie_->s_is_prefix_key_.GetBit(node_num)) {
|
|
1780
|
+
is_at_prefix_key_ = true;
|
|
1781
|
+
leaf_index_ = SparsePrefixKeyLeafIndex(node_num);
|
|
1782
|
+
valid_ = true;
|
|
1783
|
+
return true;
|
|
1784
|
+
}
|
|
1785
|
+
|
|
1786
|
+
uint64_t start = SparseNodeStartPos(node_num);
|
|
1787
|
+
if (start >= trie_->s_labels_size_) {
|
|
1788
|
+
valid_ = false;
|
|
1789
|
+
return false;
|
|
1790
|
+
}
|
|
1791
|
+
|
|
1792
|
+
path_.push_back(LevelPos::MakeSparse(start));
|
|
1793
|
+
AppendKeySlot() = static_cast<char>(trie_->s_labels_data_[start]);
|
|
1794
|
+
|
|
1795
|
+
if (!trie_->s_has_child_.GetBit(start)) {
|
|
1796
|
+
leaf_index_ = SparseLeafIndex(start);
|
|
1797
|
+
valid_ = true;
|
|
1798
|
+
return true;
|
|
1799
|
+
}
|
|
1800
|
+
|
|
1801
|
+
node_num = SparseChildNodeNum(start);
|
|
1802
|
+
in_dense = false;
|
|
1803
|
+
}
|
|
1804
|
+
}
|
|
1805
|
+
}
|
|
1806
|
+
|
|
1807
|
+
bool LoudsTrieIterator::SeekToFirst() {
|
|
1808
|
+
valid_ = false;
|
|
1809
|
+
leaf_index_ = 0;
|
|
1810
|
+
key_len_ = 0;
|
|
1811
|
+
path_.clear();
|
|
1812
|
+
is_at_prefix_key_ = false;
|
|
1813
|
+
|
|
1814
|
+
if (trie_->NumKeys() == 0) {
|
|
1815
|
+
return false;
|
|
1816
|
+
}
|
|
1817
|
+
|
|
1818
|
+
// Descend directly from root to the leftmost leaf.
|
|
1819
|
+
// Compared to Seek(""), this skips the SeekImpl target-consumption loop
|
|
1820
|
+
// (a no-op for empty target) and the redundant prefix key check that
|
|
1821
|
+
// SeekImpl performs at the root before calling DescendToLeftmostLeaf.
|
|
1822
|
+
// DescendToLeftmostLeaf itself handles prefix keys at every node.
|
|
1823
|
+
bool in_dense = (trie_->cutoff_level_ > 0);
|
|
1824
|
+
return DescendToLeftmostLeaf(in_dense, /*node_num=*/0);
|
|
1825
|
+
}
|
|
1826
|
+
|
|
1827
|
+
// Main Seek implementation.
|
|
1828
|
+
// Uses SuRF-style Select-free traversal for sparse regions: instead of
|
|
1829
|
+
// tracking node_num and calling FindNthOneBit to find node boundaries, we
|
|
1830
|
+
// track (start_pos, end_pos) directly and use only Rank1 + array lookup.
|
|
1831
|
+
template <bool kHasChains>
|
|
1832
|
+
bool LoudsTrieIterator::SeekImpl(const Slice& target) {
|
|
1833
|
+
valid_ = false;
|
|
1834
|
+
leaf_index_ = 0;
|
|
1835
|
+
key_len_ = 0;
|
|
1836
|
+
path_.clear();
|
|
1837
|
+
is_at_prefix_key_ = false;
|
|
1838
|
+
|
|
1839
|
+
if (trie_->NumKeys() == 0) {
|
|
1840
|
+
return false;
|
|
1841
|
+
}
|
|
1842
|
+
|
|
1843
|
+
bool in_dense = (trie_->cutoff_level_ > 0);
|
|
1844
|
+
uint64_t node_num = 0;
|
|
1845
|
+
|
|
1846
|
+
// SuRF-style: For sparse traversal, track (start_pos, end_pos) directly
|
|
1847
|
+
// instead of node_num. This eliminates FindNthOneBit calls entirely.
|
|
1848
|
+
uint64_t sparse_start = 0;
|
|
1849
|
+
uint64_t sparse_end = 0;
|
|
1850
|
+
bool have_sparse_bounds = false;
|
|
1851
|
+
|
|
1852
|
+
assert(target.size() <= UINT32_MAX);
|
|
1853
|
+
for (uint32_t depth = 0; depth < static_cast<uint32_t>(target.size());
|
|
1854
|
+
depth++) {
|
|
1855
|
+
uint8_t target_byte = static_cast<uint8_t>(target[depth]);
|
|
1856
|
+
|
|
1857
|
+
if (in_dense) {
|
|
1858
|
+
uint64_t pos;
|
|
1859
|
+
bool exact = DenseSeekLabel(node_num, target_byte, &pos);
|
|
1860
|
+
uint64_t node_end = (node_num + 1) * 256;
|
|
1861
|
+
|
|
1862
|
+
if (pos >= node_end) {
|
|
1863
|
+
// No label >= target_byte. Backtrack.
|
|
1864
|
+
return Advance();
|
|
1865
|
+
}
|
|
1866
|
+
|
|
1867
|
+
path_.push_back(LevelPos::MakeDense(pos));
|
|
1868
|
+
AppendKeySlot() = static_cast<char>(pos % 256);
|
|
1869
|
+
|
|
1870
|
+
// Cache label_rank: avoids redundant Rank1(d_labels_) in both
|
|
1871
|
+
// has_child check and DenseChildNodeNumFromRank/DenseLeafIndexFromRank.
|
|
1872
|
+
uint64_t label_rank = trie_->d_labels_.Rank1(pos + 1) - 1;
|
|
1873
|
+
bool is_internal = trie_->d_has_child_.GetBit(label_rank);
|
|
1874
|
+
// Compute has_child_rank once, reuse for both leaf index and child
|
|
1875
|
+
// node number. DenseChildNodeNumFromRank(lr) = Rank1(lr + 1) and
|
|
1876
|
+
// DenseLeafIndexFromRankAndHasChildRank also needs Rank1(lr + 1).
|
|
1877
|
+
uint64_t has_child_rank = trie_->d_has_child_.Rank1(label_rank + 1);
|
|
1878
|
+
|
|
1879
|
+
if (!exact) {
|
|
1880
|
+
// Landed on label > target_byte. Go to leftmost leaf in subtree.
|
|
1881
|
+
if (!is_internal) {
|
|
1882
|
+
leaf_index_ = DenseLeafIndexFromRankAndHasChildRank(pos, label_rank,
|
|
1883
|
+
has_child_rank);
|
|
1884
|
+
valid_ = true;
|
|
1885
|
+
return true;
|
|
1886
|
+
}
|
|
1887
|
+
uint64_t child = has_child_rank; // = DenseChildNodeNumFromRank(lr)
|
|
1888
|
+
bool child_dense = (child < trie_->dense_node_count_);
|
|
1889
|
+
uint64_t cn = child_dense ? child : child - trie_->dense_node_count_;
|
|
1890
|
+
return DescendToLeftmostLeaf(child_dense, cn);
|
|
1891
|
+
}
|
|
1892
|
+
|
|
1893
|
+
if (!is_internal) {
|
|
1894
|
+
// This label is a leaf. Check if target is fully consumed.
|
|
1895
|
+
if (depth == static_cast<uint32_t>(target.size()) - 1) {
|
|
1896
|
+
// Exact match: trie key == target.
|
|
1897
|
+
leaf_index_ = DenseLeafIndexFromRankAndHasChildRank(pos, label_rank,
|
|
1898
|
+
has_child_rank);
|
|
1899
|
+
valid_ = true;
|
|
1900
|
+
return true;
|
|
1901
|
+
}
|
|
1902
|
+
// Target has more bytes, so trie key is a proper prefix of target.
|
|
1903
|
+
// Trie key < target. Advance to the NEXT leaf.
|
|
1904
|
+
leaf_index_ = DenseLeafIndexFromRankAndHasChildRank(pos, label_rank,
|
|
1905
|
+
has_child_rank);
|
|
1906
|
+
valid_ = true;
|
|
1907
|
+
return Advance();
|
|
1908
|
+
}
|
|
1909
|
+
|
|
1910
|
+
uint64_t child = has_child_rank; // = DenseChildNodeNumFromRank(lr)
|
|
1911
|
+
if (child < trie_->dense_node_count_) {
|
|
1912
|
+
node_num = child;
|
|
1913
|
+
in_dense = true;
|
|
1914
|
+
} else {
|
|
1915
|
+
node_num = child - trie_->dense_node_count_;
|
|
1916
|
+
in_dense = false;
|
|
1917
|
+
have_sparse_bounds = false; // Will compute on first sparse access
|
|
1918
|
+
}
|
|
1919
|
+
} else {
|
|
1920
|
+
// SuRF-style sparse traversal: Use (start, end) positions directly.
|
|
1921
|
+
// No FindNthOneBit calls - only Rank1 + array lookup.
|
|
1922
|
+
uint64_t start;
|
|
1923
|
+
uint64_t end;
|
|
1924
|
+
if (have_sparse_bounds) {
|
|
1925
|
+
start = sparse_start;
|
|
1926
|
+
end = sparse_end;
|
|
1927
|
+
} else {
|
|
1928
|
+
// First entry into sparse region from dense, or from root.
|
|
1929
|
+
// Need to compute bounds using FindNthOneBit (only once per
|
|
1930
|
+
// dense->sparse).
|
|
1931
|
+
start = SparseNodeStartPos(node_num);
|
|
1932
|
+
end = SparseNodeEndPos(start);
|
|
1933
|
+
}
|
|
1934
|
+
|
|
1935
|
+
// Fast path for fanout-1 nodes: the most common case for tries built
|
|
1936
|
+
// from keys with long common prefixes (e.g., zero-padded numeric keys).
|
|
1937
|
+
// Avoids the SparseSeekLabel function call and reduces branch logic to
|
|
1938
|
+
// a single byte comparison.
|
|
1939
|
+
if (start + 1 == end) {
|
|
1940
|
+
uint8_t label = trie_->s_labels_data_[start];
|
|
1941
|
+
|
|
1942
|
+
if (label == target_byte) {
|
|
1943
|
+
// Exact match (the overwhelmingly common case for fanout-1).
|
|
1944
|
+
path_.push_back(LevelPos::MakeSparse(start));
|
|
1945
|
+
AppendKeySlot() = static_cast<char>(label);
|
|
1946
|
+
|
|
1947
|
+
bool is_internal = trie_->s_has_child_.GetBit(start);
|
|
1948
|
+
uint64_t has_child_rank = trie_->s_has_child_.Rank1(start + 1);
|
|
1949
|
+
if (is_internal) {
|
|
1950
|
+
// Internal node: use rank for child lookup.
|
|
1951
|
+
if (!trie_->s_child_start_pos_.empty()) {
|
|
1952
|
+
uint64_t child_idx = has_child_rank - 1;
|
|
1953
|
+
if (child_idx < trie_->s_child_start_pos_.size()) {
|
|
1954
|
+
// ---- Path compression: chain skip ----
|
|
1955
|
+
// Check if this child starts a fanout-1 chain. If so, compare
|
|
1956
|
+
// the remaining target bytes against the chain suffix with a
|
|
1957
|
+
// single memcmp instead of traversing level by level.
|
|
1958
|
+
// Guarded by if constexpr: when kHasChains=false, the
|
|
1959
|
+
// compiler eliminates this entire block from the generated
|
|
1960
|
+
// code.
|
|
1961
|
+
if constexpr (kHasChains) {
|
|
1962
|
+
if (child_idx < trie_->s_chain_bitmap_.NumBits() &&
|
|
1963
|
+
trie_->s_chain_bitmap_.GetBit(child_idx)) {
|
|
1964
|
+
uint64_t chain_idx =
|
|
1965
|
+
trie_->s_chain_bitmap_.Rank1(child_idx + 1) - 1;
|
|
1966
|
+
uint16_t chain_len = trie_->s_chain_lens_[chain_idx];
|
|
1967
|
+
const uint8_t* suffix =
|
|
1968
|
+
trie_->s_chain_suffix_data_ +
|
|
1969
|
+
trie_->s_chain_suffix_offsets_[chain_idx];
|
|
1970
|
+
uint32_t target_remaining =
|
|
1971
|
+
static_cast<uint32_t>(target.size()) - depth - 1;
|
|
1972
|
+
|
|
1973
|
+
if (target_remaining >= chain_len) {
|
|
1974
|
+
// Target has enough bytes to cover the full chain.
|
|
1975
|
+
const uint8_t* target_bytes =
|
|
1976
|
+
reinterpret_cast<const uint8_t*>(target.data()) +
|
|
1977
|
+
depth + 1;
|
|
1978
|
+
int cmp = memcmp(target_bytes, suffix, chain_len);
|
|
1979
|
+
if (cmp == 0) {
|
|
1980
|
+
// Full chain match! Push all chain nodes onto path_.
|
|
1981
|
+
// Walk the chain using child position tables to get
|
|
1982
|
+
// each node's label position for path reconstruction.
|
|
1983
|
+
uint64_t cur_idx = child_idx;
|
|
1984
|
+
for (uint16_t ci = 0; ci < chain_len; ci++) {
|
|
1985
|
+
uint32_t cs = trie_->s_child_start_pos_[cur_idx];
|
|
1986
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
1987
|
+
AppendKeySlot() =
|
|
1988
|
+
static_cast<char>(trie_->s_labels_data_[cs]);
|
|
1989
|
+
// Move to next chain node (last node handled
|
|
1990
|
+
// below).
|
|
1991
|
+
if (ci + 1 < chain_len) {
|
|
1992
|
+
cur_idx = trie_->s_has_child_.Rank1(cs + 1) - 1;
|
|
1993
|
+
}
|
|
1994
|
+
}
|
|
1995
|
+
// Chain fully matched. Advance depth past the chain.
|
|
1996
|
+
depth += chain_len;
|
|
1997
|
+
|
|
1998
|
+
// Set up for the node after the chain.
|
|
1999
|
+
uint32_t end_child_idx =
|
|
2000
|
+
trie_->s_chain_end_child_idx_[chain_idx];
|
|
2001
|
+
if (end_child_idx == UINT32_MAX) {
|
|
2002
|
+
// Chain ends at a leaf. Check if target is
|
|
2003
|
+
// consumed.
|
|
2004
|
+
uint32_t last_cs = trie_->s_child_start_pos_[cur_idx];
|
|
2005
|
+
uint64_t last_hcr =
|
|
2006
|
+
trie_->s_has_child_.Rank1(last_cs + 1);
|
|
2007
|
+
if (depth ==
|
|
2008
|
+
static_cast<uint32_t>(target.size()) - 1) {
|
|
2009
|
+
leaf_index_ = SparseLeafIndexFromHasChildRank(
|
|
2010
|
+
last_cs, last_hcr);
|
|
2011
|
+
valid_ = true;
|
|
2012
|
+
return true;
|
|
2013
|
+
}
|
|
2014
|
+
// Target has more bytes, trie key < target.
|
|
2015
|
+
// Advance.
|
|
2016
|
+
leaf_index_ = SparseLeafIndexFromHasChildRank(
|
|
2017
|
+
last_cs, last_hcr);
|
|
2018
|
+
valid_ = true;
|
|
2019
|
+
return Advance();
|
|
2020
|
+
}
|
|
2021
|
+
// Chain ends at an internal node with fanout > 1.
|
|
2022
|
+
sparse_start = trie_->s_child_start_pos_[end_child_idx];
|
|
2023
|
+
sparse_end = trie_->s_child_end_pos_[end_child_idx];
|
|
2024
|
+
have_sparse_bounds = true;
|
|
2025
|
+
in_dense = false;
|
|
2026
|
+
continue;
|
|
2027
|
+
}
|
|
2028
|
+
// Chain mismatch. Find the divergence point.
|
|
2029
|
+
uint16_t mismatch_pos = 0;
|
|
2030
|
+
while (mismatch_pos < chain_len &&
|
|
2031
|
+
target_bytes[mismatch_pos] ==
|
|
2032
|
+
suffix[mismatch_pos]) {
|
|
2033
|
+
mismatch_pos++;
|
|
2034
|
+
}
|
|
2035
|
+
// Push path entries up to the mismatch point.
|
|
2036
|
+
uint64_t cur_idx = child_idx;
|
|
2037
|
+
for (uint16_t ci = 0; ci < mismatch_pos; ci++) {
|
|
2038
|
+
uint32_t cs = trie_->s_child_start_pos_[cur_idx];
|
|
2039
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
2040
|
+
AppendKeySlot() =
|
|
2041
|
+
static_cast<char>(trie_->s_labels_data_[cs]);
|
|
2042
|
+
cur_idx = trie_->s_has_child_.Rank1(cs + 1) - 1;
|
|
2043
|
+
}
|
|
2044
|
+
// At the mismatch node: push the node's label and
|
|
2045
|
+
// handle.
|
|
2046
|
+
uint32_t mis_cs = trie_->s_child_start_pos_[cur_idx];
|
|
2047
|
+
path_.push_back(LevelPos::MakeSparse(mis_cs));
|
|
2048
|
+
AppendKeySlot() =
|
|
2049
|
+
static_cast<char>(trie_->s_labels_data_[mis_cs]);
|
|
2050
|
+
|
|
2051
|
+
if (target_bytes[mismatch_pos] < suffix[mismatch_pos]) {
|
|
2052
|
+
// Target < chain. Chain's path leads to the first key
|
|
2053
|
+
// >= target. Descend to leftmost leaf from here.
|
|
2054
|
+
if (!trie_->s_has_child_.GetBit(mis_cs)) {
|
|
2055
|
+
leaf_index_ = SparseLeafIndex(mis_cs);
|
|
2056
|
+
valid_ = true;
|
|
2057
|
+
return true;
|
|
2058
|
+
}
|
|
2059
|
+
return DescendToLeftmostLeaf(
|
|
2060
|
+
false, SparseChildNodeNum(mis_cs));
|
|
2061
|
+
}
|
|
2062
|
+
// target_bytes[mismatch_pos] > suffix[mismatch_pos]:
|
|
2063
|
+
// All keys through this chain node are < target.
|
|
2064
|
+
// Advance.
|
|
2065
|
+
return Advance();
|
|
2066
|
+
}
|
|
2067
|
+
// Target runs out before chain ends.
|
|
2068
|
+
// Check if target's remaining bytes match the chain
|
|
2069
|
+
// prefix.
|
|
2070
|
+
if (target_remaining > 0) {
|
|
2071
|
+
const uint8_t* target_bytes =
|
|
2072
|
+
reinterpret_cast<const uint8_t*>(target.data()) +
|
|
2073
|
+
depth + 1;
|
|
2074
|
+
int cmp = memcmp(target_bytes, suffix, target_remaining);
|
|
2075
|
+
if (cmp < 0) {
|
|
2076
|
+
// Target < chain prefix. Push matched portion.
|
|
2077
|
+
uint64_t cur_idx = child_idx;
|
|
2078
|
+
uint32_t cs = trie_->s_child_start_pos_[cur_idx];
|
|
2079
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
2080
|
+
AppendKeySlot() =
|
|
2081
|
+
static_cast<char>(trie_->s_labels_data_[cs]);
|
|
2082
|
+
// Descend to leftmost leaf from this first chain
|
|
2083
|
+
// node.
|
|
2084
|
+
if (!trie_->s_has_child_.GetBit(cs)) {
|
|
2085
|
+
leaf_index_ = SparseLeafIndex(cs);
|
|
2086
|
+
valid_ = true;
|
|
2087
|
+
return true;
|
|
2088
|
+
}
|
|
2089
|
+
return DescendToLeftmostLeaf(false,
|
|
2090
|
+
SparseChildNodeNum(cs));
|
|
2091
|
+
}
|
|
2092
|
+
if (cmp > 0) {
|
|
2093
|
+
// Target > chain prefix at some point. We need to
|
|
2094
|
+
// find the exact divergence point.
|
|
2095
|
+
uint16_t mismatch_pos = 0;
|
|
2096
|
+
while (mismatch_pos < target_remaining &&
|
|
2097
|
+
target_bytes[mismatch_pos] ==
|
|
2098
|
+
suffix[mismatch_pos]) {
|
|
2099
|
+
mismatch_pos++;
|
|
2100
|
+
}
|
|
2101
|
+
// Push path entries up to the mismatch.
|
|
2102
|
+
uint64_t cur_idx = child_idx;
|
|
2103
|
+
for (uint16_t ci = 0; ci < mismatch_pos; ci++) {
|
|
2104
|
+
uint32_t cs2 = trie_->s_child_start_pos_[cur_idx];
|
|
2105
|
+
path_.push_back(LevelPos::MakeSparse(cs2));
|
|
2106
|
+
AppendKeySlot() =
|
|
2107
|
+
static_cast<char>(trie_->s_labels_data_[cs2]);
|
|
2108
|
+
cur_idx = trie_->s_has_child_.Rank1(cs2 + 1) - 1;
|
|
2109
|
+
}
|
|
2110
|
+
uint32_t mis_cs2 = trie_->s_child_start_pos_[cur_idx];
|
|
2111
|
+
path_.push_back(LevelPos::MakeSparse(mis_cs2));
|
|
2112
|
+
AppendKeySlot() =
|
|
2113
|
+
static_cast<char>(trie_->s_labels_data_[mis_cs2]);
|
|
2114
|
+
return Advance();
|
|
2115
|
+
}
|
|
2116
|
+
// cmp == 0: target matches chain prefix exactly. Target
|
|
2117
|
+
// is fully consumed. Push matched nodes and check
|
|
2118
|
+
// prefix key / descend to leftmost leaf.
|
|
2119
|
+
uint64_t cur_idx = child_idx;
|
|
2120
|
+
for (uint32_t ci = 0; ci < target_remaining; ci++) {
|
|
2121
|
+
uint32_t cs = trie_->s_child_start_pos_[cur_idx];
|
|
2122
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
2123
|
+
AppendKeySlot() =
|
|
2124
|
+
static_cast<char>(trie_->s_labels_data_[cs]);
|
|
2125
|
+
if (ci + 1 < target_remaining) {
|
|
2126
|
+
cur_idx = trie_->s_has_child_.Rank1(cs + 1) - 1;
|
|
2127
|
+
}
|
|
2128
|
+
}
|
|
2129
|
+
// Target consumed mid-chain. The remaining chain nodes
|
|
2130
|
+
// form keys > target. Descend to leftmost leaf from the
|
|
2131
|
+
// next chain node.
|
|
2132
|
+
uint32_t last_cs = trie_->s_child_start_pos_[cur_idx];
|
|
2133
|
+
// This node is always internal (it's mid-chain).
|
|
2134
|
+
return DescendToLeftmostLeaf(false,
|
|
2135
|
+
SparseChildNodeNum(last_cs));
|
|
2136
|
+
}
|
|
2137
|
+
// target_remaining == 0: target fully consumed.
|
|
2138
|
+
// This means the target ended exactly at the parent node.
|
|
2139
|
+
// The chain nodes are all > target. Push first chain node
|
|
2140
|
+
// and descend to leftmost leaf.
|
|
2141
|
+
uint32_t cs = trie_->s_child_start_pos_[child_idx];
|
|
2142
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
2143
|
+
AppendKeySlot() =
|
|
2144
|
+
static_cast<char>(trie_->s_labels_data_[cs]);
|
|
2145
|
+
if (!trie_->s_has_child_.GetBit(cs)) {
|
|
2146
|
+
leaf_index_ = SparseLeafIndex(cs);
|
|
2147
|
+
valid_ = true;
|
|
2148
|
+
return true;
|
|
2149
|
+
}
|
|
2150
|
+
return DescendToLeftmostLeaf(false, SparseChildNodeNum(cs));
|
|
2151
|
+
}
|
|
2152
|
+
} // if constexpr (kHasChains)
|
|
2153
|
+
// No chain — normal child lookup.
|
|
2154
|
+
sparse_start = trie_->s_child_start_pos_[child_idx];
|
|
2155
|
+
sparse_end = trie_->s_child_end_pos_[child_idx];
|
|
2156
|
+
have_sparse_bounds = true;
|
|
2157
|
+
} else {
|
|
2158
|
+
node_num = SparseChildNodeNum(start);
|
|
2159
|
+
have_sparse_bounds = false;
|
|
2160
|
+
}
|
|
2161
|
+
} else {
|
|
2162
|
+
node_num = SparseChildNodeNum(start);
|
|
2163
|
+
have_sparse_bounds = false;
|
|
2164
|
+
}
|
|
2165
|
+
in_dense = false;
|
|
2166
|
+
continue;
|
|
2167
|
+
}
|
|
2168
|
+
// Leaf node.
|
|
2169
|
+
if (depth == static_cast<uint32_t>(target.size()) - 1) {
|
|
2170
|
+
leaf_index_ =
|
|
2171
|
+
SparseLeafIndexFromHasChildRank(start, has_child_rank);
|
|
2172
|
+
valid_ = true;
|
|
2173
|
+
return true;
|
|
2174
|
+
}
|
|
2175
|
+
leaf_index_ = SparseLeafIndexFromHasChildRank(start, has_child_rank);
|
|
2176
|
+
valid_ = true;
|
|
2177
|
+
return Advance();
|
|
2178
|
+
}
|
|
2179
|
+
|
|
2180
|
+
// label != target_byte. Still need to push path for backtracking.
|
|
2181
|
+
path_.push_back(LevelPos::MakeSparse(start));
|
|
2182
|
+
AppendKeySlot() = static_cast<char>(label);
|
|
2183
|
+
|
|
2184
|
+
if (label > target_byte) {
|
|
2185
|
+
// Label is greater: go to leftmost leaf in subtree.
|
|
2186
|
+
bool is_internal = trie_->s_has_child_.GetBit(start);
|
|
2187
|
+
uint64_t has_child_rank = trie_->s_has_child_.Rank1(start + 1);
|
|
2188
|
+
if (!is_internal) {
|
|
2189
|
+
leaf_index_ =
|
|
2190
|
+
SparseLeafIndexFromHasChildRank(start, has_child_rank);
|
|
2191
|
+
valid_ = true;
|
|
2192
|
+
return true;
|
|
2193
|
+
}
|
|
2194
|
+
return DescendToLeftmostLeaf(false, SparseChildNodeNum(start));
|
|
2195
|
+
}
|
|
2196
|
+
// label < target_byte: no label >= target in this node. Backtrack.
|
|
2197
|
+
return Advance();
|
|
2198
|
+
}
|
|
2199
|
+
|
|
2200
|
+
// General path for nodes with fanout > 1.
|
|
2201
|
+
uint64_t pos;
|
|
2202
|
+
bool exact = SparseSeekLabel(start, end, target_byte, &pos);
|
|
2203
|
+
|
|
2204
|
+
if (pos >= end) {
|
|
2205
|
+
return Advance();
|
|
2206
|
+
}
|
|
2207
|
+
|
|
2208
|
+
path_.push_back(LevelPos::MakeSparse(pos));
|
|
2209
|
+
AppendKeySlot() = static_cast<char>(trie_->s_labels_data_[pos]);
|
|
2210
|
+
|
|
2211
|
+
bool is_internal = trie_->s_has_child_.GetBit(pos);
|
|
2212
|
+
uint64_t has_child_rank = trie_->s_has_child_.Rank1(pos + 1);
|
|
2213
|
+
|
|
2214
|
+
if (!exact) {
|
|
2215
|
+
if (!is_internal) {
|
|
2216
|
+
leaf_index_ = SparseLeafIndexFromHasChildRank(pos, has_child_rank);
|
|
2217
|
+
valid_ = true;
|
|
2218
|
+
return true;
|
|
2219
|
+
}
|
|
2220
|
+
return DescendToLeftmostLeaf(false, SparseChildNodeNum(pos));
|
|
2221
|
+
}
|
|
2222
|
+
|
|
2223
|
+
if (!is_internal) {
|
|
2224
|
+
// Check if target is fully consumed.
|
|
2225
|
+
if (depth == static_cast<uint32_t>(target.size()) - 1) {
|
|
2226
|
+
leaf_index_ = SparseLeafIndexFromHasChildRank(pos, has_child_rank);
|
|
2227
|
+
valid_ = true;
|
|
2228
|
+
return true;
|
|
2229
|
+
}
|
|
2230
|
+
// Target has more bytes. Trie key < target. Advance.
|
|
2231
|
+
leaf_index_ = SparseLeafIndexFromHasChildRank(pos, has_child_rank);
|
|
2232
|
+
valid_ = true;
|
|
2233
|
+
return Advance();
|
|
2234
|
+
}
|
|
2235
|
+
|
|
2236
|
+
// Descend to child: Get child bounds using Rank1 + array lookup.
|
|
2237
|
+
// This is the key SuRF optimization - NO FindNthOneBit here!
|
|
2238
|
+
// Reuse the already-computed has_child_rank.
|
|
2239
|
+
{
|
|
2240
|
+
uint64_t child_idx = has_child_rank - 1;
|
|
2241
|
+
if (!trie_->s_child_start_pos_.empty() &&
|
|
2242
|
+
child_idx < trie_->s_child_start_pos_.size()) {
|
|
2243
|
+
// ---- Path compression: chain skip (general path) ----
|
|
2244
|
+
if constexpr (kHasChains) {
|
|
2245
|
+
if (child_idx < trie_->s_chain_bitmap_.NumBits() &&
|
|
2246
|
+
trie_->s_chain_bitmap_.GetBit(child_idx)) {
|
|
2247
|
+
uint64_t chain_idx =
|
|
2248
|
+
trie_->s_chain_bitmap_.Rank1(child_idx + 1) - 1;
|
|
2249
|
+
uint16_t chain_len = trie_->s_chain_lens_[chain_idx];
|
|
2250
|
+
const uint8_t* suffix = trie_->s_chain_suffix_data_ +
|
|
2251
|
+
trie_->s_chain_suffix_offsets_[chain_idx];
|
|
2252
|
+
uint32_t target_remaining =
|
|
2253
|
+
static_cast<uint32_t>(target.size()) - depth - 1;
|
|
2254
|
+
|
|
2255
|
+
if (target_remaining >= chain_len) {
|
|
2256
|
+
const uint8_t* target_bytes =
|
|
2257
|
+
reinterpret_cast<const uint8_t*>(target.data()) + depth + 1;
|
|
2258
|
+
int cmp = memcmp(target_bytes, suffix, chain_len);
|
|
2259
|
+
if (cmp == 0) {
|
|
2260
|
+
// Full chain match! Push all chain nodes.
|
|
2261
|
+
uint64_t cur_idx = child_idx;
|
|
2262
|
+
for (uint16_t ci = 0; ci < chain_len; ci++) {
|
|
2263
|
+
uint32_t cs = trie_->s_child_start_pos_[cur_idx];
|
|
2264
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
2265
|
+
AppendKeySlot() =
|
|
2266
|
+
static_cast<char>(trie_->s_labels_data_[cs]);
|
|
2267
|
+
if (ci + 1 < chain_len) {
|
|
2268
|
+
cur_idx = trie_->s_has_child_.Rank1(cs + 1) - 1;
|
|
2269
|
+
}
|
|
2270
|
+
}
|
|
2271
|
+
depth += chain_len;
|
|
2272
|
+
|
|
2273
|
+
uint32_t end_child_idx =
|
|
2274
|
+
trie_->s_chain_end_child_idx_[chain_idx];
|
|
2275
|
+
if (end_child_idx == UINT32_MAX) {
|
|
2276
|
+
uint32_t last_cs = trie_->s_child_start_pos_[cur_idx];
|
|
2277
|
+
uint64_t last_hcr = trie_->s_has_child_.Rank1(last_cs + 1);
|
|
2278
|
+
if (depth == static_cast<uint32_t>(target.size()) - 1) {
|
|
2279
|
+
leaf_index_ =
|
|
2280
|
+
SparseLeafIndexFromHasChildRank(last_cs, last_hcr);
|
|
2281
|
+
valid_ = true;
|
|
2282
|
+
return true;
|
|
2283
|
+
}
|
|
2284
|
+
leaf_index_ =
|
|
2285
|
+
SparseLeafIndexFromHasChildRank(last_cs, last_hcr);
|
|
2286
|
+
valid_ = true;
|
|
2287
|
+
return Advance();
|
|
2288
|
+
}
|
|
2289
|
+
sparse_start = trie_->s_child_start_pos_[end_child_idx];
|
|
2290
|
+
sparse_end = trie_->s_child_end_pos_[end_child_idx];
|
|
2291
|
+
have_sparse_bounds = true;
|
|
2292
|
+
in_dense = false;
|
|
2293
|
+
continue;
|
|
2294
|
+
}
|
|
2295
|
+
// Mismatch: find divergence point.
|
|
2296
|
+
uint16_t mismatch_pos = 0;
|
|
2297
|
+
while (mismatch_pos < chain_len &&
|
|
2298
|
+
target_bytes[mismatch_pos] == suffix[mismatch_pos]) {
|
|
2299
|
+
mismatch_pos++;
|
|
2300
|
+
}
|
|
2301
|
+
uint64_t cur_idx = child_idx;
|
|
2302
|
+
for (uint16_t ci = 0; ci < mismatch_pos; ci++) {
|
|
2303
|
+
uint32_t cs = trie_->s_child_start_pos_[cur_idx];
|
|
2304
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
2305
|
+
AppendKeySlot() =
|
|
2306
|
+
static_cast<char>(trie_->s_labels_data_[cs]);
|
|
2307
|
+
cur_idx = trie_->s_has_child_.Rank1(cs + 1) - 1;
|
|
2308
|
+
}
|
|
2309
|
+
uint32_t mis_cs = trie_->s_child_start_pos_[cur_idx];
|
|
2310
|
+
path_.push_back(LevelPos::MakeSparse(mis_cs));
|
|
2311
|
+
AppendKeySlot() =
|
|
2312
|
+
static_cast<char>(trie_->s_labels_data_[mis_cs]);
|
|
2313
|
+
|
|
2314
|
+
if (target_bytes[mismatch_pos] < suffix[mismatch_pos]) {
|
|
2315
|
+
if (!trie_->s_has_child_.GetBit(mis_cs)) {
|
|
2316
|
+
leaf_index_ = SparseLeafIndex(mis_cs);
|
|
2317
|
+
valid_ = true;
|
|
2318
|
+
return true;
|
|
2319
|
+
}
|
|
2320
|
+
return DescendToLeftmostLeaf(false,
|
|
2321
|
+
SparseChildNodeNum(mis_cs));
|
|
2322
|
+
}
|
|
2323
|
+
return Advance();
|
|
2324
|
+
}
|
|
2325
|
+
// Target runs out before chain ends.
|
|
2326
|
+
if (target_remaining > 0) {
|
|
2327
|
+
const uint8_t* target_bytes =
|
|
2328
|
+
reinterpret_cast<const uint8_t*>(target.data()) + depth + 1;
|
|
2329
|
+
int cmp = memcmp(target_bytes, suffix, target_remaining);
|
|
2330
|
+
if (cmp < 0) {
|
|
2331
|
+
uint32_t cs = trie_->s_child_start_pos_[child_idx];
|
|
2332
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
2333
|
+
AppendKeySlot() =
|
|
2334
|
+
static_cast<char>(trie_->s_labels_data_[cs]);
|
|
2335
|
+
if (!trie_->s_has_child_.GetBit(cs)) {
|
|
2336
|
+
leaf_index_ = SparseLeafIndex(cs);
|
|
2337
|
+
valid_ = true;
|
|
2338
|
+
return true;
|
|
2339
|
+
}
|
|
2340
|
+
return DescendToLeftmostLeaf(false, SparseChildNodeNum(cs));
|
|
2341
|
+
}
|
|
2342
|
+
if (cmp > 0) {
|
|
2343
|
+
uint16_t mp = 0;
|
|
2344
|
+
while (mp < target_remaining &&
|
|
2345
|
+
target_bytes[mp] == suffix[mp]) {
|
|
2346
|
+
mp++;
|
|
2347
|
+
}
|
|
2348
|
+
uint64_t cur_idx = child_idx;
|
|
2349
|
+
for (uint16_t ci = 0; ci < mp; ci++) {
|
|
2350
|
+
uint32_t cs2 = trie_->s_child_start_pos_[cur_idx];
|
|
2351
|
+
path_.push_back(LevelPos::MakeSparse(cs2));
|
|
2352
|
+
AppendKeySlot() =
|
|
2353
|
+
static_cast<char>(trie_->s_labels_data_[cs2]);
|
|
2354
|
+
cur_idx = trie_->s_has_child_.Rank1(cs2 + 1) - 1;
|
|
2355
|
+
}
|
|
2356
|
+
uint32_t mis_cs2 = trie_->s_child_start_pos_[cur_idx];
|
|
2357
|
+
path_.push_back(LevelPos::MakeSparse(mis_cs2));
|
|
2358
|
+
AppendKeySlot() =
|
|
2359
|
+
static_cast<char>(trie_->s_labels_data_[mis_cs2]);
|
|
2360
|
+
return Advance();
|
|
2361
|
+
}
|
|
2362
|
+
// Prefix match: target consumed mid-chain.
|
|
2363
|
+
uint64_t cur_idx = child_idx;
|
|
2364
|
+
for (uint32_t ci = 0; ci < target_remaining; ci++) {
|
|
2365
|
+
uint32_t cs = trie_->s_child_start_pos_[cur_idx];
|
|
2366
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
2367
|
+
AppendKeySlot() =
|
|
2368
|
+
static_cast<char>(trie_->s_labels_data_[cs]);
|
|
2369
|
+
if (ci + 1 < target_remaining) {
|
|
2370
|
+
cur_idx = trie_->s_has_child_.Rank1(cs + 1) - 1;
|
|
2371
|
+
}
|
|
2372
|
+
}
|
|
2373
|
+
uint32_t last_cs = trie_->s_child_start_pos_[cur_idx];
|
|
2374
|
+
return DescendToLeftmostLeaf(false,
|
|
2375
|
+
SparseChildNodeNum(last_cs));
|
|
2376
|
+
}
|
|
2377
|
+
// target_remaining == 0: first chain node is > target.
|
|
2378
|
+
uint32_t cs = trie_->s_child_start_pos_[child_idx];
|
|
2379
|
+
path_.push_back(LevelPos::MakeSparse(cs));
|
|
2380
|
+
AppendKeySlot() = static_cast<char>(trie_->s_labels_data_[cs]);
|
|
2381
|
+
if (!trie_->s_has_child_.GetBit(cs)) {
|
|
2382
|
+
leaf_index_ = SparseLeafIndex(cs);
|
|
2383
|
+
valid_ = true;
|
|
2384
|
+
return true;
|
|
2385
|
+
}
|
|
2386
|
+
return DescendToLeftmostLeaf(false, SparseChildNodeNum(cs));
|
|
2387
|
+
}
|
|
2388
|
+
} // if constexpr (kHasChains)
|
|
2389
|
+
// No chain — normal child lookup.
|
|
2390
|
+
sparse_start = trie_->s_child_start_pos_[child_idx];
|
|
2391
|
+
sparse_end = trie_->s_child_end_pos_[child_idx];
|
|
2392
|
+
have_sparse_bounds = true;
|
|
2393
|
+
} else {
|
|
2394
|
+
node_num = SparseChildNodeNum(pos);
|
|
2395
|
+
have_sparse_bounds = false;
|
|
2396
|
+
}
|
|
2397
|
+
}
|
|
2398
|
+
in_dense = false;
|
|
2399
|
+
}
|
|
2400
|
+
}
|
|
2401
|
+
|
|
2402
|
+
// Target key fully consumed. Check if current node is a prefix key.
|
|
2403
|
+
if (in_dense) {
|
|
2404
|
+
if (trie_->d_is_prefix_key_.NumBits() > 0 &&
|
|
2405
|
+
node_num < trie_->d_is_prefix_key_.NumBits() &&
|
|
2406
|
+
trie_->d_is_prefix_key_.GetBit(node_num)) {
|
|
2407
|
+
is_at_prefix_key_ = true;
|
|
2408
|
+
leaf_index_ = DensePrefixKeyLeafIndex(node_num);
|
|
2409
|
+
valid_ = true;
|
|
2410
|
+
return true;
|
|
2411
|
+
}
|
|
2412
|
+
} else {
|
|
2413
|
+
// For prefix key check, we need node_num. Compute it if we were tracking
|
|
2414
|
+
// positions directly.
|
|
2415
|
+
if (have_sparse_bounds) {
|
|
2416
|
+
// Compute node_num from sparse_start position.
|
|
2417
|
+
// node_num = number of nodes before this position = Rank1(s_louds, pos)
|
|
2418
|
+
// But sparse_start IS the start of the node, so:
|
|
2419
|
+
// node_num = Rank1(s_louds_, sparse_start + 1) - 1
|
|
2420
|
+
node_num = trie_->s_louds_.Rank1(sparse_start + 1) - 1;
|
|
2421
|
+
}
|
|
2422
|
+
if (trie_->s_is_prefix_key_.NumBits() > 0 &&
|
|
2423
|
+
node_num < trie_->s_is_prefix_key_.NumBits() &&
|
|
2424
|
+
trie_->s_is_prefix_key_.GetBit(node_num)) {
|
|
2425
|
+
is_at_prefix_key_ = true;
|
|
2426
|
+
leaf_index_ = SparsePrefixKeyLeafIndex(node_num);
|
|
2427
|
+
valid_ = true;
|
|
2428
|
+
return true;
|
|
2429
|
+
}
|
|
2430
|
+
}
|
|
2431
|
+
|
|
2432
|
+
// Descend to leftmost leaf.
|
|
2433
|
+
return DescendToLeftmostLeaf(in_dense, node_num);
|
|
2434
|
+
}
|
|
2435
|
+
|
|
2436
|
+
bool LoudsTrieIterator::Next() {
|
|
2437
|
+
if (!valid_) {
|
|
2438
|
+
return false;
|
|
2439
|
+
}
|
|
2440
|
+
|
|
2441
|
+
if (is_at_prefix_key_) {
|
|
2442
|
+
is_at_prefix_key_ = false;
|
|
2443
|
+
// The prefix key is at a node that also has children. The next leaf
|
|
2444
|
+
// is the leftmost child leaf.
|
|
2445
|
+
bool in_dense;
|
|
2446
|
+
uint64_t node_num;
|
|
2447
|
+
if (path_.empty()) {
|
|
2448
|
+
// Root is the prefix key.
|
|
2449
|
+
in_dense = (trie_->cutoff_level_ > 0);
|
|
2450
|
+
node_num = 0;
|
|
2451
|
+
} else {
|
|
2452
|
+
// The last path entry is the label that leads TO this prefix key node.
|
|
2453
|
+
// The prefix key node IS the child of that label.
|
|
2454
|
+
auto last = path_.back();
|
|
2455
|
+
if (last.is_dense()) {
|
|
2456
|
+
// Compute label_rank once and reuse for child node lookup.
|
|
2457
|
+
uint64_t lr = trie_->d_labels_.Rank1(last.pos() + 1) - 1;
|
|
2458
|
+
uint64_t child = DenseChildNodeNumFromRank(lr);
|
|
2459
|
+
in_dense = (child < trie_->dense_node_count_);
|
|
2460
|
+
node_num = in_dense ? child : child - trie_->dense_node_count_;
|
|
2461
|
+
} else {
|
|
2462
|
+
node_num = SparseChildNodeNum(last.pos());
|
|
2463
|
+
in_dense = false;
|
|
2464
|
+
}
|
|
2465
|
+
}
|
|
2466
|
+
|
|
2467
|
+
// Find leftmost child leaf (NOT checking prefix key again, since we
|
|
2468
|
+
// just came from it).
|
|
2469
|
+
if (in_dense) {
|
|
2470
|
+
uint64_t base = node_num * 256;
|
|
2471
|
+
if (base >= trie_->d_labels_.NumBits()) {
|
|
2472
|
+
return Advance();
|
|
2473
|
+
}
|
|
2474
|
+
uint64_t first = trie_->d_labels_.NextSetBit(base);
|
|
2475
|
+
if (first >= base + 256 || first >= trie_->d_labels_.NumBits()) {
|
|
2476
|
+
return Advance();
|
|
2477
|
+
}
|
|
2478
|
+
path_.push_back(LevelPos::MakeDense(first));
|
|
2479
|
+
AppendKeySlot() = static_cast<char>(first % 256);
|
|
2480
|
+
|
|
2481
|
+
uint64_t label_rank = trie_->d_labels_.Rank1(first + 1) - 1;
|
|
2482
|
+
if (!trie_->d_has_child_.GetBit(label_rank)) {
|
|
2483
|
+
leaf_index_ = DenseLeafIndexFromRank(first, label_rank);
|
|
2484
|
+
valid_ = true;
|
|
2485
|
+
return true;
|
|
2486
|
+
}
|
|
2487
|
+
uint64_t child = DenseChildNodeNumFromRank(label_rank);
|
|
2488
|
+
bool cd = (child < trie_->dense_node_count_);
|
|
2489
|
+
return DescendToLeftmostLeaf(
|
|
2490
|
+
cd, cd ? child : child - trie_->dense_node_count_);
|
|
2491
|
+
} else {
|
|
2492
|
+
uint64_t start = SparseNodeStartPos(node_num);
|
|
2493
|
+
if (start >= trie_->s_labels_size_) {
|
|
2494
|
+
return Advance();
|
|
2495
|
+
}
|
|
2496
|
+
path_.push_back(LevelPos::MakeSparse(start));
|
|
2497
|
+
AppendKeySlot() = static_cast<char>(trie_->s_labels_data_[start]);
|
|
2498
|
+
|
|
2499
|
+
if (!trie_->s_has_child_.GetBit(start)) {
|
|
2500
|
+
leaf_index_ = SparseLeafIndex(start);
|
|
2501
|
+
valid_ = true;
|
|
2502
|
+
return true;
|
|
2503
|
+
}
|
|
2504
|
+
return DescendToLeftmostLeaf(false, SparseChildNodeNum(start));
|
|
2505
|
+
}
|
|
2506
|
+
}
|
|
2507
|
+
|
|
2508
|
+
return Advance();
|
|
2509
|
+
}
|
|
2510
|
+
|
|
2511
|
+
bool LoudsTrieIterator::Advance() {
|
|
2512
|
+
// Backtrack up the path to find the next sibling, then descend to the
|
|
2513
|
+
// leftmost leaf in that subtree.
|
|
2514
|
+
while (!path_.empty()) {
|
|
2515
|
+
auto cur = path_.back();
|
|
2516
|
+
path_.pop_back();
|
|
2517
|
+
if (key_len_ > 0) {
|
|
2518
|
+
key_len_--;
|
|
2519
|
+
}
|
|
2520
|
+
|
|
2521
|
+
if (cur.is_dense()) {
|
|
2522
|
+
uint64_t cur_pos = cur.pos();
|
|
2523
|
+
uint64_t node_num = cur_pos / 256;
|
|
2524
|
+
uint64_t node_end = (node_num + 1) * 256;
|
|
2525
|
+
uint64_t next = trie_->d_labels_.NextSetBit(cur_pos + 1);
|
|
2526
|
+
|
|
2527
|
+
if (next < node_end && next < trie_->d_labels_.NumBits()) {
|
|
2528
|
+
path_.push_back(LevelPos::MakeDense(next));
|
|
2529
|
+
AppendKeySlot() = static_cast<char>(next % 256);
|
|
2530
|
+
|
|
2531
|
+
uint64_t label_rank = trie_->d_labels_.Rank1(next + 1) - 1;
|
|
2532
|
+
if (!trie_->d_has_child_.GetBit(label_rank)) {
|
|
2533
|
+
leaf_index_ = DenseLeafIndexFromRank(next, label_rank);
|
|
2534
|
+
valid_ = true;
|
|
2535
|
+
return true;
|
|
2536
|
+
}
|
|
2537
|
+
|
|
2538
|
+
uint64_t child = DenseChildNodeNumFromRank(label_rank);
|
|
2539
|
+
bool cd = (child < trie_->dense_node_count_);
|
|
2540
|
+
return DescendToLeftmostLeaf(
|
|
2541
|
+
cd, cd ? child : child - trie_->dense_node_count_);
|
|
2542
|
+
}
|
|
2543
|
+
} else {
|
|
2544
|
+
uint64_t next_pos = cur.pos() + 1;
|
|
2545
|
+
if (next_pos < trie_->s_labels_size_ &&
|
|
2546
|
+
!trie_->s_louds_.GetBit(next_pos)) {
|
|
2547
|
+
path_.push_back(LevelPos::MakeSparse(next_pos));
|
|
2548
|
+
AppendKeySlot() = static_cast<char>(trie_->s_labels_data_[next_pos]);
|
|
2549
|
+
|
|
2550
|
+
if (!trie_->s_has_child_.GetBit(next_pos)) {
|
|
2551
|
+
leaf_index_ = SparseLeafIndex(next_pos);
|
|
2552
|
+
valid_ = true;
|
|
2553
|
+
return true;
|
|
2554
|
+
}
|
|
2555
|
+
|
|
2556
|
+
return DescendToLeftmostLeaf(false, SparseChildNodeNum(next_pos));
|
|
2557
|
+
}
|
|
2558
|
+
}
|
|
2559
|
+
}
|
|
2560
|
+
|
|
2561
|
+
valid_ = false;
|
|
2562
|
+
return false;
|
|
2563
|
+
}
|
|
2564
|
+
|
|
2565
|
+
// Returns the block handle the trie stores for the leaf the iterator is
// currently on (leaf_index_). The iterator must be valid; this is only
// enforced by an assert.
TrieBlockHandle LoudsTrieIterator::Value() const {
  assert(valid_);
  TrieBlockHandle handle = trie_->GetHandle(leaf_index_);
  return handle;
}
|
|
2569
|
+
|
|
2570
|
+
// Explicit template instantiations for SeekImpl, one for each value of its
// bool template argument.
// NOTE(review): the argument is presumably the kHasChains flag that SeekImpl
// tests with `if constexpr`; confirm against the declaration in the header.
|
|
2571
|
+
template bool LoudsTrieIterator::SeekImpl<true>(const Slice&);
|
|
2572
|
+
template bool LoudsTrieIterator::SeekImpl<false>(const Slice&);
|
|
2573
|
+
|
|
2574
|
+
} // namespace trie_index
|
|
2575
|
+
} // namespace ROCKSDB_NAMESPACE
|