@nxtedition/rocksdb 5.2.21 → 5.2.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +216 -252
- package/binding.gyp +78 -72
- package/deps/rocksdb/build_version.cc +70 -4
- package/deps/rocksdb/rocksdb/CMakeLists.txt +281 -149
- package/deps/rocksdb/rocksdb/Makefile +459 -469
- package/deps/rocksdb/rocksdb/README.md +4 -4
- package/deps/rocksdb/rocksdb/TARGETS +5244 -1500
- package/deps/rocksdb/rocksdb/cache/cache.cc +12 -3
- package/deps/rocksdb/rocksdb/cache/cache_bench.cc +7 -368
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +924 -0
- package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +128 -0
- package/deps/rocksdb/rocksdb/cache/cache_entry_roles.h +103 -0
- package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +183 -0
- package/deps/rocksdb/rocksdb/cache/cache_helpers.h +11 -0
- package/deps/rocksdb/rocksdb/cache/cache_key.cc +344 -0
- package/deps/rocksdb/rocksdb/cache/cache_key.h +132 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +183 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +288 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +468 -0
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +85 -8
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +121 -51
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +171 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +86 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +607 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +381 -154
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +176 -33
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1659 -3
- package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +94 -23
- package/deps/rocksdb/rocksdb/cache/sharded_cache.h +49 -28
- package/deps/rocksdb/rocksdb/crash_test.mk +93 -0
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +54 -31
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +10 -6
- package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +146 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc +326 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_fetcher.cc +34 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_fetcher.h +37 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_addition.cc +4 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc +8 -4
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +99 -40
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +20 -8
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +95 -83
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +13 -10
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +7 -4
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +37 -37
- package/deps/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h +101 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.cc +8 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +6 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +209 -44
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +37 -11
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +382 -179
- package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc +100 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter.h +102 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc +196 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_index.h +3 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_log_format.h +2 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +7 -5
- package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h +10 -3
- package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.cc +12 -8
- package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.h +5 -5
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +772 -9
- package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +730 -0
- package/deps/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc +82 -0
- package/deps/rocksdb/rocksdb/db/blob/db_blob_index_test.cc +155 -17
- package/deps/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc +21 -0
- package/deps/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h +38 -0
- package/deps/rocksdb/rocksdb/db/builder.cc +137 -89
- package/deps/rocksdb/rocksdb/db/builder.h +16 -37
- package/deps/rocksdb/rocksdb/db/c.cc +413 -208
- package/deps/rocksdb/rocksdb/db/c_test.c +227 -138
- package/deps/rocksdb/rocksdb/db/column_family.cc +118 -103
- package/deps/rocksdb/rocksdb/db/column_family.h +86 -44
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +38 -24
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +81 -0
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +275 -0
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc +258 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +81 -28
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +43 -12
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h +12 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +406 -215
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +147 -50
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +167 -61
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +1321 -156
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +197 -28
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +246 -43
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +65 -26
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +7 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +122 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +18 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +536 -44
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +311 -30
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +849 -0
- package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +92 -0
- package/deps/rocksdb/rocksdb/db/compaction/sst_partitioner.cc +46 -0
- package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/convenience.cc +6 -3
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +383 -28
- package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +7 -2
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +154 -45
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +1095 -33
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +1249 -203
- package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +135 -9
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +1348 -166
- package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +3 -5
- package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +312 -45
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +1734 -48
- package/deps/rocksdb/rocksdb/db/{compacted_db_impl.cc → db_impl/compacted_db_impl.cc} +24 -7
- package/deps/rocksdb/rocksdb/db/{compacted_db_impl.h → db_impl/compacted_db_impl.h} +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +644 -333
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +365 -92
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +578 -210
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +38 -16
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +17 -10
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +75 -74
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +450 -183
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +42 -9
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +232 -15
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +42 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +297 -100
- package/deps/rocksdb/rocksdb/db/db_info_dumper.cc +16 -15
- package/deps/rocksdb/rocksdb/db/db_inplace_update_test.cc +31 -1
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +6 -5
- package/deps/rocksdb/rocksdb/db/db_iter.cc +218 -153
- package/deps/rocksdb/rocksdb/db/db_iter.h +14 -12
- package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_iter_test.cc +84 -160
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +47 -6
- package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +204 -0
- package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +21 -13
- package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +17 -10
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +38 -24
- package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +184 -19
- package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +183 -3
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +409 -9
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +92 -23
- package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +446 -0
- package/deps/rocksdb/rocksdb/db/{db_impl/db_secondary_test.cc → db_secondary_test.cc} +363 -35
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +520 -15
- package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +50 -1
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +139 -4
- package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_test.cc +669 -359
- package/deps/rocksdb/rocksdb/db/db_test2.cc +2110 -304
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +76 -43
- package/deps/rocksdb/rocksdb/db/db_test_util.h +231 -103
- package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +19 -11
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +490 -71
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +980 -349
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +11 -12
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +793 -0
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/dbformat.cc +4 -12
- package/deps/rocksdb/rocksdb/db/dbformat.h +28 -18
- package/deps/rocksdb/rocksdb/db/dbformat_test.cc +3 -0
- package/deps/rocksdb/rocksdb/db/deletefile_test.cc +50 -15
- package/deps/rocksdb/rocksdb/db/error_handler.cc +127 -41
- package/deps/rocksdb/rocksdb/db/error_handler.h +12 -5
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +524 -255
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +136 -11
- package/deps/rocksdb/rocksdb/db/event_helpers.h +27 -2
- package/deps/rocksdb/rocksdb/db/experimental.cc +100 -0
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +307 -4
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +137 -60
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +12 -8
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +86 -55
- package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +86 -5
- package/deps/rocksdb/rocksdb/db/filename_test.cc +63 -0
- package/deps/rocksdb/rocksdb/db/flush_job.cc +619 -64
- package/deps/rocksdb/rocksdb/db/flush_job.h +30 -7
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +33 -16
- package/deps/rocksdb/rocksdb/db/flush_scheduler.h +2 -1
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +18 -17
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +5 -4
- package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +0 -1
- package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +91 -0
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +25 -14
- package/deps/rocksdb/rocksdb/db/import_column_family_job.h +6 -5
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +471 -50
- package/deps/rocksdb/rocksdb/db/internal_stats.h +129 -25
- package/deps/rocksdb/rocksdb/db/job_context.h +22 -9
- package/deps/rocksdb/rocksdb/db/kv_checksum.h +394 -0
- package/deps/rocksdb/rocksdb/db/listener_test.cc +518 -41
- package/deps/rocksdb/rocksdb/db/log_format.h +4 -1
- package/deps/rocksdb/rocksdb/db/log_reader.cc +129 -6
- package/deps/rocksdb/rocksdb/db/log_reader.h +17 -1
- package/deps/rocksdb/rocksdb/db/log_test.cc +161 -11
- package/deps/rocksdb/rocksdb/db/log_writer.cc +92 -13
- package/deps/rocksdb/rocksdb/db/log_writer.h +18 -5
- package/deps/rocksdb/rocksdb/db/logs_with_prep_tracker.h +1 -1
- package/deps/rocksdb/rocksdb/db/lookup_key.h +0 -1
- package/deps/rocksdb/rocksdb/db/malloc_stats.cc +2 -2
- package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +21 -8
- package/deps/rocksdb/rocksdb/db/memtable.cc +144 -54
- package/deps/rocksdb/rocksdb/db/memtable.h +72 -15
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +95 -47
- package/deps/rocksdb/rocksdb/db/memtable_list.h +33 -13
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +61 -31
- package/deps/rocksdb/rocksdb/db/merge_context.h +20 -8
- package/deps/rocksdb/rocksdb/db/merge_helper.cc +54 -11
- package/deps/rocksdb/rocksdb/db/merge_helper.h +17 -6
- package/deps/rocksdb/rocksdb/db/merge_helper_test.cc +13 -7
- package/deps/rocksdb/rocksdb/db/merge_test.cc +40 -19
- package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +14 -25
- package/deps/rocksdb/rocksdb/db/output_validator.cc +3 -0
- package/deps/rocksdb/rocksdb/db/output_validator.h +5 -4
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +32 -28
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +43 -29
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +9 -7
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc +21 -16
- package/deps/rocksdb/rocksdb/db/pinned_iterators_manager.h +1 -1
- package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +29 -36
- package/deps/rocksdb/rocksdb/db/pre_release_callback.h +1 -2
- package/deps/rocksdb/rocksdb/db/prefix_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +2 -2
- package/deps/rocksdb/rocksdb/db/range_del_aggregator_bench.cc +11 -11
- package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +3 -2
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc +14 -8
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +17 -0
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc +4 -2
- package/deps/rocksdb/rocksdb/db/read_callback.h +1 -0
- package/deps/rocksdb/rocksdb/db/repair.cc +87 -58
- package/deps/rocksdb/rocksdb/db/repair_test.cc +35 -5
- package/deps/rocksdb/rocksdb/db/snapshot_impl.h +2 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +95 -69
- package/deps/rocksdb/rocksdb/db/table_cache.h +63 -53
- package/deps/rocksdb/rocksdb/db/table_properties_collector.cc +4 -4
- package/deps/rocksdb/rocksdb/db/table_properties_collector.h +78 -10
- package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +28 -33
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +30 -51
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +12 -8
- package/deps/rocksdb/rocksdb/db/version_builder.cc +564 -341
- package/deps/rocksdb/rocksdb/db/version_builder.h +8 -8
- package/deps/rocksdb/rocksdb/db/version_builder_test.cc +327 -155
- package/deps/rocksdb/rocksdb/db/version_edit.cc +89 -27
- package/deps/rocksdb/rocksdb/db/version_edit.h +42 -17
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +324 -43
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +79 -22
- package/deps/rocksdb/rocksdb/db/version_edit_test.cc +165 -20
- package/deps/rocksdb/rocksdb/db/version_set.cc +935 -1034
- package/deps/rocksdb/rocksdb/db/version_set.h +183 -122
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +556 -138
- package/deps/rocksdb/rocksdb/db/version_util.h +68 -0
- package/deps/rocksdb/rocksdb/db/wal_manager.cc +23 -21
- package/deps/rocksdb/rocksdb/db/wal_manager.h +5 -2
- package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +30 -27
- package/deps/rocksdb/rocksdb/db/write_batch.cc +704 -209
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +135 -2
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +209 -5
- package/deps/rocksdb/rocksdb/db/write_callback_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/write_controller.cc +47 -54
- package/deps/rocksdb/rocksdb/db/write_controller.h +12 -9
- package/deps/rocksdb/rocksdb/db/write_controller_test.cc +215 -103
- package/deps/rocksdb/rocksdb/db/write_thread.cc +11 -0
- package/deps/rocksdb/rocksdb/db/write_thread.h +14 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +7 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +10 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +6 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress.cc +1 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +78 -25
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h +13 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +29 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +5 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +199 -32
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc +188 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +59 -10
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +77 -109
- package/deps/rocksdb/rocksdb/{third-party/folly/folly/synchronization/WaitOptions.cpp → db_stress_tool/db_stress_stat.cc} +9 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +7 -6
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +1 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +699 -143
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +20 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +49 -39
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +631 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +287 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +1565 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +374 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +149 -18
- package/deps/rocksdb/rocksdb/env/composite_env.cc +464 -0
- package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +98 -646
- package/deps/rocksdb/rocksdb/env/emulated_clock.h +114 -0
- package/deps/rocksdb/rocksdb/env/env.cc +632 -42
- package/deps/rocksdb/rocksdb/env/env_basic_test.cc +84 -36
- package/deps/rocksdb/rocksdb/env/env_chroot.cc +88 -286
- package/deps/rocksdb/rocksdb/env/env_chroot.h +34 -1
- package/deps/rocksdb/rocksdb/env/env_encryption.cc +469 -277
- package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +9 -30
- package/deps/rocksdb/rocksdb/env/env_posix.cc +110 -119
- package/deps/rocksdb/rocksdb/env/env_test.cc +1128 -39
- package/deps/rocksdb/rocksdb/env/file_system.cc +147 -8
- package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +207 -136
- package/deps/rocksdb/rocksdb/env/file_system_tracer.h +86 -54
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +192 -64
- package/deps/rocksdb/rocksdb/env/fs_readonly.h +107 -0
- package/deps/rocksdb/rocksdb/env/fs_remap.cc +339 -0
- package/deps/rocksdb/rocksdb/env/fs_remap.h +139 -0
- package/deps/rocksdb/rocksdb/env/io_posix.cc +245 -41
- package/deps/rocksdb/rocksdb/env/io_posix.h +66 -1
- package/deps/rocksdb/rocksdb/env/mock_env.cc +147 -149
- package/deps/rocksdb/rocksdb/env/mock_env.h +113 -11
- package/deps/rocksdb/rocksdb/env/mock_env_test.cc +2 -4
- package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +164 -0
- package/deps/rocksdb/rocksdb/env/unique_id_gen.h +71 -0
- package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +9 -5
- package/deps/rocksdb/rocksdb/file/delete_scheduler.h +6 -4
- package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +19 -12
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +459 -70
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +205 -28
- package/deps/rocksdb/rocksdb/file/file_util.cc +39 -28
- package/deps/rocksdb/rocksdb/file/file_util.h +18 -27
- package/deps/rocksdb/rocksdb/file/filename.cc +59 -22
- package/deps/rocksdb/rocksdb/file/filename.h +13 -8
- package/deps/rocksdb/rocksdb/file/line_file_reader.cc +68 -0
- package/deps/rocksdb/rocksdb/file/line_file_reader.h +59 -0
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +1130 -6
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +220 -36
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +69 -17
- package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +13 -12
- package/deps/rocksdb/rocksdb/file/read_write_util.cc +3 -38
- package/deps/rocksdb/rocksdb/file/read_write_util.h +0 -4
- package/deps/rocksdb/rocksdb/file/readahead_file_info.h +33 -0
- package/deps/rocksdb/rocksdb/file/sequence_file_reader.cc +57 -9
- package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +58 -6
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +29 -54
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +22 -29
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +424 -50
- package/deps/rocksdb/rocksdb/file/writable_file_writer.h +66 -19
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +157 -66
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +224 -121
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +333 -30
- package/deps/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h +14 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/cleanable.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +90 -50
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +13 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +20 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h +8 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +53 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +31 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/customizable.h +102 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +51 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +370 -262
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +286 -87
- package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +124 -64
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +27 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +21 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +384 -41
- package/deps/rocksdb/rocksdb/include/rocksdb/filter_policy.h +111 -143
- package/deps/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h +20 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +56 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/io_status.h +15 -33
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +37 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +314 -26
- package/deps/rocksdb/rocksdb/include/rocksdb/memory_allocator.h +11 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +50 -15
- package/deps/rocksdb/rocksdb/include/rocksdb/merge_operator.h +10 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +186 -96
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +373 -103
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +13 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/persistent_cache.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +37 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h +6 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +87 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +5 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +59 -30
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +11 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +22 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h +17 -10
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +121 -41
- package/deps/rocksdb/rocksdb/include/rocksdb/stats_history.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +114 -136
- package/deps/rocksdb/rocksdb/include/rocksdb/system_clock.h +116 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +160 -18
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +57 -15
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h +10 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +247 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/trace_record_result.h +187 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/transaction_log.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/types.h +14 -24
- package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +46 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +14 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/agg_merge.h +138 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +631 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +142 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h +12 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h +368 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +24 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h +4 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +418 -63
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +143 -73
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h +87 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +43 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +18 -23
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +26 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +32 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +1 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +20 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +30 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/wal_filter.h +11 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +89 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch_base.h +11 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h +108 -38
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +40 -23
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger.h +12 -5
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +100 -49
- package/deps/rocksdb/rocksdb/logging/env_logger.h +7 -5
- package/deps/rocksdb/rocksdb/logging/env_logger_test.cc +0 -1
- package/deps/rocksdb/rocksdb/logging/posix_logger.h +3 -9
- package/deps/rocksdb/rocksdb/memory/arena.cc +3 -1
- package/deps/rocksdb/rocksdb/memory/arena.h +1 -1
- package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +171 -106
- package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h +31 -15
- package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc +15 -4
- package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator.h +24 -8
- package/deps/rocksdb/rocksdb/memory/memory_allocator.cc +91 -0
- package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +239 -0
- package/deps/rocksdb/rocksdb/memory/memory_usage.h +14 -1
- package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +72 -9
- package/deps/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc +52 -6
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +53 -0
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +5 -5
- package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +17 -5
- package/deps/rocksdb/rocksdb/memtable/skiplist_test.cc +1 -1
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +87 -0
- package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +20 -10
- package/deps/rocksdb/rocksdb/memtable/write_buffer_manager.cc +148 -94
- package/deps/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc +160 -62
- package/deps/rocksdb/rocksdb/microbench/CMakeLists.txt +17 -0
- package/deps/rocksdb/rocksdb/microbench/README.md +60 -0
- package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +1360 -0
- package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +153 -0
- package/deps/rocksdb/rocksdb/monitoring/histogram.cc +8 -15
- package/deps/rocksdb/rocksdb/monitoring/histogram.h +0 -1
- package/deps/rocksdb/rocksdb/monitoring/histogram_test.cc +18 -16
- package/deps/rocksdb/rocksdb/monitoring/histogram_windowing.cc +9 -7
- package/deps/rocksdb/rocksdb/monitoring/histogram_windowing.h +5 -3
- package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.cc +7 -5
- package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +37 -12
- package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +26 -6
- package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +6 -10
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +14 -13
- package/deps/rocksdb/rocksdb/monitoring/perf_context_imp.h +19 -20
- package/deps/rocksdb/rocksdb/monitoring/perf_step_timer.h +18 -18
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +84 -2
- package/deps/rocksdb/rocksdb/monitoring/statistics.h +6 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics_test.cc +47 -2
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +67 -54
- package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +4 -1
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +2 -1
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -2
- package/deps/rocksdb/rocksdb/options/cf_options.cc +280 -212
- package/deps/rocksdb/rocksdb/options/cf_options.h +51 -57
- package/deps/rocksdb/rocksdb/options/configurable.cc +242 -138
- package/deps/rocksdb/rocksdb/options/configurable_helper.h +4 -68
- package/deps/rocksdb/rocksdb/options/configurable_test.cc +144 -21
- package/deps/rocksdb/rocksdb/options/configurable_test.h +2 -3
- package/deps/rocksdb/rocksdb/options/customizable.cc +67 -7
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +1773 -151
- package/deps/rocksdb/rocksdb/options/db_options.cc +275 -47
- package/deps/rocksdb/rocksdb/options/db_options.h +36 -7
- package/deps/rocksdb/rocksdb/options/options.cc +49 -17
- package/deps/rocksdb/rocksdb/options/options_helper.cc +369 -352
- package/deps/rocksdb/rocksdb/options/options_helper.h +23 -23
- package/deps/rocksdb/rocksdb/options/options_parser.cc +18 -13
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +67 -54
- package/deps/rocksdb/rocksdb/options/options_test.cc +1162 -187
- package/deps/rocksdb/rocksdb/plugin/README.md +43 -0
- package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +1 -1
- package/deps/rocksdb/rocksdb/port/lang.h +52 -0
- package/deps/rocksdb/rocksdb/port/port_example.h +1 -1
- package/deps/rocksdb/rocksdb/port/port_posix.cc +31 -2
- package/deps/rocksdb/rocksdb/port/port_posix.h +20 -2
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +20 -4
- package/deps/rocksdb/rocksdb/port/sys_time.h +2 -2
- package/deps/rocksdb/rocksdb/port/win/env_default.cc +7 -7
- package/deps/rocksdb/rocksdb/port/win/env_win.cc +44 -74
- package/deps/rocksdb/rocksdb/port/win/env_win.h +25 -23
- package/deps/rocksdb/rocksdb/port/win/io_win.cc +32 -34
- package/deps/rocksdb/rocksdb/port/win/io_win.h +12 -6
- package/deps/rocksdb/rocksdb/port/win/port_win.cc +55 -35
- package/deps/rocksdb/rocksdb/port/win/port_win.h +22 -5
- package/deps/rocksdb/rocksdb/port/win/win_logger.cc +3 -3
- package/deps/rocksdb/rocksdb/port/win/win_logger.h +3 -5
- package/deps/rocksdb/rocksdb/port/win/win_thread.cc +7 -1
- package/deps/rocksdb/rocksdb/port/win/win_thread.h +12 -17
- package/deps/rocksdb/rocksdb/python.mk +9 -0
- package/deps/rocksdb/rocksdb/src.mk +82 -34
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -4
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block.cc +158 -80
- package/deps/rocksdb/rocksdb/table/block_based/block.h +64 -36
- package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc +23 -14
- package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.h +13 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc +3 -218
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +603 -328
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +28 -22
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +220 -82
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +8 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +3 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +28 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +598 -492
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +151 -96
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +31 -58
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +330 -92
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +50 -19
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +23 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +226 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +56 -22
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +42 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +5 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_type.h +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +34 -20
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +9 -10
- package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +26 -3
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +844 -202
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +281 -81
- package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +62 -2
- package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.h +2 -3
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +28 -7
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +22 -6
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +28 -26
- package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +11 -4
- package/deps/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +68 -26
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +44 -9
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +12 -10
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +3 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h +23 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +44 -19
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h +5 -1
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +16 -28
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +7 -4
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +2 -2
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +77 -57
- package/deps/rocksdb/rocksdb/table/block_fetcher.h +23 -12
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +43 -56
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +8 -8
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +2 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +52 -70
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc +5 -8
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +1 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +17 -11
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h +2 -3
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +42 -51
- package/deps/rocksdb/rocksdb/table/format.cc +258 -104
- package/deps/rocksdb/rocksdb/table/format.h +120 -109
- package/deps/rocksdb/rocksdb/table/get_context.cc +97 -65
- package/deps/rocksdb/rocksdb/table/get_context.h +19 -12
- package/deps/rocksdb/rocksdb/table/internal_iterator.h +14 -0
- package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
- package/deps/rocksdb/rocksdb/table/merger_test.cc +3 -2
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +11 -21
- package/deps/rocksdb/rocksdb/table/merging_iterator.h +3 -3
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +176 -171
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +47 -33
- package/deps/rocksdb/rocksdb/table/mock_table.cc +7 -9
- package/deps/rocksdb/rocksdb/table/mock_table.h +3 -2
- package/deps/rocksdb/rocksdb/table/multiget_context.h +15 -8
- package/deps/rocksdb/rocksdb/table/persistent_cache_helper.cc +22 -29
- package/deps/rocksdb/rocksdb/table/persistent_cache_options.h +6 -3
- package/deps/rocksdb/rocksdb/table/plain/plain_table_bloom.h +5 -8
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +29 -26
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +12 -16
- package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.cc +145 -69
- package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +1 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +7 -6
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +3 -4
- package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +3 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +1 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +13 -18
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.h +4 -9
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +55 -37
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +10 -5
- package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +11 -8
- package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +222 -16
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +106 -58
- package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +6 -5
- package/deps/rocksdb/rocksdb/table/table_builder.h +68 -44
- package/deps/rocksdb/rocksdb/table/table_factory.cc +37 -10
- package/deps/rocksdb/rocksdb/table/table_properties.cc +109 -54
- package/deps/rocksdb/rocksdb/table/table_properties_internal.h +4 -20
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +33 -32
- package/deps/rocksdb/rocksdb/table/table_reader_caller.h +2 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +989 -326
- package/deps/rocksdb/rocksdb/table/two_level_iterator.cc +4 -0
- package/deps/rocksdb/rocksdb/table/unique_id.cc +166 -0
- package/deps/rocksdb/rocksdb/table/unique_id_impl.h +59 -0
- package/deps/rocksdb/rocksdb/test_util/mock_time_env.cc +1 -1
- package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +13 -10
- package/deps/rocksdb/rocksdb/test_util/sync_point.cc +1 -2
- package/deps/rocksdb/rocksdb/test_util/sync_point.h +35 -16
- package/deps/rocksdb/rocksdb/test_util/sync_point_impl.cc +32 -10
- package/deps/rocksdb/rocksdb/test_util/sync_point_impl.h +31 -4
- package/deps/rocksdb/rocksdb/test_util/testharness.cc +53 -1
- package/deps/rocksdb/rocksdb/test_util/testharness.h +67 -3
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +236 -66
- package/deps/rocksdb/rocksdb/test_util/testutil.h +63 -100
- package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +12 -1
- package/deps/rocksdb/rocksdb/tools/blob_dump.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +6 -3
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h +1 -0
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +9 -3
- package/deps/rocksdb/rocksdb/tools/db_bench.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +1420 -611
- package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +11 -8
- package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +11 -1
- package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +4 -2
- package/deps/rocksdb/rocksdb/tools/io_tracer_parser_tool.cc +46 -22
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +655 -179
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +58 -6
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +472 -29
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +23 -2
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +246 -0
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +126 -0
- package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +83 -29
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +38 -17
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +191 -55
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +219 -296
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +87 -53
- package/deps/rocksdb/rocksdb/tools/write_stress.cc +8 -7
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.cc +6 -5
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +5 -4
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer_test.cc +14 -9
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +134 -60
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer.h +49 -38
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +152 -15
- package/deps/rocksdb/rocksdb/trace_replay/trace_record.cc +206 -0
- package/deps/rocksdb/rocksdb/trace_replay/trace_record_handler.cc +190 -0
- package/deps/rocksdb/rocksdb/trace_replay/trace_record_handler.h +46 -0
- package/deps/rocksdb/rocksdb/trace_replay/trace_record_result.cc +146 -0
- package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +475 -344
- package/deps/rocksdb/rocksdb/trace_replay/trace_replay.h +83 -95
- package/deps/rocksdb/rocksdb/util/autovector.h +38 -18
- package/deps/rocksdb/rocksdb/util/autovector_test.cc +1 -1
- package/deps/rocksdb/rocksdb/util/bloom_impl.h +4 -0
- package/deps/rocksdb/rocksdb/util/bloom_test.cc +276 -94
- package/deps/rocksdb/rocksdb/util/build_version.cc.in +81 -4
- package/deps/rocksdb/rocksdb/util/cast_util.h +22 -0
- package/deps/rocksdb/rocksdb/util/channel.h +2 -0
- package/deps/rocksdb/rocksdb/util/coding.h +1 -33
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +8 -0
- package/deps/rocksdb/rocksdb/util/comparator.cc +163 -3
- package/deps/rocksdb/rocksdb/util/compression.cc +122 -0
- package/deps/rocksdb/rocksdb/util/compression.h +212 -7
- package/deps/rocksdb/rocksdb/util/compression_context_cache.cc +1 -3
- package/deps/rocksdb/rocksdb/util/crc32c.cc +165 -2
- package/deps/rocksdb/rocksdb/util/crc32c.h +6 -0
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +14 -0
- package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +3 -0
- package/deps/rocksdb/rocksdb/util/crc32c_test.cc +47 -0
- package/deps/rocksdb/rocksdb/util/defer.h +30 -1
- package/deps/rocksdb/rocksdb/util/defer_test.cc +11 -0
- package/deps/rocksdb/rocksdb/util/duplicate_detector.h +3 -1
- package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +3 -3
- package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +5 -4
- package/deps/rocksdb/rocksdb/util/fastrange.h +2 -0
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +36 -0
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +3 -1
- package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +512 -52
- package/deps/rocksdb/rocksdb/util/filter_bench.cc +65 -10
- package/deps/rocksdb/rocksdb/util/gflags_compat.h +6 -1
- package/deps/rocksdb/rocksdb/util/hash.cc +121 -3
- package/deps/rocksdb/rocksdb/util/hash.h +31 -1
- package/deps/rocksdb/rocksdb/util/hash128.h +26 -0
- package/deps/rocksdb/rocksdb/util/hash_containers.h +51 -0
- package/deps/rocksdb/rocksdb/util/hash_test.cc +194 -2
- package/deps/rocksdb/rocksdb/util/heap.h +6 -1
- package/deps/rocksdb/rocksdb/util/kv_map.h +1 -1
- package/deps/rocksdb/rocksdb/util/log_write_bench.cc +8 -6
- package/deps/rocksdb/rocksdb/util/math.h +74 -7
- package/deps/rocksdb/rocksdb/util/math128.h +13 -1
- package/deps/rocksdb/rocksdb/util/murmurhash.h +3 -3
- package/deps/rocksdb/rocksdb/util/random.cc +9 -0
- package/deps/rocksdb/rocksdb/util/random.h +6 -0
- package/deps/rocksdb/rocksdb/util/rate_limiter.cc +298 -144
- package/deps/rocksdb/rocksdb/util/rate_limiter.h +68 -19
- package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +335 -23
- package/deps/rocksdb/rocksdb/util/repeatable_thread.h +10 -12
- package/deps/rocksdb/rocksdb/util/repeatable_thread_test.cc +18 -15
- package/deps/rocksdb/rocksdb/util/ribbon_alg.h +98 -74
- package/deps/rocksdb/rocksdb/util/ribbon_config.cc +506 -0
- package/deps/rocksdb/rocksdb/util/ribbon_config.h +182 -0
- package/deps/rocksdb/rocksdb/util/ribbon_impl.h +154 -79
- package/deps/rocksdb/rocksdb/util/ribbon_test.cc +742 -365
- package/deps/rocksdb/rocksdb/util/set_comparator.h +2 -0
- package/deps/rocksdb/rocksdb/util/slice.cc +198 -35
- package/deps/rocksdb/rocksdb/util/slice_test.cc +30 -1
- package/deps/rocksdb/rocksdb/util/status.cc +32 -29
- package/deps/rocksdb/rocksdb/util/stop_watch.h +18 -18
- package/deps/rocksdb/rocksdb/util/string_util.cc +85 -6
- package/deps/rocksdb/rocksdb/util/string_util.h +47 -2
- package/deps/rocksdb/rocksdb/util/thread_guard.h +41 -0
- package/deps/rocksdb/rocksdb/util/thread_local.h +2 -2
- package/deps/rocksdb/rocksdb/util/thread_local_test.cc +22 -24
- package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +7 -6
- package/deps/rocksdb/rocksdb/util/timer.h +55 -46
- package/deps/rocksdb/rocksdb/util/timer_test.cc +50 -48
- package/deps/rocksdb/rocksdb/util/user_comparator_wrapper.h +4 -0
- package/deps/rocksdb/rocksdb/util/vector_iterator.h +31 -15
- package/deps/rocksdb/rocksdb/util/work_queue.h +2 -0
- package/deps/rocksdb/rocksdb/util/xxhash.cc +35 -1144
- package/deps/rocksdb/rocksdb/util/xxhash.h +5117 -373
- package/deps/rocksdb/rocksdb/util/xxph3.h +1762 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.cc +238 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.h +49 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge_test.cc +134 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.cc +104 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.h +47 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +3164 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_impl.h +29 -0
- package/deps/rocksdb/rocksdb/utilities/{backupable/backupable_db_test.cc → backup/backup_engine_test.cc} +1679 -485
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +6 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +14 -9
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +2 -0
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +1 -0
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +4 -0
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +37 -27
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +8 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_iterator.h +13 -10
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +5 -0
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +44 -25
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +3 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +27 -19
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +4 -2
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +69 -0
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +489 -0
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +366 -0
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc +67 -4
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.h +21 -6
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +107 -7
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_options.h +43 -0
- package/deps/rocksdb/rocksdb/utilities/cassandra/format.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/cassandra/merge_operator.cc +24 -8
- package/deps/rocksdb/rocksdb/utilities/cassandra/merge_operator.h +7 -7
- package/deps/rocksdb/rocksdb/utilities/cassandra/serialize.h +5 -0
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +99 -218
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.h +8 -24
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +114 -1
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h +6 -2
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -4
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +7 -6
- package/deps/rocksdb/rocksdb/utilities/compaction_filters.cc +56 -0
- package/deps/rocksdb/rocksdb/utilities/convenience/info_log_finder.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/counted_fs.cc +355 -0
- package/deps/rocksdb/rocksdb/utilities/counted_fs.h +152 -0
- package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +13 -0
- package/deps/rocksdb/rocksdb/utilities/env_timed.cc +164 -122
- package/deps/rocksdb/rocksdb/utilities/env_timed.h +97 -0
- package/deps/rocksdb/rocksdb/utilities/fault_injection_env.cc +75 -17
- package/deps/rocksdb/rocksdb/utilities/fault_injection_env.h +19 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +539 -126
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +162 -17
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +110 -0
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +94 -0
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +5 -2
- package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +104 -0
- package/deps/rocksdb/rocksdb/utilities/merge_operators/bytesxor.h +5 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/max.cc +4 -1
- package/deps/rocksdb/rocksdb/utilities/merge_operators/put.cc +11 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/sortlist.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/merge_operators/sortlist.h +5 -1
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.cc +29 -10
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.h +6 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.cc +29 -14
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.h +6 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +71 -18
- package/deps/rocksdb/rocksdb/utilities/merge_operators/uint64add.cc +15 -9
- package/deps/rocksdb/rocksdb/utilities/merge_operators.cc +120 -0
- package/deps/rocksdb/rocksdb/utilities/merge_operators.h +3 -23
- package/deps/rocksdb/rocksdb/utilities/object_registry.cc +267 -42
- package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +702 -76
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +26 -5
- package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +124 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +2 -3
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +8 -9
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +15 -13
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +4 -4
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc +8 -9
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +6 -3
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator.cc +3 -0
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator_test.cc +2 -0
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +43 -35
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +20 -18
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +107 -2
- package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +23 -15
- package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +316 -0
- package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.h +86 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +4 -5
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +4 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +119 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc +20 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h +20 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h +3 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +4 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +38 -14
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h +17 -10
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +1 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +423 -34
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +82 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +72 -40
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +32 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +13 -5
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +7 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +207 -43
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +50 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +28 -10
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +11 -6
- package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +516 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +506 -15
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +27 -13
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +14 -14
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +3 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +14 -5
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +305 -27
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +55 -159
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +209 -2
- package/deps/rocksdb/rocksdb/utilities/wal_filter.cc +23 -0
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +157 -88
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +501 -114
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +91 -316
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +1212 -672
- package/deps/rocksdb/rocksdb.gyp +425 -446
- package/package.json +8 -8
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/darwin-x86/node.napi.node +0 -0
- package/prebuilds/{darwin-x64+arm64 → linux-x64}/node.napi.node +0 -0
- package/deps/rocksdb/rocksdb/env/env_hdfs.cc +0 -648
- package/deps/rocksdb/rocksdb/hdfs/README +0 -23
- package/deps/rocksdb/rocksdb/hdfs/env_hdfs.h +0 -386
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h +0 -535
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h +0 -175
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/utility_db.h +0 -34
- package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator_test.cc +0 -102
- package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.h +0 -49
- package/deps/rocksdb/rocksdb/memtable/hash_skiplist_rep.h +0 -44
- package/deps/rocksdb/rocksdb/options/customizable_helper.h +0 -216
- package/deps/rocksdb/rocksdb/third-party/folly/folly/CPortability.h +0 -27
- package/deps/rocksdb/rocksdb/third-party/folly/folly/ConstexprMath.h +0 -45
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Indestructible.h +0 -166
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Optional.h +0 -570
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Portability.h +0 -92
- package/deps/rocksdb/rocksdb/third-party/folly/folly/ScopeGuard.h +0 -54
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Traits.h +0 -152
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Unit.h +0 -59
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Utility.h +0 -141
- package/deps/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h +0 -33
- package/deps/rocksdb/rocksdb/third-party/folly/folly/container/Array.h +0 -74
- package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex-inl.h +0 -117
- package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp +0 -263
- package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.h +0 -96
- package/deps/rocksdb/rocksdb/third-party/folly/folly/functional/Invoke.h +0 -40
- package/deps/rocksdb/rocksdb/third-party/folly/folly/hash/Hash.h +0 -29
- package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h +0 -144
- package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Bits.h +0 -30
- package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Launder.h +0 -51
- package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/Asm.h +0 -28
- package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/SysSyscall.h +0 -10
- package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/SysTypes.h +0 -26
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification-inl.h +0 -138
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification.cpp +0 -23
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification.h +0 -57
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicUtil-inl.h +0 -260
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicUtil.h +0 -52
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h +0 -328
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h +0 -1703
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex.cpp +0 -16
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex.h +0 -304
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutexSpecializations.h +0 -39
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/ParkingLot.cpp +0 -26
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/ParkingLot.h +0 -318
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/WaitOptions.h +0 -57
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/InlineFunctionRef.h +0 -219
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/ProxyLockable-inl.h +0 -207
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/ProxyLockable.h +0 -164
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/Sleeper.h +0 -57
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/Spin.h +0 -77
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp +0 -1145
- package/deps/rocksdb/rocksdb/util/build_version.h +0 -15
- package/deps/rocksdb/rocksdb/util/xxh3p.h +0 -1392
- package/deps/rocksdb/rocksdb/utilities/backupable/backupable_db.cc +0 -2354
- package/deps/rocksdb/rocksdb/utilities/env_librados.cc +0 -1497
- package/deps/rocksdb/rocksdb/utilities/env_librados_test.cc +0 -1146
- package/prebuilds/linux-x64/node.napi.glibc.node +0 -0
|
@@ -9,11 +9,10 @@
|
|
|
9
9
|
|
|
10
10
|
#include "db/version_set.h"
|
|
11
11
|
|
|
12
|
-
#include <stdio.h>
|
|
13
|
-
|
|
14
12
|
#include <algorithm>
|
|
15
13
|
#include <array>
|
|
16
14
|
#include <cinttypes>
|
|
15
|
+
#include <cstdio>
|
|
17
16
|
#include <list>
|
|
18
17
|
#include <map>
|
|
19
18
|
#include <set>
|
|
@@ -21,10 +20,14 @@
|
|
|
21
20
|
#include <unordered_map>
|
|
22
21
|
#include <vector>
|
|
23
22
|
|
|
24
|
-
#include "
|
|
23
|
+
#include "db/blob/blob_fetcher.h"
|
|
25
24
|
#include "db/blob/blob_file_cache.h"
|
|
26
25
|
#include "db/blob/blob_file_reader.h"
|
|
27
26
|
#include "db/blob/blob_index.h"
|
|
27
|
+
#include "db/blob/blob_log_format.h"
|
|
28
|
+
#include "db/compaction/compaction.h"
|
|
29
|
+
#include "db/compaction/file_pri.h"
|
|
30
|
+
#include "db/dbformat.h"
|
|
28
31
|
#include "db/internal_stats.h"
|
|
29
32
|
#include "db/log_reader.h"
|
|
30
33
|
#include "db/log_writer.h"
|
|
@@ -39,9 +42,11 @@
|
|
|
39
42
|
#include "file/random_access_file_reader.h"
|
|
40
43
|
#include "file/read_write_util.h"
|
|
41
44
|
#include "file/writable_file_writer.h"
|
|
45
|
+
#include "logging/logging.h"
|
|
42
46
|
#include "monitoring/file_read_sample.h"
|
|
43
47
|
#include "monitoring/perf_context_imp.h"
|
|
44
48
|
#include "monitoring/persistent_stats_history.h"
|
|
49
|
+
#include "options/options_helper.h"
|
|
45
50
|
#include "rocksdb/env.h"
|
|
46
51
|
#include "rocksdb/merge_operator.h"
|
|
47
52
|
#include "rocksdb/write_buffer_manager.h"
|
|
@@ -116,10 +121,9 @@ Status OverlapWithIterator(const Comparator* ucmp,
|
|
|
116
121
|
// are MergeInProgress).
|
|
117
122
|
class FilePicker {
|
|
118
123
|
public:
|
|
119
|
-
FilePicker(
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
const Comparator* user_comparator,
|
|
124
|
+
FilePicker(const Slice& user_key, const Slice& ikey,
|
|
125
|
+
autovector<LevelFilesBrief>* file_levels, unsigned int num_levels,
|
|
126
|
+
FileIndexer* file_indexer, const Comparator* user_comparator,
|
|
123
127
|
const InternalKeyComparator* internal_comparator)
|
|
124
128
|
: num_levels_(num_levels),
|
|
125
129
|
curr_level_(static_cast<unsigned int>(-1)),
|
|
@@ -127,9 +131,6 @@ class FilePicker {
|
|
|
127
131
|
hit_file_level_(static_cast<unsigned int>(-1)),
|
|
128
132
|
search_left_bound_(0),
|
|
129
133
|
search_right_bound_(FileIndexer::kLevelMaxIndex),
|
|
130
|
-
#ifndef NDEBUG
|
|
131
|
-
files_(files),
|
|
132
|
-
#endif
|
|
133
134
|
level_files_brief_(file_levels),
|
|
134
135
|
is_hit_file_last_in_level_(false),
|
|
135
136
|
curr_file_level_(nullptr),
|
|
@@ -138,9 +139,6 @@ class FilePicker {
|
|
|
138
139
|
file_indexer_(file_indexer),
|
|
139
140
|
user_comparator_(user_comparator),
|
|
140
141
|
internal_comparator_(internal_comparator) {
|
|
141
|
-
#ifdef NDEBUG
|
|
142
|
-
(void)files;
|
|
143
|
-
#endif
|
|
144
142
|
// Setup member variables to search first level.
|
|
145
143
|
search_ended_ = !PrepareNextLevel();
|
|
146
144
|
if (!search_ended_) {
|
|
@@ -210,23 +208,7 @@ class FilePicker {
|
|
|
210
208
|
}
|
|
211
209
|
}
|
|
212
210
|
}
|
|
213
|
-
|
|
214
|
-
// Sanity check to make sure that the files are correctly sorted
|
|
215
|
-
if (prev_file_) {
|
|
216
|
-
if (curr_level_ != 0) {
|
|
217
|
-
int comp_sign = internal_comparator_->Compare(
|
|
218
|
-
prev_file_->largest_key, f->smallest_key);
|
|
219
|
-
assert(comp_sign < 0);
|
|
220
|
-
} else {
|
|
221
|
-
// level == 0, the current file cannot be newer than the previous
|
|
222
|
-
// one. Use compressed data structure, has no attribute seqNo
|
|
223
|
-
assert(curr_index_in_curr_level_ > 0);
|
|
224
|
-
assert(!NewestFirstBySeqNo(files_[0][curr_index_in_curr_level_],
|
|
225
|
-
files_[0][curr_index_in_curr_level_-1]));
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
prev_file_ = f;
|
|
229
|
-
#endif
|
|
211
|
+
|
|
230
212
|
returned_file_level_ = curr_level_;
|
|
231
213
|
if (curr_level_ > 0 && cmp_largest < 0) {
|
|
232
214
|
// No more files to search in this level.
|
|
@@ -258,9 +240,6 @@ class FilePicker {
|
|
|
258
240
|
unsigned int hit_file_level_;
|
|
259
241
|
int32_t search_left_bound_;
|
|
260
242
|
int32_t search_right_bound_;
|
|
261
|
-
#ifndef NDEBUG
|
|
262
|
-
std::vector<FileMetaData*>* files_;
|
|
263
|
-
#endif
|
|
264
243
|
autovector<LevelFilesBrief>* level_files_brief_;
|
|
265
244
|
bool search_ended_;
|
|
266
245
|
bool is_hit_file_last_in_level_;
|
|
@@ -272,9 +251,6 @@ class FilePicker {
|
|
|
272
251
|
FileIndexer* file_indexer_;
|
|
273
252
|
const Comparator* user_comparator_;
|
|
274
253
|
const InternalKeyComparator* internal_comparator_;
|
|
275
|
-
#ifndef NDEBUG
|
|
276
|
-
FdWithKeyRange* prev_file_;
|
|
277
|
-
#endif
|
|
278
254
|
|
|
279
255
|
// Setup local variables to search next level.
|
|
280
256
|
// Returns false if there are no more levels to search.
|
|
@@ -344,9 +320,7 @@ class FilePicker {
|
|
|
344
320
|
}
|
|
345
321
|
start_index_in_curr_level_ = start_index;
|
|
346
322
|
curr_index_in_curr_level_ = start_index;
|
|
347
|
-
|
|
348
|
-
prev_file_ = nullptr;
|
|
349
|
-
#endif
|
|
323
|
+
|
|
350
324
|
return true;
|
|
351
325
|
}
|
|
352
326
|
// curr_level_ = num_levels_. So, no more levels to search.
|
|
@@ -408,7 +382,7 @@ class FilePickerMultiGet {
|
|
|
408
382
|
int GetCurrentLevel() const { return curr_level_; }
|
|
409
383
|
|
|
410
384
|
// Iterates through files in the current level until it finds a file that
|
|
411
|
-
// contains
|
|
385
|
+
// contains at least one key from the MultiGet batch
|
|
412
386
|
bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
|
|
413
387
|
size_t* file_index, FdWithKeyRange** fd,
|
|
414
388
|
bool* is_last_key_in_file) {
|
|
@@ -885,9 +859,10 @@ class LevelIterator final : public InternalIterator {
|
|
|
885
859
|
const FileOptions& file_options,
|
|
886
860
|
const InternalKeyComparator& icomparator,
|
|
887
861
|
const LevelFilesBrief* flevel,
|
|
888
|
-
const SliceTransform
|
|
889
|
-
HistogramImpl* file_read_hist,
|
|
890
|
-
bool skip_filters, int level,
|
|
862
|
+
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
|
863
|
+
bool should_sample, HistogramImpl* file_read_hist,
|
|
864
|
+
TableReaderCaller caller, bool skip_filters, int level,
|
|
865
|
+
RangeDelAggregator* range_del_agg,
|
|
891
866
|
const std::vector<AtomicCompactionUnitBoundary>*
|
|
892
867
|
compaction_boundaries = nullptr,
|
|
893
868
|
bool allow_unprepared_value = false)
|
|
@@ -907,7 +882,8 @@ class LevelIterator final : public InternalIterator {
|
|
|
907
882
|
level_(level),
|
|
908
883
|
range_del_agg_(range_del_agg),
|
|
909
884
|
pinned_iters_mgr_(nullptr),
|
|
910
|
-
compaction_boundaries_(compaction_boundaries)
|
|
885
|
+
compaction_boundaries_(compaction_boundaries),
|
|
886
|
+
is_next_read_sequential_(false) {
|
|
911
887
|
// Empty level is not supported.
|
|
912
888
|
assert(flevel_ != nullptr && flevel_->num_files > 0);
|
|
913
889
|
}
|
|
@@ -1037,7 +1013,7 @@ class LevelIterator final : public InternalIterator {
|
|
|
1037
1013
|
// `prefix_extractor_` may be non-null even for total order seek. Checking
|
|
1038
1014
|
// this variable is not the right way to identify whether prefix iterator
|
|
1039
1015
|
// is used.
|
|
1040
|
-
const SliceTransform
|
|
1016
|
+
const std::shared_ptr<const SliceTransform>& prefix_extractor_;
|
|
1041
1017
|
|
|
1042
1018
|
HistogramImpl* file_read_hist_;
|
|
1043
1019
|
bool should_sample_;
|
|
@@ -1054,6 +1030,8 @@ class LevelIterator final : public InternalIterator {
|
|
|
1054
1030
|
// To be propagated to RangeDelAggregator in order to safely truncate range
|
|
1055
1031
|
// tombstones.
|
|
1056
1032
|
const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
|
|
1033
|
+
|
|
1034
|
+
bool is_next_read_sequential_;
|
|
1057
1035
|
};
|
|
1058
1036
|
|
|
1059
1037
|
void LevelIterator::Seek(const Slice& target) {
|
|
@@ -1155,7 +1133,9 @@ bool LevelIterator::NextAndGetResult(IterateResult* result) {
|
|
|
1155
1133
|
assert(Valid());
|
|
1156
1134
|
bool is_valid = file_iter_.NextAndGetResult(result);
|
|
1157
1135
|
if (!is_valid) {
|
|
1136
|
+
is_next_read_sequential_ = true;
|
|
1158
1137
|
SkipEmptyFileForward();
|
|
1138
|
+
is_next_read_sequential_ = false;
|
|
1159
1139
|
is_valid = Valid();
|
|
1160
1140
|
if (is_valid) {
|
|
1161
1141
|
result->key = key();
|
|
@@ -1222,6 +1202,12 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
|
|
|
1222
1202
|
}
|
|
1223
1203
|
|
|
1224
1204
|
InternalIterator* old_iter = file_iter_.Set(iter);
|
|
1205
|
+
|
|
1206
|
+
// Update the read pattern for PrefetchBuffer.
|
|
1207
|
+
if (is_next_read_sequential_) {
|
|
1208
|
+
file_iter_.UpdateReadaheadState(old_iter);
|
|
1209
|
+
}
|
|
1210
|
+
|
|
1225
1211
|
if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
|
|
1226
1212
|
pinned_iters_mgr_->PinIterator(old_iter);
|
|
1227
1213
|
} else {
|
|
@@ -1259,7 +1245,7 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
|
|
|
1259
1245
|
auto ioptions = cfd_->ioptions();
|
|
1260
1246
|
Status s = table_cache->GetTableProperties(
|
|
1261
1247
|
file_options_, cfd_->internal_comparator(), file_meta->fd, tp,
|
|
1262
|
-
mutable_cf_options_.prefix_extractor
|
|
1248
|
+
mutable_cf_options_.prefix_extractor, true /* no io */);
|
|
1263
1249
|
if (s.ok()) {
|
|
1264
1250
|
return s;
|
|
1265
1251
|
}
|
|
@@ -1287,24 +1273,23 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
|
|
|
1287
1273
|
return s;
|
|
1288
1274
|
}
|
|
1289
1275
|
|
|
1290
|
-
|
|
1291
|
-
//
|
|
1292
|
-
// pass the magic number check in the footer.
|
|
1276
|
+
// By setting the magic number to kNullTableMagicNumber, we can bypass
|
|
1277
|
+
// the magic number check in the footer.
|
|
1293
1278
|
std::unique_ptr<RandomAccessFileReader> file_reader(
|
|
1294
1279
|
new RandomAccessFileReader(
|
|
1295
1280
|
std::move(file), file_name, nullptr /* env */, io_tracer_,
|
|
1296
1281
|
nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
|
|
1297
1282
|
nullptr /* rate_limiter */, ioptions->listeners));
|
|
1283
|
+
std::unique_ptr<TableProperties> props;
|
|
1298
1284
|
s = ReadTableProperties(
|
|
1299
1285
|
file_reader.get(), file_meta->fd.GetFileSize(),
|
|
1300
|
-
Footer::
|
|
1301
|
-
&
|
|
1286
|
+
Footer::kNullTableMagicNumber /* table's magic number */, *ioptions,
|
|
1287
|
+
&props);
|
|
1302
1288
|
if (!s.ok()) {
|
|
1303
1289
|
return s;
|
|
1304
1290
|
}
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
*tp = std::shared_ptr<const TableProperties>(raw_table_properties);
|
|
1291
|
+
*tp = std::move(props);
|
|
1292
|
+
RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
|
|
1308
1293
|
return s;
|
|
1309
1294
|
}
|
|
1310
1295
|
|
|
@@ -1453,7 +1438,7 @@ size_t Version::GetMemoryUsageByTableReaders() {
|
|
|
1453
1438
|
for (size_t i = 0; i < file_level.num_files; i++) {
|
|
1454
1439
|
total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
|
|
1455
1440
|
file_options_, cfd_->internal_comparator(), file_level.files[i].fd,
|
|
1456
|
-
mutable_cf_options_.prefix_extractor
|
|
1441
|
+
mutable_cf_options_.prefix_extractor);
|
|
1457
1442
|
}
|
|
1458
1443
|
}
|
|
1459
1444
|
return total_usage;
|
|
@@ -1468,6 +1453,10 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
|
|
|
1468
1453
|
cf_meta->file_count = 0;
|
|
1469
1454
|
cf_meta->levels.clear();
|
|
1470
1455
|
|
|
1456
|
+
cf_meta->blob_file_size = 0;
|
|
1457
|
+
cf_meta->blob_file_count = 0;
|
|
1458
|
+
cf_meta->blob_files.clear();
|
|
1459
|
+
|
|
1471
1460
|
auto* ioptions = cfd_->ioptions();
|
|
1472
1461
|
auto* vstorage = storage_info();
|
|
1473
1462
|
|
|
@@ -1485,15 +1474,16 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
|
|
|
1485
1474
|
file_path = ioptions->cf_paths.back().path;
|
|
1486
1475
|
}
|
|
1487
1476
|
const uint64_t file_number = file->fd.GetNumber();
|
|
1488
|
-
files.emplace_back(
|
|
1477
|
+
files.emplace_back(
|
|
1489
1478
|
MakeTableFileName("", file_number), file_number, file_path,
|
|
1490
|
-
|
|
1479
|
+
file->fd.GetFileSize(), file->fd.smallest_seqno,
|
|
1491
1480
|
file->fd.largest_seqno, file->smallest.user_key().ToString(),
|
|
1492
1481
|
file->largest.user_key().ToString(),
|
|
1493
1482
|
file->stats.num_reads_sampled.load(std::memory_order_relaxed),
|
|
1494
|
-
file->being_compacted, file->
|
|
1495
|
-
file->
|
|
1496
|
-
file->
|
|
1483
|
+
file->being_compacted, file->temperature,
|
|
1484
|
+
file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
|
|
1485
|
+
file->TryGetFileCreationTime(), file->file_checksum,
|
|
1486
|
+
file->file_checksum_func_name);
|
|
1497
1487
|
files.back().num_entries = file->num_entries;
|
|
1498
1488
|
files.back().num_deletions = file->num_deletions;
|
|
1499
1489
|
level_size += file->fd.GetFileSize();
|
|
@@ -1502,6 +1492,18 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
|
|
|
1502
1492
|
level, level_size, std::move(files));
|
|
1503
1493
|
cf_meta->size += level_size;
|
|
1504
1494
|
}
|
|
1495
|
+
for (const auto& meta : vstorage->GetBlobFiles()) {
|
|
1496
|
+
assert(meta);
|
|
1497
|
+
|
|
1498
|
+
cf_meta->blob_files.emplace_back(
|
|
1499
|
+
meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()),
|
|
1500
|
+
ioptions->cf_paths.front().path, meta->GetBlobFileSize(),
|
|
1501
|
+
meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(),
|
|
1502
|
+
meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(),
|
|
1503
|
+
meta->GetChecksumMethod(), meta->GetChecksumValue());
|
|
1504
|
+
++cf_meta->blob_file_count;
|
|
1505
|
+
cf_meta->blob_file_size += meta->GetBlobFileSize();
|
|
1506
|
+
}
|
|
1505
1507
|
}
|
|
1506
1508
|
|
|
1507
1509
|
uint64_t Version::GetSstFilesSize() {
|
|
@@ -1617,7 +1619,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
|
|
|
1617
1619
|
merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
|
|
1618
1620
|
read_options, soptions, cfd_->internal_comparator(),
|
|
1619
1621
|
*file.file_metadata, range_del_agg,
|
|
1620
|
-
mutable_cf_options_.prefix_extractor
|
|
1622
|
+
mutable_cf_options_.prefix_extractor, nullptr,
|
|
1621
1623
|
cfd_->internal_stats()->GetFileReadHist(0),
|
|
1622
1624
|
TableReaderCaller::kUserIterator, arena,
|
|
1623
1625
|
/*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
|
|
@@ -1641,7 +1643,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
|
|
|
1641
1643
|
merge_iter_builder->AddIterator(new (mem) LevelIterator(
|
|
1642
1644
|
cfd_->table_cache(), read_options, soptions,
|
|
1643
1645
|
cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
|
|
1644
|
-
mutable_cf_options_.prefix_extractor
|
|
1646
|
+
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
|
|
1645
1647
|
cfd_->internal_stats()->GetFileReadHist(level),
|
|
1646
1648
|
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
|
|
1647
1649
|
range_del_agg,
|
|
@@ -1676,7 +1678,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
|
|
|
1676
1678
|
ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
|
|
1677
1679
|
read_options, file_options, cfd_->internal_comparator(),
|
|
1678
1680
|
*file->file_metadata, &range_del_agg,
|
|
1679
|
-
mutable_cf_options_.prefix_extractor
|
|
1681
|
+
mutable_cf_options_.prefix_extractor, nullptr,
|
|
1680
1682
|
cfd_->internal_stats()->GetFileReadHist(0),
|
|
1681
1683
|
TableReaderCaller::kUserIterator, &arena,
|
|
1682
1684
|
/*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
|
|
@@ -1694,7 +1696,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
|
|
|
1694
1696
|
ScopedArenaIterator iter(new (mem) LevelIterator(
|
|
1695
1697
|
cfd_->table_cache(), read_options, file_options,
|
|
1696
1698
|
cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
|
|
1697
|
-
mutable_cf_options_.prefix_extractor
|
|
1699
|
+
mutable_cf_options_.prefix_extractor, should_sample_file_read(),
|
|
1698
1700
|
cfd_->internal_stats()->GetFileReadHist(level),
|
|
1699
1701
|
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
|
|
1700
1702
|
&range_del_agg));
|
|
@@ -1761,14 +1763,14 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
|
|
|
1761
1763
|
const std::shared_ptr<IOTracer>& io_tracer,
|
|
1762
1764
|
uint64_t version_number)
|
|
1763
1765
|
: env_(vset->env_),
|
|
1766
|
+
clock_(vset->clock_),
|
|
1764
1767
|
cfd_(column_family_data),
|
|
1765
|
-
info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->
|
|
1766
|
-
db_statistics_((cfd_ == nullptr) ? nullptr
|
|
1767
|
-
: cfd_->ioptions()->statistics),
|
|
1768
|
+
info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger),
|
|
1769
|
+
db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats),
|
|
1768
1770
|
table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
|
|
1769
1771
|
blob_file_cache_(cfd_ ? cfd_->blob_file_cache() : nullptr),
|
|
1770
|
-
merge_operator_(
|
|
1771
|
-
|
|
1772
|
+
merge_operator_(
|
|
1773
|
+
(cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()),
|
|
1772
1774
|
storage_info_(
|
|
1773
1775
|
(cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
|
|
1774
1776
|
(cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
|
|
@@ -1792,11 +1794,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
|
|
|
1792
1794
|
|
|
1793
1795
|
Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
|
|
1794
1796
|
const Slice& blob_index_slice,
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
return Status::Incomplete("Cannot read blob: no disk I/O allowed");
|
|
1798
|
-
}
|
|
1799
|
-
|
|
1797
|
+
FilePrefetchBuffer* prefetch_buffer,
|
|
1798
|
+
PinnableSlice* value, uint64_t* bytes_read) const {
|
|
1800
1799
|
BlobIndex blob_index;
|
|
1801
1800
|
|
|
1802
1801
|
{
|
|
@@ -1806,24 +1805,27 @@ Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
|
|
|
1806
1805
|
}
|
|
1807
1806
|
}
|
|
1808
1807
|
|
|
1809
|
-
return GetBlob(read_options, user_key, blob_index, value
|
|
1808
|
+
return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value,
|
|
1809
|
+
bytes_read);
|
|
1810
1810
|
}
|
|
1811
1811
|
|
|
1812
1812
|
Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
|
|
1813
1813
|
const BlobIndex& blob_index,
|
|
1814
|
-
|
|
1814
|
+
FilePrefetchBuffer* prefetch_buffer,
|
|
1815
|
+
PinnableSlice* value, uint64_t* bytes_read) const {
|
|
1815
1816
|
assert(value);
|
|
1816
1817
|
|
|
1818
|
+
if (read_options.read_tier == kBlockCacheTier) {
|
|
1819
|
+
return Status::Incomplete("Cannot read blob: no disk I/O allowed");
|
|
1820
|
+
}
|
|
1821
|
+
|
|
1817
1822
|
if (blob_index.HasTTL() || blob_index.IsInlined()) {
|
|
1818
1823
|
return Status::Corruption("Unexpected TTL/inlined blob index");
|
|
1819
1824
|
}
|
|
1820
1825
|
|
|
1821
|
-
const auto& blob_files = storage_info_.GetBlobFiles();
|
|
1822
|
-
|
|
1823
1826
|
const uint64_t blob_file_number = blob_index.file_number();
|
|
1824
1827
|
|
|
1825
|
-
|
|
1826
|
-
if (it == blob_files.end()) {
|
|
1828
|
+
if (!storage_info_.GetBlobFileMetaData(blob_file_number)) {
|
|
1827
1829
|
return Status::Corruption("Invalid blob file number");
|
|
1828
1830
|
}
|
|
1829
1831
|
|
|
@@ -1841,15 +1843,131 @@ Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
|
|
|
1841
1843
|
assert(blob_file_reader.GetValue());
|
|
1842
1844
|
const Status s = blob_file_reader.GetValue()->GetBlob(
|
|
1843
1845
|
read_options, user_key, blob_index.offset(), blob_index.size(),
|
|
1844
|
-
blob_index.compression(), value);
|
|
1846
|
+
blob_index.compression(), prefetch_buffer, value, bytes_read);
|
|
1845
1847
|
|
|
1846
1848
|
return s;
|
|
1847
1849
|
}
|
|
1848
1850
|
|
|
1851
|
+
void Version::MultiGetBlob(
|
|
1852
|
+
const ReadOptions& read_options, MultiGetRange& range,
|
|
1853
|
+
std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs) {
|
|
1854
|
+
if (read_options.read_tier == kBlockCacheTier) {
|
|
1855
|
+
Status s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
|
|
1856
|
+
for (const auto& elem : blob_rqs) {
|
|
1857
|
+
for (const auto& blob_rq : elem.second) {
|
|
1858
|
+
const KeyContext& key_context = blob_rq.second;
|
|
1859
|
+
assert(key_context.s);
|
|
1860
|
+
assert(key_context.s->ok());
|
|
1861
|
+
*(key_context.s) = s;
|
|
1862
|
+
assert(key_context.get_context);
|
|
1863
|
+
auto& get_context = *(key_context.get_context);
|
|
1864
|
+
get_context.MarkKeyMayExist();
|
|
1865
|
+
}
|
|
1866
|
+
}
|
|
1867
|
+
return;
|
|
1868
|
+
}
|
|
1869
|
+
|
|
1870
|
+
assert(!blob_rqs.empty());
|
|
1871
|
+
Status status;
|
|
1872
|
+
|
|
1873
|
+
for (auto& elem : blob_rqs) {
|
|
1874
|
+
const uint64_t blob_file_number = elem.first;
|
|
1875
|
+
|
|
1876
|
+
if (!storage_info_.GetBlobFileMetaData(blob_file_number)) {
|
|
1877
|
+
auto& blobs_in_file = elem.second;
|
|
1878
|
+
for (const auto& blob : blobs_in_file) {
|
|
1879
|
+
const KeyContext& key_context = blob.second;
|
|
1880
|
+
*(key_context.s) = Status::Corruption("Invalid blob file number");
|
|
1881
|
+
}
|
|
1882
|
+
continue;
|
|
1883
|
+
}
|
|
1884
|
+
|
|
1885
|
+
CacheHandleGuard<BlobFileReader> blob_file_reader;
|
|
1886
|
+
assert(blob_file_cache_);
|
|
1887
|
+
status = blob_file_cache_->GetBlobFileReader(blob_file_number,
|
|
1888
|
+
&blob_file_reader);
|
|
1889
|
+
assert(!status.ok() || blob_file_reader.GetValue());
|
|
1890
|
+
|
|
1891
|
+
auto& blobs_in_file = elem.second;
|
|
1892
|
+
if (!status.ok()) {
|
|
1893
|
+
for (const auto& blob : blobs_in_file) {
|
|
1894
|
+
const KeyContext& key_context = blob.second;
|
|
1895
|
+
*(key_context.s) = status;
|
|
1896
|
+
}
|
|
1897
|
+
continue;
|
|
1898
|
+
}
|
|
1899
|
+
|
|
1900
|
+
assert(blob_file_reader.GetValue());
|
|
1901
|
+
const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
|
|
1902
|
+
const CompressionType compression =
|
|
1903
|
+
blob_file_reader.GetValue()->GetCompressionType();
|
|
1904
|
+
|
|
1905
|
+
// sort blobs_in_file by file offset.
|
|
1906
|
+
std::sort(
|
|
1907
|
+
blobs_in_file.begin(), blobs_in_file.end(),
|
|
1908
|
+
[](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool {
|
|
1909
|
+
assert(lhs.first.file_number() == rhs.first.file_number());
|
|
1910
|
+
return lhs.first.offset() < rhs.first.offset();
|
|
1911
|
+
});
|
|
1912
|
+
|
|
1913
|
+
autovector<std::reference_wrapper<const KeyContext>> blob_read_key_contexts;
|
|
1914
|
+
autovector<std::reference_wrapper<const Slice>> user_keys;
|
|
1915
|
+
autovector<uint64_t> offsets;
|
|
1916
|
+
autovector<uint64_t> value_sizes;
|
|
1917
|
+
autovector<Status*> statuses;
|
|
1918
|
+
autovector<PinnableSlice*> values;
|
|
1919
|
+
for (const auto& blob : blobs_in_file) {
|
|
1920
|
+
const auto& blob_index = blob.first;
|
|
1921
|
+
const KeyContext& key_context = blob.second;
|
|
1922
|
+
if (blob_index.HasTTL() || blob_index.IsInlined()) {
|
|
1923
|
+
*(key_context.s) =
|
|
1924
|
+
Status::Corruption("Unexpected TTL/inlined blob index");
|
|
1925
|
+
continue;
|
|
1926
|
+
}
|
|
1927
|
+
const uint64_t key_size = key_context.ukey_with_ts.size();
|
|
1928
|
+
const uint64_t offset = blob_index.offset();
|
|
1929
|
+
const uint64_t value_size = blob_index.size();
|
|
1930
|
+
if (!IsValidBlobOffset(offset, key_size, value_size, file_size)) {
|
|
1931
|
+
*(key_context.s) = Status::Corruption("Invalid blob offset");
|
|
1932
|
+
continue;
|
|
1933
|
+
}
|
|
1934
|
+
if (blob_index.compression() != compression) {
|
|
1935
|
+
*(key_context.s) =
|
|
1936
|
+
Status::Corruption("Compression type mismatch when reading a blob");
|
|
1937
|
+
continue;
|
|
1938
|
+
}
|
|
1939
|
+
blob_read_key_contexts.emplace_back(std::cref(key_context));
|
|
1940
|
+
user_keys.emplace_back(std::cref(key_context.ukey_with_ts));
|
|
1941
|
+
offsets.push_back(blob_index.offset());
|
|
1942
|
+
value_sizes.push_back(blob_index.size());
|
|
1943
|
+
statuses.push_back(key_context.s);
|
|
1944
|
+
values.push_back(key_context.value);
|
|
1945
|
+
}
|
|
1946
|
+
blob_file_reader.GetValue()->MultiGetBlob(read_options, user_keys, offsets,
|
|
1947
|
+
value_sizes, statuses, values,
|
|
1948
|
+
/*bytes_read=*/nullptr);
|
|
1949
|
+
size_t num = blob_read_key_contexts.size();
|
|
1950
|
+
assert(num == user_keys.size());
|
|
1951
|
+
assert(num == offsets.size());
|
|
1952
|
+
assert(num == value_sizes.size());
|
|
1953
|
+
assert(num == statuses.size());
|
|
1954
|
+
assert(num == values.size());
|
|
1955
|
+
for (size_t i = 0; i < num; ++i) {
|
|
1956
|
+
if (statuses[i]->ok()) {
|
|
1957
|
+
range.AddValueSize(blob_read_key_contexts[i].get().value->size());
|
|
1958
|
+
if (range.GetValueSize() > read_options.value_size_soft_limit) {
|
|
1959
|
+
*(blob_read_key_contexts[i].get().s) = Status::Aborted();
|
|
1960
|
+
}
|
|
1961
|
+
}
|
|
1962
|
+
}
|
|
1963
|
+
}
|
|
1964
|
+
}
|
|
1965
|
+
|
|
1849
1966
|
void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
|
1850
1967
|
PinnableSlice* value, std::string* timestamp, Status* status,
|
|
1851
1968
|
MergeContext* merge_context,
|
|
1852
|
-
SequenceNumber* max_covering_tombstone_seq,
|
|
1969
|
+
SequenceNumber* max_covering_tombstone_seq,
|
|
1970
|
+
PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
|
|
1853
1971
|
bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
|
|
1854
1972
|
bool* is_blob, bool do_merge) {
|
|
1855
1973
|
Slice ikey = k.internal_key();
|
|
@@ -1862,7 +1980,6 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
|
|
1862
1980
|
*key_exists = true;
|
|
1863
1981
|
}
|
|
1864
1982
|
|
|
1865
|
-
PinnedIteratorsManager pinned_iters_mgr;
|
|
1866
1983
|
uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
|
|
1867
1984
|
if (vset_ && vset_->block_cache_tracer_ &&
|
|
1868
1985
|
vset_->block_cache_tracer_->is_tracing_enabled()) {
|
|
@@ -1874,24 +1991,26 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
|
|
1874
1991
|
// need to provide it here.
|
|
1875
1992
|
bool is_blob_index = false;
|
|
1876
1993
|
bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index;
|
|
1994
|
+
BlobFetcher blob_fetcher(this, read_options);
|
|
1877
1995
|
|
|
1996
|
+
assert(pinned_iters_mgr);
|
|
1878
1997
|
GetContext get_context(
|
|
1879
1998
|
user_comparator(), merge_operator_, info_log_, db_statistics_,
|
|
1880
1999
|
status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
|
|
1881
2000
|
do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found,
|
|
1882
|
-
merge_context, do_merge, max_covering_tombstone_seq,
|
|
1883
|
-
merge_operator_ ?
|
|
1884
|
-
tracing_get_id);
|
|
2001
|
+
merge_context, do_merge, max_covering_tombstone_seq, clock_, seq,
|
|
2002
|
+
merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
|
|
2003
|
+
tracing_get_id, &blob_fetcher);
|
|
1885
2004
|
|
|
1886
2005
|
// Pin blocks that we read to hold merge operands
|
|
1887
2006
|
if (merge_operator_) {
|
|
1888
|
-
pinned_iters_mgr
|
|
2007
|
+
pinned_iters_mgr->StartPinning();
|
|
1889
2008
|
}
|
|
1890
2009
|
|
|
1891
|
-
FilePicker fp(
|
|
1892
|
-
|
|
1893
|
-
|
|
1894
|
-
|
|
2010
|
+
FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_,
|
|
2011
|
+
storage_info_.num_non_empty_levels_,
|
|
2012
|
+
&storage_info_.file_indexer_, user_comparator(),
|
|
2013
|
+
internal_comparator());
|
|
1895
2014
|
FdWithKeyRange* f = fp.GetNextFile();
|
|
1896
2015
|
|
|
1897
2016
|
while (f != nullptr) {
|
|
@@ -1907,10 +2026,10 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
|
|
1907
2026
|
bool timer_enabled =
|
|
1908
2027
|
GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
|
|
1909
2028
|
get_perf_context()->per_level_perf_context_enabled;
|
|
1910
|
-
StopWatchNano timer(
|
|
2029
|
+
StopWatchNano timer(clock_, timer_enabled /* auto_start */);
|
|
1911
2030
|
*status = table_cache_->Get(
|
|
1912
2031
|
read_options, *internal_comparator(), *f->file_metadata, ikey,
|
|
1913
|
-
&get_context, mutable_cf_options_.prefix_extractor
|
|
2032
|
+
&get_context, mutable_cf_options_.prefix_extractor,
|
|
1914
2033
|
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
|
|
1915
2034
|
IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
|
|
1916
2035
|
fp.IsHitFileLastInLevel()),
|
|
@@ -1921,6 +2040,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
|
|
1921
2040
|
fp.GetHitFileLevel());
|
|
1922
2041
|
}
|
|
1923
2042
|
if (!status->ok()) {
|
|
2043
|
+
if (db_statistics_ != nullptr) {
|
|
2044
|
+
get_context.ReportCounters();
|
|
2045
|
+
}
|
|
1924
2046
|
return;
|
|
1925
2047
|
}
|
|
1926
2048
|
|
|
@@ -1951,7 +2073,14 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
|
|
1951
2073
|
|
|
1952
2074
|
if (is_blob_index) {
|
|
1953
2075
|
if (do_merge && value) {
|
|
1954
|
-
|
|
2076
|
+
TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex",
|
|
2077
|
+
value);
|
|
2078
|
+
|
|
2079
|
+
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
|
|
2080
|
+
constexpr uint64_t* bytes_read = nullptr;
|
|
2081
|
+
|
|
2082
|
+
*status = GetBlob(read_options, user_key, *value, prefetch_buffer,
|
|
2083
|
+
value, bytes_read);
|
|
1955
2084
|
if (!status->ok()) {
|
|
1956
2085
|
if (status->IsIncomplete()) {
|
|
1957
2086
|
get_context.MarkKeyMayExist();
|
|
@@ -1996,7 +2125,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
|
|
1996
2125
|
std::string* str_value = value != nullptr ? value->GetSelf() : nullptr;
|
|
1997
2126
|
*status = MergeHelper::TimedFullMerge(
|
|
1998
2127
|
merge_operator_, user_key, nullptr, merge_context->GetOperands(),
|
|
1999
|
-
str_value, info_log_, db_statistics_,
|
|
2128
|
+
str_value, info_log_, db_statistics_, clock_,
|
|
2000
2129
|
nullptr /* result_operand */, true);
|
|
2001
2130
|
if (LIKELY(value != nullptr)) {
|
|
2002
2131
|
value->PinSelf();
|
|
@@ -2027,15 +2156,16 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
|
2027
2156
|
// use autovector in order to avoid unnecessary construction of GetContext
|
|
2028
2157
|
// objects, which is expensive
|
|
2029
2158
|
autovector<GetContext, 16> get_ctx;
|
|
2159
|
+
BlobFetcher blob_fetcher(this, read_options);
|
|
2030
2160
|
for (auto iter = range->begin(); iter != range->end(); ++iter) {
|
|
2031
2161
|
assert(iter->s->ok() || iter->s->IsMergeInProgress());
|
|
2032
2162
|
get_ctx.emplace_back(
|
|
2033
2163
|
user_comparator(), merge_operator_, info_log_, db_statistics_,
|
|
2034
2164
|
iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge,
|
|
2035
2165
|
iter->ukey_with_ts, iter->value, iter->timestamp, nullptr,
|
|
2036
|
-
&(iter->merge_context), true, &iter->max_covering_tombstone_seq,
|
|
2037
|
-
|
|
2038
|
-
|
|
2166
|
+
&(iter->merge_context), true, &iter->max_covering_tombstone_seq, clock_,
|
|
2167
|
+
nullptr, merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
|
|
2168
|
+
&iter->is_blob_index, tracing_mget_id, &blob_fetcher);
|
|
2039
2169
|
// MergeInProgress status, if set, has been transferred to the get_context
|
|
2040
2170
|
// state, so we set status to ok here. From now on, the iter status will
|
|
2041
2171
|
// be used for IO errors, and get_context state will be used for any
|
|
@@ -2060,15 +2190,38 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
|
2060
2190
|
uint64_t num_data_read = 0;
|
|
2061
2191
|
uint64_t num_sst_read = 0;
|
|
2062
2192
|
|
|
2193
|
+
MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
|
|
2194
|
+
// blob_file => [[blob_idx, it], ...]
|
|
2195
|
+
std::unordered_map<uint64_t, BlobReadRequests> blob_rqs;
|
|
2196
|
+
int level = -1;
|
|
2197
|
+
|
|
2063
2198
|
while (f != nullptr) {
|
|
2064
2199
|
MultiGetRange file_range = fp.CurrentFileRange();
|
|
2065
2200
|
bool timer_enabled =
|
|
2066
2201
|
GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
|
|
2067
2202
|
get_perf_context()->per_level_perf_context_enabled;
|
|
2068
|
-
|
|
2203
|
+
|
|
2204
|
+
// Report MultiGet stats per level.
|
|
2205
|
+
if (level >= 0 && level != (int)fp.GetHitFileLevel()) {
|
|
2206
|
+
// Dump the stats if the search has moved to the next level and
|
|
2207
|
+
// reset for next level.
|
|
2208
|
+
RecordInHistogram(db_statistics_,
|
|
2209
|
+
NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
|
|
2210
|
+
num_index_read + num_filter_read);
|
|
2211
|
+
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
|
|
2212
|
+
num_data_read);
|
|
2213
|
+
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
|
|
2214
|
+
num_filter_read = 0;
|
|
2215
|
+
num_index_read = 0;
|
|
2216
|
+
num_data_read = 0;
|
|
2217
|
+
num_sst_read = 0;
|
|
2218
|
+
level = fp.GetHitFileLevel();
|
|
2219
|
+
}
|
|
2220
|
+
|
|
2221
|
+
StopWatchNano timer(clock_, timer_enabled /* auto_start */);
|
|
2069
2222
|
s = table_cache_->MultiGet(
|
|
2070
2223
|
read_options, *internal_comparator(), *f->file_metadata, &file_range,
|
|
2071
|
-
mutable_cf_options_.prefix_extractor
|
|
2224
|
+
mutable_cf_options_.prefix_extractor,
|
|
2072
2225
|
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
|
|
2073
2226
|
IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
|
|
2074
2227
|
fp.IsHitFileLastInLevel()),
|
|
@@ -2109,6 +2262,11 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
|
2109
2262
|
num_filter_read += get_context.get_context_stats_.num_filter_read;
|
|
2110
2263
|
num_data_read += get_context.get_context_stats_.num_data_read;
|
|
2111
2264
|
num_sst_read += get_context.get_context_stats_.num_sst_read;
|
|
2265
|
+
// Reset these stats since they're specific to a level
|
|
2266
|
+
get_context.get_context_stats_.num_index_read = 0;
|
|
2267
|
+
get_context.get_context_stats_.num_filter_read = 0;
|
|
2268
|
+
get_context.get_context_stats_.num_data_read = 0;
|
|
2269
|
+
get_context.get_context_stats_.num_sst_read = 0;
|
|
2112
2270
|
|
|
2113
2271
|
// report the counters before returning
|
|
2114
2272
|
if (get_context.State() != GetContext::kNotFound &&
|
|
@@ -2145,22 +2303,27 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
|
2145
2303
|
|
|
2146
2304
|
if (iter->is_blob_index) {
|
|
2147
2305
|
if (iter->value) {
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2306
|
+
TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex",
|
|
2307
|
+
&(*iter));
|
|
2308
|
+
|
|
2309
|
+
const Slice& blob_index_slice = *(iter->value);
|
|
2310
|
+
BlobIndex blob_index;
|
|
2311
|
+
Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
|
|
2312
|
+
if (tmp_s.ok()) {
|
|
2313
|
+
const uint64_t blob_file_num = blob_index.file_number();
|
|
2314
|
+
blob_rqs[blob_file_num].emplace_back(
|
|
2315
|
+
std::make_pair(blob_index, std::cref(*iter)));
|
|
2316
|
+
} else {
|
|
2317
|
+
*(iter->s) = tmp_s;
|
|
2156
2318
|
}
|
|
2157
2319
|
}
|
|
2158
|
-
}
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2320
|
+
} else {
|
|
2321
|
+
file_range.AddValueSize(iter->value->size());
|
|
2322
|
+
if (file_range.GetValueSize() >
|
|
2323
|
+
read_options.value_size_soft_limit) {
|
|
2324
|
+
s = Status::Aborted();
|
|
2325
|
+
break;
|
|
2326
|
+
}
|
|
2164
2327
|
}
|
|
2165
2328
|
continue;
|
|
2166
2329
|
case GetContext::kDeleted:
|
|
@@ -2183,22 +2346,6 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
|
2183
2346
|
}
|
|
2184
2347
|
}
|
|
2185
2348
|
|
|
2186
|
-
// Report MultiGet stats per level.
|
|
2187
|
-
if (fp.IsHitFileLastInLevel()) {
|
|
2188
|
-
// Dump the stats if this is the last file of this level and reset for
|
|
2189
|
-
// next level.
|
|
2190
|
-
RecordInHistogram(db_statistics_,
|
|
2191
|
-
NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
|
|
2192
|
-
num_index_read + num_filter_read);
|
|
2193
|
-
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
|
|
2194
|
-
num_data_read);
|
|
2195
|
-
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
|
|
2196
|
-
num_filter_read = 0;
|
|
2197
|
-
num_index_read = 0;
|
|
2198
|
-
num_data_read = 0;
|
|
2199
|
-
num_sst_read = 0;
|
|
2200
|
-
}
|
|
2201
|
-
|
|
2202
2349
|
RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
|
|
2203
2350
|
if (!s.ok() || file_picker_range.empty()) {
|
|
2204
2351
|
break;
|
|
@@ -2206,6 +2353,17 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
|
2206
2353
|
f = fp.GetNextFile();
|
|
2207
2354
|
}
|
|
2208
2355
|
|
|
2356
|
+
// Dump stats for most recent level
|
|
2357
|
+
RecordInHistogram(db_statistics_, NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
|
|
2358
|
+
num_index_read + num_filter_read);
|
|
2359
|
+
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
|
|
2360
|
+
num_data_read);
|
|
2361
|
+
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
|
|
2362
|
+
|
|
2363
|
+
if (s.ok() && !blob_rqs.empty()) {
|
|
2364
|
+
MultiGetBlob(read_options, keys_with_blobs_range, blob_rqs);
|
|
2365
|
+
}
|
|
2366
|
+
|
|
2209
2367
|
// Process any left over keys
|
|
2210
2368
|
for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) {
|
|
2211
2369
|
GetContext& get_context = *iter->get_context;
|
|
@@ -2228,7 +2386,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
|
2228
2386
|
iter->value != nullptr ? iter->value->GetSelf() : nullptr;
|
|
2229
2387
|
*status = MergeHelper::TimedFullMerge(
|
|
2230
2388
|
merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(),
|
|
2231
|
-
str_value, info_log_, db_statistics_,
|
|
2389
|
+
str_value, info_log_, db_statistics_, clock_,
|
|
2232
2390
|
nullptr /* result_operand */, true);
|
|
2233
2391
|
if (LIKELY(iter->value != nullptr)) {
|
|
2234
2392
|
iter->value->PinSelf();
|
|
@@ -2267,20 +2425,31 @@ void VersionStorageInfo::GenerateLevelFilesBrief() {
|
|
|
2267
2425
|
}
|
|
2268
2426
|
}
|
|
2269
2427
|
|
|
2270
|
-
void
|
|
2271
|
-
const
|
|
2272
|
-
|
|
2428
|
+
void VersionStorageInfo::PrepareForVersionAppend(
|
|
2429
|
+
const ImmutableOptions& immutable_options,
|
|
2430
|
+
const MutableCFOptions& mutable_cf_options) {
|
|
2431
|
+
ComputeCompensatedSizes();
|
|
2432
|
+
UpdateNumNonEmptyLevels();
|
|
2433
|
+
CalculateBaseBytes(immutable_options, mutable_cf_options);
|
|
2434
|
+
UpdateFilesByCompactionPri(immutable_options, mutable_cf_options);
|
|
2435
|
+
GenerateFileIndexer();
|
|
2436
|
+
GenerateLevelFilesBrief();
|
|
2437
|
+
GenerateLevel0NonOverlapping();
|
|
2438
|
+
GenerateBottommostFiles();
|
|
2439
|
+
GenerateFileLocationIndex();
|
|
2440
|
+
}
|
|
2441
|
+
|
|
2442
|
+
void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options,
|
|
2443
|
+
bool update_stats) {
|
|
2273
2444
|
TEST_SYNC_POINT_CALLBACK(
|
|
2274
|
-
"Version::
|
|
2445
|
+
"Version::PrepareAppend:forced_check",
|
|
2275
2446
|
reinterpret_cast<void*>(&storage_info_.force_consistency_checks_));
|
|
2276
|
-
|
|
2277
|
-
|
|
2278
|
-
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
storage_info_.
|
|
2282
|
-
storage_info_.GenerateLevel0NonOverlapping();
|
|
2283
|
-
storage_info_.GenerateBottommostFiles();
|
|
2447
|
+
|
|
2448
|
+
if (update_stats) {
|
|
2449
|
+
UpdateAccumulatedStats();
|
|
2450
|
+
}
|
|
2451
|
+
|
|
2452
|
+
storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options);
|
|
2284
2453
|
}
|
|
2285
2454
|
|
|
2286
2455
|
bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
|
|
@@ -2334,59 +2503,54 @@ void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
|
|
|
2334
2503
|
}
|
|
2335
2504
|
}
|
|
2336
2505
|
|
|
2337
|
-
void Version::UpdateAccumulatedStats(
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
|
|
2357
|
-
|
|
2358
|
-
|
|
2359
|
-
|
|
2360
|
-
|
|
2361
|
-
|
|
2362
|
-
|
|
2363
|
-
|
|
2364
|
-
|
|
2365
|
-
|
|
2366
|
-
|
|
2367
|
-
|
|
2368
|
-
|
|
2369
|
-
break;
|
|
2370
|
-
}
|
|
2506
|
+
void Version::UpdateAccumulatedStats() {
|
|
2507
|
+
// maximum number of table properties loaded from files.
|
|
2508
|
+
const int kMaxInitCount = 20;
|
|
2509
|
+
int init_count = 0;
|
|
2510
|
+
// here only the first kMaxInitCount files which haven't been
|
|
2511
|
+
// initialized from file will be updated with num_deletions.
|
|
2512
|
+
// The motivation here is to cap the maximum I/O per Version creation.
|
|
2513
|
+
// The reason for choosing files from lower-level instead of higher-level
|
|
2514
|
+
// is that such design is able to propagate the initialization from
|
|
2515
|
+
// lower-level to higher-level: When the num_deletions of lower-level
|
|
2516
|
+
// files are updated, it will make the lower-level files have accurate
|
|
2517
|
+
// compensated_file_size, so that lower-level to higher-level compaction
|
|
2518
|
+
// will be triggered, which creates higher-level files whose num_deletions
|
|
2519
|
+
// will be updated here.
|
|
2520
|
+
for (int level = 0;
|
|
2521
|
+
level < storage_info_.num_levels_ && init_count < kMaxInitCount;
|
|
2522
|
+
++level) {
|
|
2523
|
+
for (auto* file_meta : storage_info_.files_[level]) {
|
|
2524
|
+
if (MaybeInitializeFileMetaData(file_meta)) {
|
|
2525
|
+
// each FileMeta will be initialized only once.
|
|
2526
|
+
storage_info_.UpdateAccumulatedStats(file_meta);
|
|
2527
|
+
// when option "max_open_files" is -1, all the file metadata has
|
|
2528
|
+
// already been read, so MaybeInitializeFileMetaData() won't incur
|
|
2529
|
+
// any I/O cost. "max_open_files=-1" means that the table cache passed
|
|
2530
|
+
// to the VersionSet and then to the ColumnFamilySet has a size of
|
|
2531
|
+
// TableCache::kInfiniteCapacity
|
|
2532
|
+
if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
|
|
2533
|
+
TableCache::kInfiniteCapacity) {
|
|
2534
|
+
continue;
|
|
2535
|
+
}
|
|
2536
|
+
if (++init_count >= kMaxInitCount) {
|
|
2537
|
+
break;
|
|
2371
2538
|
}
|
|
2372
2539
|
}
|
|
2373
2540
|
}
|
|
2374
|
-
|
|
2375
|
-
|
|
2376
|
-
|
|
2377
|
-
|
|
2378
|
-
|
|
2379
|
-
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2383
|
-
|
|
2384
|
-
}
|
|
2541
|
+
}
|
|
2542
|
+
// In case all sampled-files contain only deletion entries, then we
|
|
2543
|
+
// load the table-property of a file in higher-level to initialize
|
|
2544
|
+
// that value.
|
|
2545
|
+
for (int level = storage_info_.num_levels_ - 1;
|
|
2546
|
+
storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
|
|
2547
|
+
for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
|
|
2548
|
+
storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
|
|
2549
|
+
if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
|
|
2550
|
+
storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
|
|
2385
2551
|
}
|
|
2386
2552
|
}
|
|
2387
2553
|
}
|
|
2388
|
-
|
|
2389
|
-
storage_info_.ComputeCompensatedSizes();
|
|
2390
2554
|
}
|
|
2391
2555
|
|
|
2392
2556
|
void VersionStorageInfo::ComputeCompensatedSizes() {
|
|
@@ -2521,13 +2685,13 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded(
|
|
|
2521
2685
|
}
|
|
2522
2686
|
|
|
2523
2687
|
namespace {
|
|
2524
|
-
uint32_t GetExpiredTtlFilesCount(const
|
|
2688
|
+
uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
|
|
2525
2689
|
const MutableCFOptions& mutable_cf_options,
|
|
2526
2690
|
const std::vector<FileMetaData*>& files) {
|
|
2527
2691
|
uint32_t ttl_expired_files_count = 0;
|
|
2528
2692
|
|
|
2529
2693
|
int64_t _current_time;
|
|
2530
|
-
auto status = ioptions.
|
|
2694
|
+
auto status = ioptions.clock->GetCurrentTime(&_current_time);
|
|
2531
2695
|
if (status.ok()) {
|
|
2532
2696
|
const uint64_t current_time = static_cast<uint64_t>(_current_time);
|
|
2533
2697
|
for (FileMetaData* f : files) {
|
|
@@ -2545,7 +2709,7 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions,
|
|
|
2545
2709
|
} // anonymous namespace
|
|
2546
2710
|
|
|
2547
2711
|
void VersionStorageInfo::ComputeCompactionScore(
|
|
2548
|
-
const
|
|
2712
|
+
const ImmutableOptions& immutable_options,
|
|
2549
2713
|
const MutableCFOptions& mutable_cf_options) {
|
|
2550
2714
|
for (int level = 0; level <= MaxInputLevel(); level++) {
|
|
2551
2715
|
double score;
|
|
@@ -2588,7 +2752,12 @@ void VersionStorageInfo::ComputeCompactionScore(
|
|
|
2588
2752
|
if (compaction_style_ == kCompactionStyleFIFO) {
|
|
2589
2753
|
score = static_cast<double>(total_size) /
|
|
2590
2754
|
mutable_cf_options.compaction_options_fifo.max_table_files_size;
|
|
2591
|
-
if (mutable_cf_options.compaction_options_fifo.allow_compaction
|
|
2755
|
+
if (mutable_cf_options.compaction_options_fifo.allow_compaction ||
|
|
2756
|
+
mutable_cf_options.compaction_options_fifo.age_for_warm > 0) {
|
|
2757
|
+
// Warm tier move can happen at any time. It's too expensive to
|
|
2758
|
+
// check every file's timestamp now. For now, just trigger it
|
|
2759
|
+
// slightly more frequently than FIFO compaction so that this
|
|
2760
|
+
// happens first.
|
|
2592
2761
|
score = std::max(
|
|
2593
2762
|
static_cast<double>(num_sorted_runs) /
|
|
2594
2763
|
mutable_cf_options.level0_file_num_compaction_trigger,
|
|
@@ -2597,10 +2766,9 @@ void VersionStorageInfo::ComputeCompactionScore(
|
|
|
2597
2766
|
if (mutable_cf_options.ttl > 0) {
|
|
2598
2767
|
score = std::max(
|
|
2599
2768
|
static_cast<double>(GetExpiredTtlFilesCount(
|
|
2600
|
-
|
|
2769
|
+
immutable_options, mutable_cf_options, files_[level])),
|
|
2601
2770
|
score);
|
|
2602
2771
|
}
|
|
2603
|
-
|
|
2604
2772
|
} else {
|
|
2605
2773
|
score = static_cast<double>(num_sorted_runs) /
|
|
2606
2774
|
mutable_cf_options.level0_file_num_compaction_trigger;
|
|
@@ -2609,7 +2777,7 @@ void VersionStorageInfo::ComputeCompactionScore(
|
|
|
2609
2777
|
// L0 files. Take into account size as well to avoid later giant
|
|
2610
2778
|
// compactions to the base level.
|
|
2611
2779
|
uint64_t l0_target_size = mutable_cf_options.max_bytes_for_level_base;
|
|
2612
|
-
if (
|
|
2780
|
+
if (immutable_options.level_compaction_dynamic_level_bytes &&
|
|
2613
2781
|
level_multiplier_ != 0.0) {
|
|
2614
2782
|
// Prevent L0 to Lbase fanout from growing larger than
|
|
2615
2783
|
// `level_multiplier_`. This prevents us from getting stuck picking
|
|
@@ -2657,12 +2825,21 @@ void VersionStorageInfo::ComputeCompactionScore(
|
|
|
2657
2825
|
ComputeFilesMarkedForCompaction();
|
|
2658
2826
|
ComputeBottommostFilesMarkedForCompaction();
|
|
2659
2827
|
if (mutable_cf_options.ttl > 0) {
|
|
2660
|
-
ComputeExpiredTtlFiles(
|
|
2828
|
+
ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
|
|
2661
2829
|
}
|
|
2662
2830
|
if (mutable_cf_options.periodic_compaction_seconds > 0) {
|
|
2663
2831
|
ComputeFilesMarkedForPeriodicCompaction(
|
|
2664
|
-
|
|
2832
|
+
immutable_options, mutable_cf_options.periodic_compaction_seconds);
|
|
2833
|
+
}
|
|
2834
|
+
|
|
2835
|
+
if (mutable_cf_options.enable_blob_garbage_collection &&
|
|
2836
|
+
mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 &&
|
|
2837
|
+
mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) {
|
|
2838
|
+
ComputeFilesMarkedForForcedBlobGC(
|
|
2839
|
+
mutable_cf_options.blob_garbage_collection_age_cutoff,
|
|
2840
|
+
mutable_cf_options.blob_garbage_collection_force_threshold);
|
|
2665
2841
|
}
|
|
2842
|
+
|
|
2666
2843
|
EstimateCompactionBytesNeeded(mutable_cf_options);
|
|
2667
2844
|
}
|
|
2668
2845
|
|
|
@@ -2690,13 +2867,13 @@ void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
|
|
|
2690
2867
|
}
|
|
2691
2868
|
|
|
2692
2869
|
void VersionStorageInfo::ComputeExpiredTtlFiles(
|
|
2693
|
-
const
|
|
2870
|
+
const ImmutableOptions& ioptions, const uint64_t ttl) {
|
|
2694
2871
|
assert(ttl > 0);
|
|
2695
2872
|
|
|
2696
2873
|
expired_ttl_files_.clear();
|
|
2697
2874
|
|
|
2698
2875
|
int64_t _current_time;
|
|
2699
|
-
auto status = ioptions.
|
|
2876
|
+
auto status = ioptions.clock->GetCurrentTime(&_current_time);
|
|
2700
2877
|
if (!status.ok()) {
|
|
2701
2878
|
return;
|
|
2702
2879
|
}
|
|
@@ -2716,14 +2893,14 @@ void VersionStorageInfo::ComputeExpiredTtlFiles(
|
|
|
2716
2893
|
}
|
|
2717
2894
|
|
|
2718
2895
|
void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
|
|
2719
|
-
const
|
|
2896
|
+
const ImmutableOptions& ioptions,
|
|
2720
2897
|
const uint64_t periodic_compaction_seconds) {
|
|
2721
2898
|
assert(periodic_compaction_seconds > 0);
|
|
2722
2899
|
|
|
2723
2900
|
files_marked_for_periodic_compaction_.clear();
|
|
2724
2901
|
|
|
2725
2902
|
int64_t temp_current_time;
|
|
2726
|
-
auto status = ioptions.
|
|
2903
|
+
auto status = ioptions.clock->GetCurrentTime(&temp_current_time);
|
|
2727
2904
|
if (!status.ok()) {
|
|
2728
2905
|
return;
|
|
2729
2906
|
}
|
|
@@ -2757,7 +2934,7 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
|
|
|
2757
2934
|
status = ioptions.env->GetFileModificationTime(
|
|
2758
2935
|
file_path, &file_modification_time);
|
|
2759
2936
|
if (!status.ok()) {
|
|
2760
|
-
ROCKS_LOG_WARN(ioptions.
|
|
2937
|
+
ROCKS_LOG_WARN(ioptions.logger,
|
|
2761
2938
|
"Can't get file modification time: %s: %s",
|
|
2762
2939
|
file_path.c_str(), status.ToString().c_str());
|
|
2763
2940
|
continue;
|
|
@@ -2772,6 +2949,112 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
|
|
|
2772
2949
|
}
|
|
2773
2950
|
}
|
|
2774
2951
|
|
|
2952
|
+
void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC(
|
|
2953
|
+
double blob_garbage_collection_age_cutoff,
|
|
2954
|
+
double blob_garbage_collection_force_threshold) {
|
|
2955
|
+
files_marked_for_forced_blob_gc_.clear();
|
|
2956
|
+
|
|
2957
|
+
if (blob_files_.empty()) {
|
|
2958
|
+
return;
|
|
2959
|
+
}
|
|
2960
|
+
|
|
2961
|
+
// Number of blob files eligible for GC based on age
|
|
2962
|
+
const size_t cutoff_count = static_cast<size_t>(
|
|
2963
|
+
blob_garbage_collection_age_cutoff * blob_files_.size());
|
|
2964
|
+
if (!cutoff_count) {
|
|
2965
|
+
return;
|
|
2966
|
+
}
|
|
2967
|
+
|
|
2968
|
+
// Compute the sum of total and garbage bytes over the oldest batch of blob
|
|
2969
|
+
// files. The oldest batch is defined as the set of blob files which are
|
|
2970
|
+
// kept alive by the same SSTs as the very oldest one. Here is a toy example.
|
|
2971
|
+
// Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11,
|
|
2972
|
+
// 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and
|
|
2973
|
+
// potentially some higher-numbered ones, while SST 3 relies on blob file 12
|
|
2974
|
+
// and potentially some higher-numbered ones. Then, the SST to oldest blob
|
|
2975
|
+
// file mapping is as follows:
|
|
2976
|
+
//
|
|
2977
|
+
// SST file number Oldest blob file number
|
|
2978
|
+
// 1 10
|
|
2979
|
+
// 2 10
|
|
2980
|
+
// 3 12
|
|
2981
|
+
//
|
|
2982
|
+
// This is what the same thing looks like from the blob files' POV. (Note that
|
|
2983
|
+
// the linked SSTs simply denote the inverse mapping of the above.)
|
|
2984
|
+
//
|
|
2985
|
+
// Blob file number Linked SST set
|
|
2986
|
+
// 10 {1, 2}
|
|
2987
|
+
// 11 {}
|
|
2988
|
+
// 12 {3}
|
|
2989
|
+
// 13 {}
|
|
2990
|
+
//
|
|
2991
|
+
// Then, the oldest batch of blob files consists of blob files 10 and 11,
|
|
2992
|
+
// and we can get rid of them by forcing the compaction of SSTs 1 and 2.
|
|
2993
|
+
//
|
|
2994
|
+
// Note that the overall ratio of garbage computed for the batch has to exceed
|
|
2995
|
+
// blob_garbage_collection_force_threshold and the entire batch has to be
|
|
2996
|
+
// eligible for GC according to blob_garbage_collection_age_cutoff in order
|
|
2997
|
+
// for us to schedule any compactions.
|
|
2998
|
+
const auto& oldest_meta = blob_files_.front();
|
|
2999
|
+
assert(oldest_meta);
|
|
3000
|
+
|
|
3001
|
+
const auto& linked_ssts = oldest_meta->GetLinkedSsts();
|
|
3002
|
+
assert(!linked_ssts.empty());
|
|
3003
|
+
|
|
3004
|
+
size_t count = 1;
|
|
3005
|
+
uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes();
|
|
3006
|
+
uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes();
|
|
3007
|
+
|
|
3008
|
+
assert(cutoff_count <= blob_files_.size());
|
|
3009
|
+
|
|
3010
|
+
for (; count < cutoff_count; ++count) {
|
|
3011
|
+
const auto& meta = blob_files_[count];
|
|
3012
|
+
assert(meta);
|
|
3013
|
+
|
|
3014
|
+
if (!meta->GetLinkedSsts().empty()) {
|
|
3015
|
+
// Found the beginning of the next batch of blob files
|
|
3016
|
+
break;
|
|
3017
|
+
}
|
|
3018
|
+
|
|
3019
|
+
sum_total_blob_bytes += meta->GetTotalBlobBytes();
|
|
3020
|
+
sum_garbage_blob_bytes += meta->GetGarbageBlobBytes();
|
|
3021
|
+
}
|
|
3022
|
+
|
|
3023
|
+
if (count < blob_files_.size()) {
|
|
3024
|
+
const auto& meta = blob_files_[count];
|
|
3025
|
+
assert(meta);
|
|
3026
|
+
|
|
3027
|
+
if (meta->GetLinkedSsts().empty()) {
|
|
3028
|
+
// Some files in the oldest batch are not eligible for GC
|
|
3029
|
+
return;
|
|
3030
|
+
}
|
|
3031
|
+
}
|
|
3032
|
+
|
|
3033
|
+
if (sum_garbage_blob_bytes <
|
|
3034
|
+
blob_garbage_collection_force_threshold * sum_total_blob_bytes) {
|
|
3035
|
+
return;
|
|
3036
|
+
}
|
|
3037
|
+
|
|
3038
|
+
for (uint64_t sst_file_number : linked_ssts) {
|
|
3039
|
+
const FileLocation location = GetFileLocation(sst_file_number);
|
|
3040
|
+
assert(location.IsValid());
|
|
3041
|
+
|
|
3042
|
+
const int level = location.GetLevel();
|
|
3043
|
+
assert(level >= 0);
|
|
3044
|
+
|
|
3045
|
+
const size_t pos = location.GetPosition();
|
|
3046
|
+
|
|
3047
|
+
FileMetaData* const sst_meta = files_[level][pos];
|
|
3048
|
+
assert(sst_meta);
|
|
3049
|
+
|
|
3050
|
+
if (sst_meta->being_compacted) {
|
|
3051
|
+
continue;
|
|
3052
|
+
}
|
|
3053
|
+
|
|
3054
|
+
files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta);
|
|
3055
|
+
}
|
|
3056
|
+
}
|
|
3057
|
+
|
|
2775
3058
|
namespace {
|
|
2776
3059
|
|
|
2777
3060
|
// used to sort files by size
|
|
@@ -2780,7 +3063,7 @@ struct Fsize {
|
|
|
2780
3063
|
FileMetaData* file;
|
|
2781
3064
|
};
|
|
2782
3065
|
|
|
2783
|
-
//
|
|
3066
|
+
// Comparator that is used to sort files based on their size
|
|
2784
3067
|
// In normal mode: descending size
|
|
2785
3068
|
bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
|
|
2786
3069
|
return (first.file->compensated_file_size >
|
|
@@ -2793,38 +3076,32 @@ void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
|
|
|
2793
3076
|
level_files.push_back(f);
|
|
2794
3077
|
|
|
2795
3078
|
f->refs++;
|
|
2796
|
-
|
|
2797
|
-
const uint64_t file_number = f->fd.GetNumber();
|
|
2798
|
-
|
|
2799
|
-
assert(file_locations_.find(file_number) == file_locations_.end());
|
|
2800
|
-
file_locations_.emplace(file_number,
|
|
2801
|
-
FileLocation(level, level_files.size() - 1));
|
|
2802
3079
|
}
|
|
2803
3080
|
|
|
2804
3081
|
void VersionStorageInfo::AddBlobFile(
|
|
2805
3082
|
std::shared_ptr<BlobFileMetaData> blob_file_meta) {
|
|
2806
3083
|
assert(blob_file_meta);
|
|
2807
3084
|
|
|
2808
|
-
|
|
3085
|
+
assert(blob_files_.empty() ||
|
|
3086
|
+
(blob_files_.back() && blob_files_.back()->GetBlobFileNumber() <
|
|
3087
|
+
blob_file_meta->GetBlobFileNumber()));
|
|
2809
3088
|
|
|
2810
|
-
|
|
2811
|
-
|
|
3089
|
+
blob_files_.emplace_back(std::move(blob_file_meta));
|
|
3090
|
+
}
|
|
2812
3091
|
|
|
2813
|
-
|
|
2814
|
-
|
|
3092
|
+
VersionStorageInfo::BlobFiles::const_iterator
|
|
3093
|
+
VersionStorageInfo::GetBlobFileMetaDataLB(uint64_t blob_file_number) const {
|
|
3094
|
+
return std::lower_bound(
|
|
3095
|
+
blob_files_.begin(), blob_files_.end(), blob_file_number,
|
|
3096
|
+
[](const std::shared_ptr<BlobFileMetaData>& lhs, uint64_t rhs) {
|
|
3097
|
+
assert(lhs);
|
|
3098
|
+
return lhs->GetBlobFileNumber() < rhs;
|
|
3099
|
+
});
|
|
2815
3100
|
}
|
|
2816
3101
|
|
|
2817
|
-
// Version::PrepareApply() need to be called before calling the function, or
|
|
2818
|
-
// following functions called:
|
|
2819
|
-
// 1. UpdateNumNonEmptyLevels();
|
|
2820
|
-
// 2. CalculateBaseBytes();
|
|
2821
|
-
// 3. UpdateFilesByCompactionPri();
|
|
2822
|
-
// 4. GenerateFileIndexer();
|
|
2823
|
-
// 5. GenerateLevelFilesBrief();
|
|
2824
|
-
// 6. GenerateLevel0NonOverlapping();
|
|
2825
|
-
// 7. GenerateBottommostFiles();
|
|
2826
3102
|
void VersionStorageInfo::SetFinalized() {
|
|
2827
3103
|
finalized_ = true;
|
|
3104
|
+
|
|
2828
3105
|
#ifndef NDEBUG
|
|
2829
3106
|
if (compaction_style_ != kCompactionStyleLevel) {
|
|
2830
3107
|
// Not level based compaction.
|
|
@@ -2875,11 +3152,22 @@ namespace {
|
|
|
2875
3152
|
// Sort `temp` based on ratio of overlapping size over file size
|
|
2876
3153
|
void SortFileByOverlappingRatio(
|
|
2877
3154
|
const InternalKeyComparator& icmp, const std::vector<FileMetaData*>& files,
|
|
2878
|
-
const std::vector<FileMetaData*>& next_level_files,
|
|
3155
|
+
const std::vector<FileMetaData*>& next_level_files, SystemClock* clock,
|
|
3156
|
+
int level, int num_non_empty_levels, uint64_t ttl,
|
|
2879
3157
|
std::vector<Fsize>* temp) {
|
|
2880
3158
|
std::unordered_map<uint64_t, uint64_t> file_to_order;
|
|
2881
3159
|
auto next_level_it = next_level_files.begin();
|
|
2882
3160
|
|
|
3161
|
+
int64_t curr_time;
|
|
3162
|
+
Status status = clock->GetCurrentTime(&curr_time);
|
|
3163
|
+
if (!status.ok()) {
|
|
3164
|
+
// If we can't get time, disable TTL.
|
|
3165
|
+
ttl = 0;
|
|
3166
|
+
}
|
|
3167
|
+
|
|
3168
|
+
FileTtlBooster ttl_booster(static_cast<uint64_t>(curr_time), ttl,
|
|
3169
|
+
num_non_empty_levels, level);
|
|
3170
|
+
|
|
2883
3171
|
for (auto& file : files) {
|
|
2884
3172
|
uint64_t overlapping_bytes = 0;
|
|
2885
3173
|
// Skip files in the next level that are smaller than the current file
|
|
@@ -2899,9 +3187,12 @@ void SortFileByOverlappingRatio(
|
|
|
2899
3187
|
next_level_it++;
|
|
2900
3188
|
}
|
|
2901
3189
|
|
|
3190
|
+
uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1;
|
|
3191
|
+
assert(ttl_boost_score > 0);
|
|
2902
3192
|
assert(file->compensated_file_size != 0);
|
|
2903
|
-
file_to_order[file->fd.GetNumber()] =
|
|
2904
|
-
|
|
3193
|
+
file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U /
|
|
3194
|
+
file->compensated_file_size /
|
|
3195
|
+
ttl_boost_score;
|
|
2905
3196
|
}
|
|
2906
3197
|
|
|
2907
3198
|
std::sort(temp->begin(), temp->end(),
|
|
@@ -2913,7 +3204,7 @@ void SortFileByOverlappingRatio(
|
|
|
2913
3204
|
} // namespace
|
|
2914
3205
|
|
|
2915
3206
|
void VersionStorageInfo::UpdateFilesByCompactionPri(
|
|
2916
|
-
|
|
3207
|
+
const ImmutableOptions& ioptions, const MutableCFOptions& options) {
|
|
2917
3208
|
if (compaction_style_ == kCompactionStyleNone ||
|
|
2918
3209
|
compaction_style_ == kCompactionStyleFIFO ||
|
|
2919
3210
|
compaction_style_ == kCompactionStyleUniversal) {
|
|
@@ -2938,7 +3229,7 @@ void VersionStorageInfo::UpdateFilesByCompactionPri(
|
|
|
2938
3229
|
if (num > temp.size()) {
|
|
2939
3230
|
num = temp.size();
|
|
2940
3231
|
}
|
|
2941
|
-
switch (compaction_pri) {
|
|
3232
|
+
switch (ioptions.compaction_pri) {
|
|
2942
3233
|
case kByCompensatedSize:
|
|
2943
3234
|
std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
|
|
2944
3235
|
CompareCompensatedSizeDescending);
|
|
@@ -2959,7 +3250,8 @@ void VersionStorageInfo::UpdateFilesByCompactionPri(
|
|
|
2959
3250
|
break;
|
|
2960
3251
|
case kMinOverlappingRatio:
|
|
2961
3252
|
SortFileByOverlappingRatio(*internal_comparator_, files_[level],
|
|
2962
|
-
files_[level + 1],
|
|
3253
|
+
files_[level + 1], ioptions.clock, level,
|
|
3254
|
+
num_non_empty_levels_, options.ttl, &temp);
|
|
2963
3255
|
break;
|
|
2964
3256
|
default:
|
|
2965
3257
|
assert(false);
|
|
@@ -3027,6 +3319,28 @@ void VersionStorageInfo::GenerateBottommostFiles() {
|
|
|
3027
3319
|
}
|
|
3028
3320
|
}
|
|
3029
3321
|
|
|
3322
|
+
void VersionStorageInfo::GenerateFileLocationIndex() {
|
|
3323
|
+
size_t num_files = 0;
|
|
3324
|
+
|
|
3325
|
+
for (int level = 0; level < num_levels_; ++level) {
|
|
3326
|
+
num_files += files_[level].size();
|
|
3327
|
+
}
|
|
3328
|
+
|
|
3329
|
+
file_locations_.reserve(num_files);
|
|
3330
|
+
|
|
3331
|
+
for (int level = 0; level < num_levels_; ++level) {
|
|
3332
|
+
for (size_t pos = 0; pos < files_[level].size(); ++pos) {
|
|
3333
|
+
const FileMetaData* const meta = files_[level][pos];
|
|
3334
|
+
assert(meta);
|
|
3335
|
+
|
|
3336
|
+
const uint64_t file_number = meta->fd.GetNumber();
|
|
3337
|
+
|
|
3338
|
+
assert(file_locations_.find(file_number) == file_locations_.end());
|
|
3339
|
+
file_locations_.emplace(file_number, FileLocation(level, pos));
|
|
3340
|
+
}
|
|
3341
|
+
}
|
|
3342
|
+
}
|
|
3343
|
+
|
|
3030
3344
|
void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) {
|
|
3031
3345
|
assert(seqnum >= oldest_snapshot_seqnum_);
|
|
3032
3346
|
oldest_snapshot_seqnum_ = seqnum;
|
|
@@ -3040,8 +3354,7 @@ void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() {
|
|
|
3040
3354
|
bottommost_files_mark_threshold_ = kMaxSequenceNumber;
|
|
3041
3355
|
for (auto& level_and_file : bottommost_files_) {
|
|
3042
3356
|
if (!level_and_file.second->being_compacted &&
|
|
3043
|
-
level_and_file.second->fd.largest_seqno != 0
|
|
3044
|
-
level_and_file.second->num_deletions > 1) {
|
|
3357
|
+
level_and_file.second->fd.largest_seqno != 0) {
|
|
3045
3358
|
// largest_seqno might be nonzero due to containing the final key in an
|
|
3046
3359
|
// earlier compaction, whose seqnum we didn't zero out. Multiple deletions
|
|
3047
3360
|
// ensures the file really contains deleted or overwritten keys.
|
|
@@ -3200,7 +3513,7 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
|
|
|
3200
3513
|
// specified range. From that file, iterate backwards and
|
|
3201
3514
|
// forwards to find all overlapping files.
|
|
3202
3515
|
// if within_range is set, then only store the maximum clean inputs
|
|
3203
|
-
// within range [begin, end]. "clean" means there is a
|
|
3516
|
+
// within range [begin, end]. "clean" means there is a boundary
|
|
3204
3517
|
// between the files in "*inputs" and the surrounding files
|
|
3205
3518
|
void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
|
|
3206
3519
|
int level, const InternalKey* begin, const InternalKey* end,
|
|
@@ -3367,7 +3680,7 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
|
|
|
3367
3680
|
return scratch->buffer;
|
|
3368
3681
|
}
|
|
3369
3682
|
|
|
3370
|
-
|
|
3683
|
+
uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
|
|
3371
3684
|
uint64_t result = 0;
|
|
3372
3685
|
std::vector<FileMetaData*> overlaps;
|
|
3373
3686
|
for (int level = 1; level < num_levels() - 1; level++) {
|
|
@@ -3390,7 +3703,7 @@ uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
|
|
|
3390
3703
|
return level_max_bytes_[level];
|
|
3391
3704
|
}
|
|
3392
3705
|
|
|
3393
|
-
void VersionStorageInfo::CalculateBaseBytes(const
|
|
3706
|
+
void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
|
|
3394
3707
|
const MutableCFOptions& options) {
|
|
3395
3708
|
// Special logic to set number of sorted runs.
|
|
3396
3709
|
// It is to match the previous behavior when all files are in L0.
|
|
@@ -3480,7 +3793,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
|
|
|
3480
3793
|
// base_bytes_min. We set it to be base_bytes_min.
|
|
3481
3794
|
base_level_size = base_bytes_min + 1U;
|
|
3482
3795
|
base_level_ = first_non_empty_level;
|
|
3483
|
-
ROCKS_LOG_INFO(ioptions.
|
|
3796
|
+
ROCKS_LOG_INFO(ioptions.logger,
|
|
3484
3797
|
"More existing levels in DB than needed. "
|
|
3485
3798
|
"max_bytes_for_level_multiplier may not be guaranteed.");
|
|
3486
3799
|
} else {
|
|
@@ -3511,7 +3824,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
|
|
|
3511
3824
|
// 1. the L0 size is larger than level size base, or
|
|
3512
3825
|
// 2. number of L0 files reaches twice the L0->L1 compaction trigger
|
|
3513
3826
|
// We don't do this otherwise to keep the LSM-tree structure stable
|
|
3514
|
-
// unless the L0
|
|
3827
|
+
// unless the L0 compaction is backlogged.
|
|
3515
3828
|
base_level_size = l0_size;
|
|
3516
3829
|
if (base_level_ == num_levels_ - 1) {
|
|
3517
3830
|
level_multiplier_ = 1.0;
|
|
@@ -3570,6 +3883,16 @@ uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
|
|
|
3570
3883
|
}
|
|
3571
3884
|
}
|
|
3572
3885
|
}
|
|
3886
|
+
|
|
3887
|
+
// For BlobDB, the result also includes the exact value of live bytes in the
|
|
3888
|
+
// blob files of the version.
|
|
3889
|
+
for (const auto& meta : blob_files_) {
|
|
3890
|
+
assert(meta);
|
|
3891
|
+
|
|
3892
|
+
size += meta->GetTotalBlobBytes();
|
|
3893
|
+
size -= meta->GetGarbageBlobBytes();
|
|
3894
|
+
}
|
|
3895
|
+
|
|
3573
3896
|
return size;
|
|
3574
3897
|
}
|
|
3575
3898
|
|
|
@@ -3619,8 +3942,7 @@ void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
|
|
|
3619
3942
|
}
|
|
3620
3943
|
|
|
3621
3944
|
const auto& blob_files = storage_info_.GetBlobFiles();
|
|
3622
|
-
for (const auto&
|
|
3623
|
-
const auto& meta = pair.second;
|
|
3945
|
+
for (const auto& meta : blob_files) {
|
|
3624
3946
|
assert(meta);
|
|
3625
3947
|
|
|
3626
3948
|
live_blob_files->emplace_back(meta->GetBlobFileNumber());
|
|
@@ -3677,8 +3999,7 @@ std::string Version::DebugString(bool hex, bool print_stats) const {
|
|
|
3677
3999
|
r.append("--- blob files --- version# ");
|
|
3678
4000
|
AppendNumberTo(&r, version_number_);
|
|
3679
4001
|
r.append(" ---\n");
|
|
3680
|
-
for (const auto&
|
|
3681
|
-
const auto& blob_file_meta = pair.second;
|
|
4002
|
+
for (const auto& blob_file_meta : blob_files) {
|
|
3682
4003
|
assert(blob_file_meta);
|
|
3683
4004
|
|
|
3684
4005
|
r.append(blob_file_meta->DebugString());
|
|
@@ -3774,19 +4095,22 @@ VersionSet::VersionSet(const std::string& dbname,
|
|
|
3774
4095
|
WriteBufferManager* write_buffer_manager,
|
|
3775
4096
|
WriteController* write_controller,
|
|
3776
4097
|
BlockCacheTracer* const block_cache_tracer,
|
|
3777
|
-
const std::shared_ptr<IOTracer>& io_tracer
|
|
4098
|
+
const std::shared_ptr<IOTracer>& io_tracer,
|
|
4099
|
+
const std::string& db_session_id)
|
|
3778
4100
|
: column_family_set_(
|
|
3779
4101
|
new ColumnFamilySet(dbname, _db_options, storage_options, table_cache,
|
|
3780
4102
|
write_buffer_manager, write_controller,
|
|
3781
|
-
block_cache_tracer, io_tracer)),
|
|
4103
|
+
block_cache_tracer, io_tracer, db_session_id)),
|
|
3782
4104
|
table_cache_(table_cache),
|
|
3783
4105
|
env_(_db_options->env),
|
|
3784
4106
|
fs_(_db_options->fs, io_tracer),
|
|
4107
|
+
clock_(_db_options->clock),
|
|
3785
4108
|
dbname_(dbname),
|
|
3786
4109
|
db_options_(_db_options),
|
|
3787
4110
|
next_file_number_(2),
|
|
3788
4111
|
manifest_file_number_(0), // Filled by Recover()
|
|
3789
4112
|
options_file_number_(0),
|
|
4113
|
+
options_file_size_(0),
|
|
3790
4114
|
pending_manifest_file_number_(0),
|
|
3791
4115
|
last_sequence_(0),
|
|
3792
4116
|
last_allocated_sequence_(0),
|
|
@@ -3796,7 +4120,8 @@ VersionSet::VersionSet(const std::string& dbname,
|
|
|
3796
4120
|
manifest_file_size_(0),
|
|
3797
4121
|
file_options_(storage_options),
|
|
3798
4122
|
block_cache_tracer_(block_cache_tracer),
|
|
3799
|
-
io_tracer_(io_tracer)
|
|
4123
|
+
io_tracer_(io_tracer),
|
|
4124
|
+
db_session_id_(db_session_id) {}
|
|
3800
4125
|
|
|
3801
4126
|
VersionSet::~VersionSet() {
|
|
3802
4127
|
// we need to delete column_family_set_ because its destructor depends on
|
|
@@ -3817,13 +4142,13 @@ void VersionSet::Reset() {
|
|
|
3817
4142
|
if (column_family_set_) {
|
|
3818
4143
|
WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
|
|
3819
4144
|
WriteController* wc = column_family_set_->write_controller();
|
|
3820
|
-
column_family_set_.reset(
|
|
3821
|
-
|
|
3822
|
-
|
|
4145
|
+
column_family_set_.reset(new ColumnFamilySet(
|
|
4146
|
+
dbname_, db_options_, file_options_, table_cache_, wbm, wc,
|
|
4147
|
+
block_cache_tracer_, io_tracer_, db_session_id_));
|
|
3823
4148
|
}
|
|
3824
4149
|
db_id_.clear();
|
|
3825
4150
|
next_file_number_.store(2);
|
|
3826
|
-
|
|
4151
|
+
min_log_number_to_keep_.store(0);
|
|
3827
4152
|
manifest_file_number_ = 0;
|
|
3828
4153
|
options_file_number_ = 0;
|
|
3829
4154
|
pending_manifest_file_number_ = 0;
|
|
@@ -3885,9 +4210,16 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
3885
4210
|
autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
|
|
3886
4211
|
std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
|
|
3887
4212
|
|
|
4213
|
+
// Tracking `max_last_sequence` is needed to ensure we write
|
|
4214
|
+
// `VersionEdit::last_sequence_`s in non-decreasing order according to the
|
|
4215
|
+
// recovery code's requirement. It also allows us to defer updating
|
|
4216
|
+
// `descriptor_last_sequence_` until the apply phase, after the log phase
|
|
4217
|
+
// succeeds.
|
|
4218
|
+
SequenceNumber max_last_sequence = descriptor_last_sequence_;
|
|
4219
|
+
|
|
3888
4220
|
if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
|
|
3889
4221
|
// No group commits for column family add or drop
|
|
3890
|
-
LogAndApplyCFHelper(first_writer.edit_list.front());
|
|
4222
|
+
LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence);
|
|
3891
4223
|
batch_edits.push_back(first_writer.edit_list.front());
|
|
3892
4224
|
} else {
|
|
3893
4225
|
auto it = manifest_writers_.cbegin();
|
|
@@ -3975,7 +4307,8 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
3975
4307
|
} else if (group_start != std::numeric_limits<size_t>::max()) {
|
|
3976
4308
|
group_start = std::numeric_limits<size_t>::max();
|
|
3977
4309
|
}
|
|
3978
|
-
Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
|
|
4310
|
+
Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
|
|
4311
|
+
&max_last_sequence, mu);
|
|
3979
4312
|
if (!s.ok()) {
|
|
3980
4313
|
// free up the allocated memory
|
|
3981
4314
|
for (auto v : versions) {
|
|
@@ -4076,10 +4409,11 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4076
4409
|
uint64_t new_manifest_file_size = 0;
|
|
4077
4410
|
Status s;
|
|
4078
4411
|
IOStatus io_s;
|
|
4412
|
+
IOStatus manifest_io_status;
|
|
4079
4413
|
{
|
|
4080
4414
|
FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
|
|
4081
4415
|
mu->Unlock();
|
|
4082
|
-
|
|
4416
|
+
TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
|
|
4083
4417
|
TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
|
|
4084
4418
|
if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
|
|
4085
4419
|
for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
|
|
@@ -4092,7 +4426,7 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4092
4426
|
cfd->internal_stats(), 1 /* max_threads */,
|
|
4093
4427
|
true /* prefetch_index_and_filter_in_cache */,
|
|
4094
4428
|
false /* is_initial_load */,
|
|
4095
|
-
mutable_cf_options_ptrs[i]->prefix_extractor
|
|
4429
|
+
mutable_cf_options_ptrs[i]->prefix_extractor,
|
|
4096
4430
|
MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]));
|
|
4097
4431
|
if (!s.ok()) {
|
|
4098
4432
|
if (db_options_->paranoid_checks) {
|
|
@@ -4117,23 +4451,28 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4117
4451
|
if (io_s.ok()) {
|
|
4118
4452
|
descriptor_file->SetPreallocationBlockSize(
|
|
4119
4453
|
db_options_->manifest_preallocation_size);
|
|
4120
|
-
|
|
4454
|
+
FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
|
|
4121
4455
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
4122
|
-
std::move(descriptor_file), descriptor_fname, opt_file_opts,
|
|
4123
|
-
io_tracer_, nullptr, db_options_->listeners
|
|
4456
|
+
std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
|
|
4457
|
+
io_tracer_, nullptr, db_options_->listeners, nullptr,
|
|
4458
|
+
tmp_set.Contains(FileType::kDescriptorFile),
|
|
4459
|
+
tmp_set.Contains(FileType::kDescriptorFile)));
|
|
4124
4460
|
descriptor_log_.reset(
|
|
4125
4461
|
new log::Writer(std::move(file_writer), 0, false));
|
|
4126
4462
|
s = WriteCurrentStateToManifest(curr_state, wal_additions,
|
|
4127
4463
|
descriptor_log_.get(), io_s);
|
|
4128
4464
|
} else {
|
|
4465
|
+
manifest_io_status = io_s;
|
|
4129
4466
|
s = io_s;
|
|
4130
4467
|
}
|
|
4131
4468
|
}
|
|
4132
4469
|
|
|
4133
4470
|
if (s.ok()) {
|
|
4134
4471
|
if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
|
|
4472
|
+
constexpr bool update_stats = true;
|
|
4473
|
+
|
|
4135
4474
|
for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
|
|
4136
|
-
versions[i]->
|
|
4475
|
+
versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats);
|
|
4137
4476
|
}
|
|
4138
4477
|
}
|
|
4139
4478
|
|
|
@@ -4148,8 +4487,8 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4148
4487
|
e->DebugString(true));
|
|
4149
4488
|
break;
|
|
4150
4489
|
}
|
|
4151
|
-
|
|
4152
|
-
|
|
4490
|
+
TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord",
|
|
4491
|
+
REDUCE_ODDS2);
|
|
4153
4492
|
#ifndef NDEBUG
|
|
4154
4493
|
if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
|
|
4155
4494
|
TEST_SYNC_POINT_CALLBACK(
|
|
@@ -4163,11 +4502,13 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4163
4502
|
io_s = descriptor_log_->AddRecord(record);
|
|
4164
4503
|
if (!io_s.ok()) {
|
|
4165
4504
|
s = io_s;
|
|
4505
|
+
manifest_io_status = io_s;
|
|
4166
4506
|
break;
|
|
4167
4507
|
}
|
|
4168
4508
|
}
|
|
4169
4509
|
if (s.ok()) {
|
|
4170
|
-
io_s = SyncManifest(
|
|
4510
|
+
io_s = SyncManifest(db_options_, descriptor_log_->file());
|
|
4511
|
+
manifest_io_status = io_s;
|
|
4171
4512
|
TEST_SYNC_POINT_CALLBACK(
|
|
4172
4513
|
"VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s);
|
|
4173
4514
|
}
|
|
@@ -4180,13 +4521,15 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4180
4521
|
|
|
4181
4522
|
// If we just created a new descriptor file, install it by writing a
|
|
4182
4523
|
// new CURRENT file that points to it.
|
|
4524
|
+
if (s.ok()) {
|
|
4525
|
+
assert(manifest_io_status.ok());
|
|
4526
|
+
}
|
|
4183
4527
|
if (s.ok() && new_descriptor_log) {
|
|
4184
4528
|
io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
|
|
4185
4529
|
db_directory);
|
|
4186
4530
|
if (!io_s.ok()) {
|
|
4187
4531
|
s = io_s;
|
|
4188
4532
|
}
|
|
4189
|
-
TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest");
|
|
4190
4533
|
}
|
|
4191
4534
|
|
|
4192
4535
|
if (s.ok()) {
|
|
@@ -4239,9 +4582,11 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4239
4582
|
if (first_writer.edit_list.front()->is_column_family_add_) {
|
|
4240
4583
|
assert(batch_edits.size() == 1);
|
|
4241
4584
|
assert(new_cf_options != nullptr);
|
|
4585
|
+
assert(max_last_sequence == descriptor_last_sequence_);
|
|
4242
4586
|
CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
|
|
4243
4587
|
} else if (first_writer.edit_list.front()->is_column_family_drop_) {
|
|
4244
4588
|
assert(batch_edits.size() == 1);
|
|
4589
|
+
assert(max_last_sequence == descriptor_last_sequence_);
|
|
4245
4590
|
first_writer.cfd->SetDropped();
|
|
4246
4591
|
first_writer.cfd->UnrefAndTryDelete();
|
|
4247
4592
|
} else {
|
|
@@ -4272,8 +4617,7 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4272
4617
|
}
|
|
4273
4618
|
|
|
4274
4619
|
if (last_min_log_number_to_keep != 0) {
|
|
4275
|
-
|
|
4276
|
-
MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep);
|
|
4620
|
+
MarkMinLogNumberToKeep(last_min_log_number_to_keep);
|
|
4277
4621
|
}
|
|
4278
4622
|
|
|
4279
4623
|
for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
|
|
@@ -4281,6 +4625,8 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4281
4625
|
AppendVersion(cfd, versions[i]);
|
|
4282
4626
|
}
|
|
4283
4627
|
}
|
|
4628
|
+
assert(max_last_sequence >= descriptor_last_sequence_);
|
|
4629
|
+
descriptor_last_sequence_ = max_last_sequence;
|
|
4284
4630
|
manifest_file_number_ = pending_manifest_file_number_;
|
|
4285
4631
|
manifest_file_size_ = new_manifest_file_size;
|
|
4286
4632
|
prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
|
|
@@ -4295,11 +4641,41 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4295
4641
|
for (auto v : versions) {
|
|
4296
4642
|
delete v;
|
|
4297
4643
|
}
|
|
4644
|
+
if (manifest_io_status.ok()) {
|
|
4645
|
+
manifest_file_number_ = pending_manifest_file_number_;
|
|
4646
|
+
manifest_file_size_ = new_manifest_file_size;
|
|
4647
|
+
}
|
|
4298
4648
|
// If manifest append failed for whatever reason, the file could be
|
|
4299
4649
|
// corrupted. So we need to force the next version update to start a
|
|
4300
4650
|
// new manifest file.
|
|
4301
4651
|
descriptor_log_.reset();
|
|
4302
|
-
|
|
4652
|
+
// If manifest operations failed, then we know the CURRENT file still
|
|
4653
|
+
// points to the original MANIFEST. Therefore, we can safely delete the
|
|
4654
|
+
// new MANIFEST.
|
|
4655
|
+
// If manifest operations succeeded, and we are here, then it is possible
|
|
4656
|
+
// that renaming tmp file to CURRENT failed.
|
|
4657
|
+
//
|
|
4658
|
+
// On local POSIX-compliant FS, the CURRENT must point to the original
|
|
4659
|
+
// MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
|
|
4660
|
+
// keep it. Future recovery will ignore this MANIFEST. It's also ok for the
|
|
4661
|
+
// process not to crash and continue using the db. Any future LogAndApply()
|
|
4662
|
+
// call will switch to a new MANIFEST and update CURRENT, still ignoring
|
|
4663
|
+
// this one.
|
|
4664
|
+
//
|
|
4665
|
+
// On non-local FS, it is
|
|
4666
|
+
// possible that the rename operation succeeded on the server (remote)
|
|
4667
|
+
// side, but the client somehow returns a non-ok status to RocksDB. Note
|
|
4668
|
+
// that this does not violate atomicity. Should we delete the new MANIFEST
|
|
4669
|
+
// successfully, a subsequent recovery attempt will likely see the CURRENT
|
|
4670
|
+
// pointing to the new MANIFEST, thus fail. We will not be able to open the
|
|
4671
|
+
// DB again. Therefore, if manifest operations succeed, we should keep the
|
|
4672
|
+
// new MANIFEST. If the process proceeds, any future LogAndApply() call
|
|
4673
|
+
// will switch to a new MANIFEST and update CURRENT. If user tries to
|
|
4674
|
+
// re-open the DB,
|
|
4675
|
+
// a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
|
|
4676
|
+
// b) CURRENT points to the original MANIFEST, and the original MANIFEST
|
|
4677
|
+
// also exists.
|
|
4678
|
+
if (new_descriptor_log && !manifest_io_status.ok()) {
|
|
4303
4679
|
ROCKS_LOG_INFO(db_options_->info_log,
|
|
4304
4680
|
"Deleting manifest %" PRIu64 " current manifest %" PRIu64
|
|
4305
4681
|
"\n",
|
|
@@ -4317,6 +4693,23 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4317
4693
|
|
|
4318
4694
|
pending_manifest_file_number_ = 0;
|
|
4319
4695
|
|
|
4696
|
+
#ifndef NDEBUG
|
|
4697
|
+
// This is here kind of awkwardly because there are no other consistency
|
|
4698
|
+
// checks on `VersionSet`'s updates for the new `Version`s. We might want
|
|
4699
|
+
// to move it to a dedicated function, or remove it if we gain enough
|
|
4700
|
+
// confidence in `descriptor_last_sequence_`.
|
|
4701
|
+
if (s.ok()) {
|
|
4702
|
+
for (const auto* v : versions) {
|
|
4703
|
+
const auto* vstorage = v->storage_info();
|
|
4704
|
+
for (int level = 0; level < vstorage->num_levels(); ++level) {
|
|
4705
|
+
for (const auto& file : vstorage->LevelFiles(level)) {
|
|
4706
|
+
assert(file->fd.largest_seqno <= descriptor_last_sequence_);
|
|
4707
|
+
}
|
|
4708
|
+
}
|
|
4709
|
+
}
|
|
4710
|
+
}
|
|
4711
|
+
#endif // NDEBUG
|
|
4712
|
+
|
|
4320
4713
|
// wake up all the waiting writers
|
|
4321
4714
|
while (true) {
|
|
4322
4715
|
ManifestWriter* ready = manifest_writers_.front();
|
|
@@ -4346,7 +4739,15 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4346
4739
|
return s;
|
|
4347
4740
|
}
|
|
4348
4741
|
|
|
4349
|
-
|
|
4742
|
+
void VersionSet::WakeUpWaitingManifestWriters() {
|
|
4743
|
+
// wake up all the waiting writers
|
|
4744
|
+
// Notify new head of manifest write queue.
|
|
4745
|
+
if (!manifest_writers_.empty()) {
|
|
4746
|
+
manifest_writers_.front()->cv.Signal();
|
|
4747
|
+
}
|
|
4748
|
+
}
|
|
4749
|
+
|
|
4750
|
+
// 'datas' is grammatically incorrect. We still use this notation to indicate
|
|
4350
4751
|
// that this variable represents a collection of column_family_data.
|
|
4351
4752
|
Status VersionSet::LogAndApply(
|
|
4352
4753
|
const autovector<ColumnFamilyData*>& column_family_datas,
|
|
@@ -4432,16 +4833,13 @@ Status VersionSet::LogAndApply(
|
|
|
4432
4833
|
new_cf_options);
|
|
4433
4834
|
}
|
|
4434
4835
|
|
|
4435
|
-
void VersionSet::LogAndApplyCFHelper(VersionEdit* edit
|
|
4836
|
+
void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
|
|
4837
|
+
SequenceNumber* max_last_sequence) {
|
|
4838
|
+
assert(max_last_sequence != nullptr);
|
|
4436
4839
|
assert(edit->IsColumnFamilyManipulation());
|
|
4437
4840
|
edit->SetNextFile(next_file_number_.load());
|
|
4438
|
-
|
|
4439
|
-
|
|
4440
|
-
// expecting some new data that is not written yet. Since LastSequence is an
|
|
4441
|
-
// upper bound on the sequence, it is ok to record
|
|
4442
|
-
// last_allocated_sequence_ as the last sequence.
|
|
4443
|
-
edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
|
|
4444
|
-
: last_sequence_);
|
|
4841
|
+
assert(!edit->HasLastSequence());
|
|
4842
|
+
edit->SetLastSequence(*max_last_sequence);
|
|
4445
4843
|
if (edit->is_column_family_drop_) {
|
|
4446
4844
|
// if we drop column family, we have to make sure to save max column family,
|
|
4447
4845
|
// so that we don't reuse existing ID
|
|
@@ -4451,12 +4849,14 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
|
|
|
4451
4849
|
|
|
4452
4850
|
Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
|
|
4453
4851
|
VersionBuilder* builder, VersionEdit* edit,
|
|
4852
|
+
SequenceNumber* max_last_sequence,
|
|
4454
4853
|
InstrumentedMutex* mu) {
|
|
4455
4854
|
#ifdef NDEBUG
|
|
4456
4855
|
(void)cfd;
|
|
4457
4856
|
#endif
|
|
4458
4857
|
mu->AssertHeld();
|
|
4459
4858
|
assert(!edit->IsColumnFamilyManipulation());
|
|
4859
|
+
assert(max_last_sequence != nullptr);
|
|
4460
4860
|
|
|
4461
4861
|
if (edit->has_log_number_) {
|
|
4462
4862
|
assert(edit->log_number_ >= cfd->GetLogNumber());
|
|
@@ -4467,13 +4867,11 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
|
|
|
4467
4867
|
edit->SetPrevLogNumber(prev_log_number_);
|
|
4468
4868
|
}
|
|
4469
4869
|
edit->SetNextFile(next_file_number_.load());
|
|
4470
|
-
|
|
4471
|
-
|
|
4472
|
-
|
|
4473
|
-
|
|
4474
|
-
|
|
4475
|
-
edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
|
|
4476
|
-
: last_sequence_);
|
|
4870
|
+
if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) {
|
|
4871
|
+
*max_last_sequence = edit->GetLastSequence();
|
|
4872
|
+
} else {
|
|
4873
|
+
edit->SetLastSequence(*max_last_sequence);
|
|
4874
|
+
}
|
|
4477
4875
|
|
|
4478
4876
|
// The builder can be nullptr only if edit is WAL manipulation,
|
|
4479
4877
|
// because WAL edits do not need to be applied to versions,
|
|
@@ -4482,171 +4880,13 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
|
|
|
4482
4880
|
return builder ? builder->Apply(edit) : Status::OK();
|
|
4483
4881
|
}
|
|
4484
4882
|
|
|
4485
|
-
Status VersionSet::
|
|
4486
|
-
|
|
4487
|
-
|
|
4488
|
-
|
|
4489
|
-
|
|
4490
|
-
|
|
4491
|
-
|
|
4492
|
-
// Not found means that user didn't supply that column
|
|
4493
|
-
// family option AND we encountered column family add
|
|
4494
|
-
// record. Once we encounter column family drop record,
|
|
4495
|
-
// we will delete the column family from
|
|
4496
|
-
// column_families_not_found.
|
|
4497
|
-
bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) !=
|
|
4498
|
-
column_families_not_found.end());
|
|
4499
|
-
// in builders means that user supplied that column family
|
|
4500
|
-
// option AND that we encountered column family add record
|
|
4501
|
-
bool cf_in_builders = builders.find(edit.column_family_) != builders.end();
|
|
4502
|
-
|
|
4503
|
-
// they can't both be true
|
|
4504
|
-
assert(!(cf_in_not_found && cf_in_builders));
|
|
4505
|
-
|
|
4506
|
-
ColumnFamilyData* cfd = nullptr;
|
|
4507
|
-
|
|
4508
|
-
if (edit.is_column_family_add_) {
|
|
4509
|
-
if (cf_in_builders || cf_in_not_found) {
|
|
4510
|
-
return Status::Corruption(
|
|
4511
|
-
"Manifest adding the same column family twice: " +
|
|
4512
|
-
edit.column_family_name_);
|
|
4513
|
-
}
|
|
4514
|
-
auto cf_options = name_to_options.find(edit.column_family_name_);
|
|
4515
|
-
// implicitly add persistent_stats column family without requiring user
|
|
4516
|
-
// to specify
|
|
4517
|
-
bool is_persistent_stats_column_family =
|
|
4518
|
-
edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0;
|
|
4519
|
-
if (cf_options == name_to_options.end() &&
|
|
4520
|
-
!is_persistent_stats_column_family) {
|
|
4521
|
-
column_families_not_found.insert(
|
|
4522
|
-
{edit.column_family_, edit.column_family_name_});
|
|
4523
|
-
} else {
|
|
4524
|
-
// recover persistent_stats CF from a DB that already contains it
|
|
4525
|
-
if (is_persistent_stats_column_family) {
|
|
4526
|
-
ColumnFamilyOptions cfo;
|
|
4527
|
-
OptimizeForPersistentStats(&cfo);
|
|
4528
|
-
cfd = CreateColumnFamily(cfo, &edit);
|
|
4529
|
-
} else {
|
|
4530
|
-
cfd = CreateColumnFamily(cf_options->second, &edit);
|
|
4531
|
-
}
|
|
4532
|
-
cfd->set_initialized();
|
|
4533
|
-
builders.insert(std::make_pair(
|
|
4534
|
-
edit.column_family_, std::unique_ptr<BaseReferencedVersionBuilder>(
|
|
4535
|
-
new BaseReferencedVersionBuilder(cfd))));
|
|
4536
|
-
}
|
|
4537
|
-
} else if (edit.is_column_family_drop_) {
|
|
4538
|
-
if (cf_in_builders) {
|
|
4539
|
-
auto builder = builders.find(edit.column_family_);
|
|
4540
|
-
assert(builder != builders.end());
|
|
4541
|
-
builders.erase(builder);
|
|
4542
|
-
cfd = column_family_set_->GetColumnFamily(edit.column_family_);
|
|
4543
|
-
assert(cfd != nullptr);
|
|
4544
|
-
if (cfd->UnrefAndTryDelete()) {
|
|
4545
|
-
cfd = nullptr;
|
|
4546
|
-
} else {
|
|
4547
|
-
// who else can have reference to cfd!?
|
|
4548
|
-
assert(false);
|
|
4549
|
-
}
|
|
4550
|
-
} else if (cf_in_not_found) {
|
|
4551
|
-
column_families_not_found.erase(edit.column_family_);
|
|
4552
|
-
} else {
|
|
4553
|
-
return Status::Corruption(
|
|
4554
|
-
"Manifest - dropping non-existing column family");
|
|
4555
|
-
}
|
|
4556
|
-
} else if (edit.IsWalAddition()) {
|
|
4557
|
-
Status s = wals_.AddWals(edit.GetWalAdditions());
|
|
4558
|
-
if (!s.ok()) {
|
|
4559
|
-
return s;
|
|
4560
|
-
}
|
|
4561
|
-
} else if (edit.IsWalDeletion()) {
|
|
4562
|
-
Status s = wals_.DeleteWalsBefore(edit.GetWalDeletion().GetLogNumber());
|
|
4563
|
-
if (!s.ok()) {
|
|
4564
|
-
return s;
|
|
4565
|
-
}
|
|
4566
|
-
} else if (!cf_in_not_found) {
|
|
4567
|
-
if (!cf_in_builders) {
|
|
4568
|
-
return Status::Corruption(
|
|
4569
|
-
"Manifest record referencing unknown column family");
|
|
4570
|
-
}
|
|
4571
|
-
|
|
4572
|
-
cfd = column_family_set_->GetColumnFamily(edit.column_family_);
|
|
4573
|
-
// this should never happen since cf_in_builders is true
|
|
4574
|
-
assert(cfd != nullptr);
|
|
4575
|
-
|
|
4576
|
-
// if it is not column family add or column family drop,
|
|
4577
|
-
// then it's a file add/delete, which should be forwarded
|
|
4578
|
-
// to builder
|
|
4579
|
-
auto builder = builders.find(edit.column_family_);
|
|
4580
|
-
assert(builder != builders.end());
|
|
4581
|
-
Status s = builder->second->version_builder()->Apply(&edit);
|
|
4582
|
-
if (!s.ok()) {
|
|
4583
|
-
return s;
|
|
4584
|
-
}
|
|
4585
|
-
}
|
|
4586
|
-
return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params);
|
|
4587
|
-
}
|
|
4588
|
-
|
|
4589
|
-
Status VersionSet::ExtractInfoFromVersionEdit(
|
|
4590
|
-
ColumnFamilyData* cfd, const VersionEdit& from_edit,
|
|
4591
|
-
VersionEditParams* version_edit_params) {
|
|
4592
|
-
if (cfd != nullptr) {
|
|
4593
|
-
if (from_edit.has_db_id_) {
|
|
4594
|
-
version_edit_params->SetDBId(from_edit.db_id_);
|
|
4595
|
-
}
|
|
4596
|
-
if (from_edit.has_log_number_) {
|
|
4597
|
-
if (cfd->GetLogNumber() > from_edit.log_number_) {
|
|
4598
|
-
ROCKS_LOG_WARN(
|
|
4599
|
-
db_options_->info_log,
|
|
4600
|
-
"MANIFEST corruption detected, but ignored - Log numbers in "
|
|
4601
|
-
"records NOT monotonically increasing");
|
|
4602
|
-
} else {
|
|
4603
|
-
cfd->SetLogNumber(from_edit.log_number_);
|
|
4604
|
-
version_edit_params->SetLogNumber(from_edit.log_number_);
|
|
4605
|
-
}
|
|
4606
|
-
}
|
|
4607
|
-
if (from_edit.has_comparator_ &&
|
|
4608
|
-
from_edit.comparator_ != cfd->user_comparator()->Name()) {
|
|
4609
|
-
return Status::InvalidArgument(
|
|
4610
|
-
cfd->user_comparator()->Name(),
|
|
4611
|
-
"does not match existing comparator " + from_edit.comparator_);
|
|
4612
|
-
}
|
|
4613
|
-
if (from_edit.HasFullHistoryTsLow()) {
|
|
4614
|
-
const std::string& new_ts = from_edit.GetFullHistoryTsLow();
|
|
4615
|
-
cfd->SetFullHistoryTsLow(new_ts);
|
|
4616
|
-
}
|
|
4617
|
-
}
|
|
4618
|
-
|
|
4619
|
-
if (from_edit.has_prev_log_number_) {
|
|
4620
|
-
version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_);
|
|
4621
|
-
}
|
|
4622
|
-
|
|
4623
|
-
if (from_edit.has_next_file_number_) {
|
|
4624
|
-
version_edit_params->SetNextFile(from_edit.next_file_number_);
|
|
4625
|
-
}
|
|
4626
|
-
|
|
4627
|
-
if (from_edit.has_max_column_family_) {
|
|
4628
|
-
version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_);
|
|
4629
|
-
}
|
|
4630
|
-
|
|
4631
|
-
if (from_edit.has_min_log_number_to_keep_) {
|
|
4632
|
-
version_edit_params->min_log_number_to_keep_ =
|
|
4633
|
-
std::max(version_edit_params->min_log_number_to_keep_,
|
|
4634
|
-
from_edit.min_log_number_to_keep_);
|
|
4635
|
-
}
|
|
4636
|
-
|
|
4637
|
-
if (from_edit.has_last_sequence_) {
|
|
4638
|
-
version_edit_params->SetLastSequence(from_edit.last_sequence_);
|
|
4639
|
-
}
|
|
4640
|
-
return Status::OK();
|
|
4641
|
-
}
|
|
4642
|
-
|
|
4643
|
-
Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
|
|
4644
|
-
FileSystem* fs,
|
|
4645
|
-
std::string* manifest_path,
|
|
4646
|
-
uint64_t* manifest_file_number) {
|
|
4647
|
-
assert(fs != nullptr);
|
|
4648
|
-
assert(manifest_path != nullptr);
|
|
4649
|
-
assert(manifest_file_number != nullptr);
|
|
4883
|
+
Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
|
|
4884
|
+
FileSystem* fs,
|
|
4885
|
+
std::string* manifest_path,
|
|
4886
|
+
uint64_t* manifest_file_number) {
|
|
4887
|
+
assert(fs != nullptr);
|
|
4888
|
+
assert(manifest_path != nullptr);
|
|
4889
|
+
assert(manifest_file_number != nullptr);
|
|
4650
4890
|
|
|
4651
4891
|
std::string fname;
|
|
4652
4892
|
Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname);
|
|
@@ -4671,77 +4911,6 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
|
|
|
4671
4911
|
return Status::OK();
|
|
4672
4912
|
}
|
|
4673
4913
|
|
|
4674
|
-
Status VersionSet::ReadAndRecover(
|
|
4675
|
-
log::Reader& reader, AtomicGroupReadBuffer* read_buffer,
|
|
4676
|
-
const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_options,
|
|
4677
|
-
std::unordered_map<int, std::string>& column_families_not_found,
|
|
4678
|
-
std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>&
|
|
4679
|
-
builders,
|
|
4680
|
-
Status* log_read_status, VersionEditParams* version_edit_params,
|
|
4681
|
-
std::string* db_id) {
|
|
4682
|
-
assert(read_buffer != nullptr);
|
|
4683
|
-
assert(log_read_status != nullptr);
|
|
4684
|
-
Status s;
|
|
4685
|
-
Slice record;
|
|
4686
|
-
std::string scratch;
|
|
4687
|
-
size_t recovered_edits = 0;
|
|
4688
|
-
while (s.ok() && reader.ReadRecord(&record, &scratch) &&
|
|
4689
|
-
log_read_status->ok()) {
|
|
4690
|
-
VersionEdit edit;
|
|
4691
|
-
s = edit.DecodeFrom(record);
|
|
4692
|
-
if (!s.ok()) {
|
|
4693
|
-
break;
|
|
4694
|
-
}
|
|
4695
|
-
if (edit.has_db_id_) {
|
|
4696
|
-
db_id_ = edit.GetDbId();
|
|
4697
|
-
if (db_id != nullptr) {
|
|
4698
|
-
db_id->assign(edit.GetDbId());
|
|
4699
|
-
}
|
|
4700
|
-
}
|
|
4701
|
-
s = read_buffer->AddEdit(&edit);
|
|
4702
|
-
if (!s.ok()) {
|
|
4703
|
-
break;
|
|
4704
|
-
}
|
|
4705
|
-
if (edit.is_in_atomic_group_) {
|
|
4706
|
-
if (read_buffer->IsFull()) {
|
|
4707
|
-
// Apply edits in an atomic group when we have read all edits in the
|
|
4708
|
-
// group.
|
|
4709
|
-
for (auto& e : read_buffer->replay_buffer()) {
|
|
4710
|
-
s = ApplyOneVersionEditToBuilder(e, name_to_options,
|
|
4711
|
-
column_families_not_found, builders,
|
|
4712
|
-
version_edit_params);
|
|
4713
|
-
if (!s.ok()) {
|
|
4714
|
-
break;
|
|
4715
|
-
}
|
|
4716
|
-
recovered_edits++;
|
|
4717
|
-
}
|
|
4718
|
-
if (!s.ok()) {
|
|
4719
|
-
break;
|
|
4720
|
-
}
|
|
4721
|
-
read_buffer->Clear();
|
|
4722
|
-
}
|
|
4723
|
-
} else {
|
|
4724
|
-
// Apply a normal edit immediately.
|
|
4725
|
-
s = ApplyOneVersionEditToBuilder(edit, name_to_options,
|
|
4726
|
-
column_families_not_found, builders,
|
|
4727
|
-
version_edit_params);
|
|
4728
|
-
if (s.ok()) {
|
|
4729
|
-
recovered_edits++;
|
|
4730
|
-
}
|
|
4731
|
-
}
|
|
4732
|
-
}
|
|
4733
|
-
if (!log_read_status->ok()) {
|
|
4734
|
-
s = *log_read_status;
|
|
4735
|
-
}
|
|
4736
|
-
if (!s.ok()) {
|
|
4737
|
-
// Clear the buffer if we fail to decode/apply an edit.
|
|
4738
|
-
read_buffer->Clear();
|
|
4739
|
-
}
|
|
4740
|
-
TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits",
|
|
4741
|
-
&recovered_edits);
|
|
4742
|
-
return s;
|
|
4743
|
-
}
|
|
4744
|
-
|
|
4745
4914
|
Status VersionSet::Recover(
|
|
4746
4915
|
const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
|
|
4747
4916
|
std::string* db_id) {
|
|
@@ -4765,9 +4934,9 @@ Status VersionSet::Recover(
|
|
|
4765
4934
|
if (!s.ok()) {
|
|
4766
4935
|
return s;
|
|
4767
4936
|
}
|
|
4768
|
-
manifest_file_reader.reset(
|
|
4769
|
-
|
|
4770
|
-
|
|
4937
|
+
manifest_file_reader.reset(new SequentialFileReader(
|
|
4938
|
+
std::move(manifest_file), manifest_path,
|
|
4939
|
+
db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
|
|
4771
4940
|
}
|
|
4772
4941
|
uint64_t current_manifest_file_size = 0;
|
|
4773
4942
|
uint64_t log_number = 0;
|
|
@@ -4777,10 +4946,10 @@ Status VersionSet::Recover(
|
|
|
4777
4946
|
reporter.status = &log_read_status;
|
|
4778
4947
|
log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
|
|
4779
4948
|
true /* checksum */, 0 /* log_number */);
|
|
4780
|
-
VersionEditHandler handler(
|
|
4781
|
-
|
|
4782
|
-
|
|
4783
|
-
|
|
4949
|
+
VersionEditHandler handler(read_only, column_families,
|
|
4950
|
+
const_cast<VersionSet*>(this),
|
|
4951
|
+
/*track_missing_files=*/false,
|
|
4952
|
+
/*no_error_if_files_missing=*/false, io_tracer_);
|
|
4784
4953
|
handler.Iterate(reader, &log_read_status);
|
|
4785
4954
|
s = handler.status();
|
|
4786
4955
|
if (s.ok()) {
|
|
@@ -4802,7 +4971,7 @@ Status VersionSet::Recover(
|
|
|
4802
4971
|
",min_log_number_to_keep is %" PRIu64 "\n",
|
|
4803
4972
|
manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
|
|
4804
4973
|
last_sequence_.load(), log_number, prev_log_number_,
|
|
4805
|
-
column_family_set_->GetMaxColumnFamily(),
|
|
4974
|
+
column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep());
|
|
4806
4975
|
|
|
4807
4976
|
for (auto cfd : *column_family_set_) {
|
|
4808
4977
|
if (cfd->IsDropped()) {
|
|
@@ -4937,9 +5106,9 @@ Status VersionSet::TryRecoverFromOneManifest(
|
|
|
4937
5106
|
if (!s.ok()) {
|
|
4938
5107
|
return s;
|
|
4939
5108
|
}
|
|
4940
|
-
manifest_file_reader.reset(
|
|
4941
|
-
|
|
4942
|
-
|
|
5109
|
+
manifest_file_reader.reset(new SequentialFileReader(
|
|
5110
|
+
std::move(manifest_file), manifest_path,
|
|
5111
|
+
db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
|
|
4943
5112
|
}
|
|
4944
5113
|
|
|
4945
5114
|
assert(s.ok());
|
|
@@ -4963,9 +5132,6 @@ Status VersionSet::TryRecoverFromOneManifest(
|
|
|
4963
5132
|
Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
|
|
4964
5133
|
const std::string& dbname,
|
|
4965
5134
|
FileSystem* fs) {
|
|
4966
|
-
// these are just for performance reasons, not correcntes,
|
|
4967
|
-
// so we're fine using the defaults
|
|
4968
|
-
FileOptions soptions;
|
|
4969
5135
|
// Read "CURRENT" file, which contains a pointer to the current manifest file
|
|
4970
5136
|
std::string manifest_path;
|
|
4971
5137
|
uint64_t manifest_file_number;
|
|
@@ -4974,16 +5140,24 @@ Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
|
|
|
4974
5140
|
if (!s.ok()) {
|
|
4975
5141
|
return s;
|
|
4976
5142
|
}
|
|
5143
|
+
return ListColumnFamiliesFromManifest(manifest_path, fs, column_families);
|
|
5144
|
+
}
|
|
4977
5145
|
|
|
5146
|
+
Status VersionSet::ListColumnFamiliesFromManifest(
|
|
5147
|
+
const std::string& manifest_path, FileSystem* fs,
|
|
5148
|
+
std::vector<std::string>* column_families) {
|
|
4978
5149
|
std::unique_ptr<SequentialFileReader> file_reader;
|
|
5150
|
+
Status s;
|
|
4979
5151
|
{
|
|
4980
5152
|
std::unique_ptr<FSSequentialFile> file;
|
|
4981
|
-
|
|
5153
|
+
// these are just for performance reasons, not correctness,
|
|
5154
|
+
// so we're fine using the defaults
|
|
5155
|
+
s = fs->NewSequentialFile(manifest_path, FileOptions(), &file, nullptr);
|
|
4982
5156
|
if (!s.ok()) {
|
|
4983
5157
|
return s;
|
|
4984
|
-
|
|
4985
|
-
|
|
4986
|
-
|
|
5158
|
+
}
|
|
5159
|
+
file_reader = std::make_unique<SequentialFileReader>(
|
|
5160
|
+
std::move(file), manifest_path, /*io_tracer=*/nullptr);
|
|
4987
5161
|
}
|
|
4988
5162
|
|
|
4989
5163
|
VersionSet::LogReporter reporter;
|
|
@@ -5022,7 +5196,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
|
|
|
5022
5196
|
WriteController wc(options->delayed_write_rate);
|
|
5023
5197
|
WriteBufferManager wb(options->db_write_buffer_size);
|
|
5024
5198
|
VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
|
|
5025
|
-
nullptr /*BlockCacheTracer*/, nullptr /*IOTracer
|
|
5199
|
+
nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
|
|
5200
|
+
/*db_session_id*/ "");
|
|
5026
5201
|
Status status;
|
|
5027
5202
|
|
|
5028
5203
|
std::vector<ColumnFamilyDescriptor> dummy;
|
|
@@ -5104,7 +5279,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
|
|
|
5104
5279
|
}
|
|
5105
5280
|
|
|
5106
5281
|
// Get the checksum information including the checksum and checksum function
|
|
5107
|
-
// name of all SST files in VersionSet. Store the information in
|
|
5282
|
+
// name of all SST and blob files in VersionSet. Store the information in
|
|
5108
5283
|
// FileChecksumList which contains a map from file number to its checksum info.
|
|
5109
5284
|
// If DB is not running, make sure call VersionSet::Recover() to load the file
|
|
5110
5285
|
// metadata from Manifest to VersionSet before calling this function.
|
|
@@ -5118,35 +5293,70 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
|
|
|
5118
5293
|
checksum_list->reset();
|
|
5119
5294
|
|
|
5120
5295
|
for (auto cfd : *column_family_set_) {
|
|
5296
|
+
assert(cfd);
|
|
5297
|
+
|
|
5121
5298
|
if (cfd->IsDropped() || !cfd->initialized()) {
|
|
5122
5299
|
continue;
|
|
5123
5300
|
}
|
|
5301
|
+
|
|
5302
|
+
const auto* current = cfd->current();
|
|
5303
|
+
assert(current);
|
|
5304
|
+
|
|
5305
|
+
const auto* vstorage = current->storage_info();
|
|
5306
|
+
assert(vstorage);
|
|
5307
|
+
|
|
5308
|
+
/* SST files */
|
|
5124
5309
|
for (int level = 0; level < cfd->NumberLevels(); level++) {
|
|
5125
|
-
|
|
5126
|
-
|
|
5310
|
+
const auto& level_files = vstorage->LevelFiles(level);
|
|
5311
|
+
|
|
5312
|
+
for (const auto& file : level_files) {
|
|
5313
|
+
assert(file);
|
|
5314
|
+
|
|
5127
5315
|
s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(),
|
|
5128
5316
|
file->file_checksum,
|
|
5129
5317
|
file->file_checksum_func_name);
|
|
5130
5318
|
if (!s.ok()) {
|
|
5131
|
-
|
|
5319
|
+
return s;
|
|
5132
5320
|
}
|
|
5133
5321
|
}
|
|
5322
|
+
}
|
|
5323
|
+
|
|
5324
|
+
/* Blob files */
|
|
5325
|
+
const auto& blob_files = vstorage->GetBlobFiles();
|
|
5326
|
+
for (const auto& meta : blob_files) {
|
|
5327
|
+
assert(meta);
|
|
5328
|
+
|
|
5329
|
+
std::string checksum_value = meta->GetChecksumValue();
|
|
5330
|
+
std::string checksum_method = meta->GetChecksumMethod();
|
|
5331
|
+
assert(checksum_value.empty() == checksum_method.empty());
|
|
5332
|
+
if (meta->GetChecksumMethod().empty()) {
|
|
5333
|
+
checksum_value = kUnknownFileChecksum;
|
|
5334
|
+
checksum_method = kUnknownFileChecksumFuncName;
|
|
5335
|
+
}
|
|
5336
|
+
|
|
5337
|
+
s = checksum_list->InsertOneFileChecksum(meta->GetBlobFileNumber(),
|
|
5338
|
+
checksum_value, checksum_method);
|
|
5134
5339
|
if (!s.ok()) {
|
|
5135
|
-
|
|
5340
|
+
return s;
|
|
5136
5341
|
}
|
|
5137
5342
|
}
|
|
5138
|
-
if (!s.ok()) {
|
|
5139
|
-
break;
|
|
5140
|
-
}
|
|
5141
5343
|
}
|
|
5344
|
+
|
|
5142
5345
|
return s;
|
|
5143
5346
|
}
|
|
5144
5347
|
|
|
5145
5348
|
Status VersionSet::DumpManifest(Options& options, std::string& dscname,
|
|
5146
5349
|
bool verbose, bool hex, bool json) {
|
|
5350
|
+
assert(options.env);
|
|
5351
|
+
std::vector<std::string> column_families;
|
|
5352
|
+
Status s = ListColumnFamiliesFromManifest(
|
|
5353
|
+
dscname, options.env->GetFileSystem().get(), &column_families);
|
|
5354
|
+
if (!s.ok()) {
|
|
5355
|
+
return s;
|
|
5356
|
+
}
|
|
5357
|
+
|
|
5147
5358
|
// Open the specified manifest file.
|
|
5148
5359
|
std::unique_ptr<SequentialFileReader> file_reader;
|
|
5149
|
-
Status s;
|
|
5150
5360
|
{
|
|
5151
5361
|
std::unique_ptr<FSSequentialFile> file;
|
|
5152
5362
|
const std::shared_ptr<FileSystem>& fs = options.env->GetFileSystem();
|
|
@@ -5157,14 +5367,16 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
|
|
|
5157
5367
|
if (!s.ok()) {
|
|
5158
5368
|
return s;
|
|
5159
5369
|
}
|
|
5160
|
-
file_reader
|
|
5161
|
-
std::move(file), dscname, db_options_->log_readahead_size, io_tracer_)
|
|
5370
|
+
file_reader = std::make_unique<SequentialFileReader>(
|
|
5371
|
+
std::move(file), dscname, db_options_->log_readahead_size, io_tracer_);
|
|
5162
5372
|
}
|
|
5163
5373
|
|
|
5164
|
-
std::vector<ColumnFamilyDescriptor>
|
|
5165
|
-
|
|
5166
|
-
|
|
5167
|
-
|
|
5374
|
+
std::vector<ColumnFamilyDescriptor> cf_descs;
|
|
5375
|
+
for (const auto& cf : column_families) {
|
|
5376
|
+
cf_descs.emplace_back(cf, options);
|
|
5377
|
+
}
|
|
5378
|
+
|
|
5379
|
+
DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json);
|
|
5168
5380
|
{
|
|
5169
5381
|
VersionSet::LogReporter reporter;
|
|
5170
5382
|
reporter.status = &s;
|
|
@@ -5186,9 +5398,9 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) {
|
|
|
5186
5398
|
}
|
|
5187
5399
|
// Called only either from ::LogAndApply which is protected by mutex or during
|
|
5188
5400
|
// recovery which is single-threaded.
|
|
5189
|
-
void VersionSet::
|
|
5190
|
-
if (
|
|
5191
|
-
|
|
5401
|
+
void VersionSet::MarkMinLogNumberToKeep(uint64_t number) {
|
|
5402
|
+
if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) {
|
|
5403
|
+
min_log_number_to_keep_.store(number, std::memory_order_relaxed);
|
|
5192
5404
|
}
|
|
5193
5405
|
}
|
|
5194
5406
|
|
|
@@ -5268,28 +5480,33 @@ Status VersionSet::WriteCurrentStateToManifest(
|
|
|
5268
5480
|
VersionEdit edit;
|
|
5269
5481
|
edit.SetColumnFamily(cfd->GetID());
|
|
5270
5482
|
|
|
5271
|
-
|
|
5272
|
-
assert(
|
|
5483
|
+
const auto* current = cfd->current();
|
|
5484
|
+
assert(current);
|
|
5485
|
+
|
|
5486
|
+
const auto* vstorage = current->storage_info();
|
|
5487
|
+
assert(vstorage);
|
|
5273
5488
|
|
|
5274
5489
|
for (int level = 0; level < cfd->NumberLevels(); level++) {
|
|
5275
|
-
|
|
5276
|
-
|
|
5277
|
-
|
|
5278
|
-
|
|
5279
|
-
|
|
5280
|
-
|
|
5281
|
-
|
|
5282
|
-
|
|
5490
|
+
const auto& level_files = vstorage->LevelFiles(level);
|
|
5491
|
+
|
|
5492
|
+
for (const auto& f : level_files) {
|
|
5493
|
+
assert(f);
|
|
5494
|
+
|
|
5495
|
+
edit.AddFile(
|
|
5496
|
+
level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
|
|
5497
|
+
f->smallest, f->largest, f->fd.smallest_seqno,
|
|
5498
|
+
f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
|
|
5499
|
+
f->oldest_blob_file_number, f->oldest_ancester_time,
|
|
5500
|
+
f->file_creation_time, f->file_checksum,
|
|
5501
|
+
f->file_checksum_func_name, f->min_timestamp, f->max_timestamp);
|
|
5283
5502
|
}
|
|
5284
5503
|
}
|
|
5285
5504
|
|
|
5286
|
-
const auto& blob_files =
|
|
5287
|
-
for (const auto&
|
|
5288
|
-
const uint64_t blob_file_number = pair.first;
|
|
5289
|
-
const auto& meta = pair.second;
|
|
5290
|
-
|
|
5505
|
+
const auto& blob_files = vstorage->GetBlobFiles();
|
|
5506
|
+
for (const auto& meta : blob_files) {
|
|
5291
5507
|
assert(meta);
|
|
5292
|
-
|
|
5508
|
+
|
|
5509
|
+
const uint64_t blob_file_number = meta->GetBlobFileNumber();
|
|
5293
5510
|
|
|
5294
5511
|
edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(),
|
|
5295
5512
|
meta->GetTotalBlobBytes(), meta->GetChecksumMethod(),
|
|
@@ -5309,7 +5526,7 @@ Status VersionSet::WriteCurrentStateToManifest(
|
|
|
5309
5526
|
// min_log_number_to_keep is for the whole db, not for specific column family.
|
|
5310
5527
|
// So it does not need to be set for every column family, just need to be set once.
|
|
5311
5528
|
// Since default CF can never be dropped, we set the min_log to the default CF here.
|
|
5312
|
-
uint64_t min_log =
|
|
5529
|
+
uint64_t min_log = min_log_number_to_keep();
|
|
5313
5530
|
if (min_log != 0) {
|
|
5314
5531
|
edit.SetMinLogNumberToKeep(min_log);
|
|
5315
5532
|
}
|
|
@@ -5319,6 +5536,9 @@ Status VersionSet::WriteCurrentStateToManifest(
|
|
|
5319
5536
|
if (!full_history_ts_low.empty()) {
|
|
5320
5537
|
edit.SetFullHistoryTsLow(full_history_ts_low);
|
|
5321
5538
|
}
|
|
5539
|
+
|
|
5540
|
+
edit.SetLastSequence(descriptor_last_sequence_);
|
|
5541
|
+
|
|
5322
5542
|
std::string record;
|
|
5323
5543
|
if (!edit.EncodeTo(&record)) {
|
|
5324
5544
|
return Status::Corruption(
|
|
@@ -5489,7 +5709,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
|
|
|
5489
5709
|
if (table_cache != nullptr) {
|
|
5490
5710
|
result = table_cache->ApproximateOffsetOf(
|
|
5491
5711
|
key, f.file_metadata->fd, caller, icmp,
|
|
5492
|
-
v->GetMutableCFOptions().prefix_extractor
|
|
5712
|
+
v->GetMutableCFOptions().prefix_extractor);
|
|
5493
5713
|
}
|
|
5494
5714
|
}
|
|
5495
5715
|
return result;
|
|
@@ -5529,7 +5749,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
|
|
|
5529
5749
|
}
|
|
5530
5750
|
return table_cache->ApproximateSize(
|
|
5531
5751
|
start, end, f.file_metadata->fd, caller, icmp,
|
|
5532
|
-
v->GetMutableCFOptions().prefix_extractor
|
|
5752
|
+
v->GetMutableCFOptions().prefix_extractor);
|
|
5533
5753
|
}
|
|
5534
5754
|
|
|
5535
5755
|
void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_table_files,
|
|
@@ -5603,7 +5823,9 @@ void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_table_files,
|
|
|
5603
5823
|
InternalIterator* VersionSet::MakeInputIterator(
|
|
5604
5824
|
const ReadOptions& read_options, const Compaction* c,
|
|
5605
5825
|
RangeDelAggregator* range_del_agg,
|
|
5606
|
-
const FileOptions& file_options_compactions
|
|
5826
|
+
const FileOptions& file_options_compactions,
|
|
5827
|
+
const std::optional<const Slice>& start,
|
|
5828
|
+
const std::optional<const Slice>& end) {
|
|
5607
5829
|
auto cfd = c->column_family_data();
|
|
5608
5830
|
// Level-0 files have to be merged together. For other levels,
|
|
5609
5831
|
// we will make a concatenating iterator per level.
|
|
@@ -5618,10 +5840,25 @@ InternalIterator* VersionSet::MakeInputIterator(
|
|
|
5618
5840
|
if (c->level(which) == 0) {
|
|
5619
5841
|
const LevelFilesBrief* flevel = c->input_levels(which);
|
|
5620
5842
|
for (size_t i = 0; i < flevel->num_files; i++) {
|
|
5843
|
+
const FileMetaData& fmd = *flevel->files[i].file_metadata;
|
|
5844
|
+
if (start.has_value() &&
|
|
5845
|
+
cfd->user_comparator()->Compare(start.value(),
|
|
5846
|
+
fmd.largest.user_key()) > 0) {
|
|
5847
|
+
continue;
|
|
5848
|
+
}
|
|
5849
|
+
// We should be able to filter out the case where the end key
|
|
5850
|
+
// equals to the end boundary, since the end key is exclusive.
|
|
5851
|
+
// We try to be extra safe here.
|
|
5852
|
+
if (end.has_value() &&
|
|
5853
|
+
cfd->user_comparator()->Compare(end.value(),
|
|
5854
|
+
fmd.smallest.user_key()) < 0) {
|
|
5855
|
+
continue;
|
|
5856
|
+
}
|
|
5857
|
+
|
|
5621
5858
|
list[num++] = cfd->table_cache()->NewIterator(
|
|
5622
5859
|
read_options, file_options_compactions,
|
|
5623
|
-
cfd->internal_comparator(),
|
|
5624
|
-
|
|
5860
|
+
cfd->internal_comparator(), fmd, range_del_agg,
|
|
5861
|
+
c->mutable_cf_options()->prefix_extractor,
|
|
5625
5862
|
/*table_reader_ptr=*/nullptr,
|
|
5626
5863
|
/*file_read_hist=*/nullptr, TableReaderCaller::kCompaction,
|
|
5627
5864
|
/*arena=*/nullptr,
|
|
@@ -5637,7 +5874,7 @@ InternalIterator* VersionSet::MakeInputIterator(
|
|
|
5637
5874
|
list[num++] = new LevelIterator(
|
|
5638
5875
|
cfd->table_cache(), read_options, file_options_compactions,
|
|
5639
5876
|
cfd->internal_comparator(), c->input_levels(which),
|
|
5640
|
-
c->mutable_cf_options()->prefix_extractor
|
|
5877
|
+
c->mutable_cf_options()->prefix_extractor,
|
|
5641
5878
|
/*should_sample=*/false,
|
|
5642
5879
|
/*no per level latency histogram=*/nullptr,
|
|
5643
5880
|
TableReaderCaller::kCompaction, /*skip_filters=*/false,
|
|
@@ -5654,57 +5891,6 @@ InternalIterator* VersionSet::MakeInputIterator(
|
|
|
5654
5891
|
return result;
|
|
5655
5892
|
}
|
|
5656
5893
|
|
|
5657
|
-
// verify that the files listed in this compaction are present
|
|
5658
|
-
// in the current version
|
|
5659
|
-
bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
|
|
5660
|
-
#ifndef NDEBUG
|
|
5661
|
-
Version* version = c->column_family_data()->current();
|
|
5662
|
-
const VersionStorageInfo* vstorage = version->storage_info();
|
|
5663
|
-
if (c->input_version() != version) {
|
|
5664
|
-
ROCKS_LOG_INFO(
|
|
5665
|
-
db_options_->info_log,
|
|
5666
|
-
"[%s] compaction output being applied to a different base version from"
|
|
5667
|
-
" input version",
|
|
5668
|
-
c->column_family_data()->GetName().c_str());
|
|
5669
|
-
|
|
5670
|
-
if (vstorage->compaction_style_ == kCompactionStyleLevel &&
|
|
5671
|
-
c->start_level() == 0 && c->num_input_levels() > 2U) {
|
|
5672
|
-
// We are doing a L0->base_level compaction. The assumption is if
|
|
5673
|
-
// base level is not L1, levels from L1 to base_level - 1 is empty.
|
|
5674
|
-
// This is ensured by having one compaction from L0 going on at the
|
|
5675
|
-
// same time in level-based compaction. So that during the time, no
|
|
5676
|
-
// compaction/flush can put files to those levels.
|
|
5677
|
-
for (int l = c->start_level() + 1; l < c->output_level(); l++) {
|
|
5678
|
-
if (vstorage->NumLevelFiles(l) != 0) {
|
|
5679
|
-
return false;
|
|
5680
|
-
}
|
|
5681
|
-
}
|
|
5682
|
-
}
|
|
5683
|
-
}
|
|
5684
|
-
|
|
5685
|
-
for (size_t input = 0; input < c->num_input_levels(); ++input) {
|
|
5686
|
-
int level = c->level(input);
|
|
5687
|
-
for (size_t i = 0; i < c->num_input_files(input); ++i) {
|
|
5688
|
-
uint64_t number = c->input(input, i)->fd.GetNumber();
|
|
5689
|
-
bool found = false;
|
|
5690
|
-
for (size_t j = 0; j < vstorage->files_[level].size(); j++) {
|
|
5691
|
-
FileMetaData* f = vstorage->files_[level][j];
|
|
5692
|
-
if (f->fd.GetNumber() == number) {
|
|
5693
|
-
found = true;
|
|
5694
|
-
break;
|
|
5695
|
-
}
|
|
5696
|
-
}
|
|
5697
|
-
if (!found) {
|
|
5698
|
-
return false; // input files non existent in current version
|
|
5699
|
-
}
|
|
5700
|
-
}
|
|
5701
|
-
}
|
|
5702
|
-
#else
|
|
5703
|
-
(void)c;
|
|
5704
|
-
#endif
|
|
5705
|
-
return true; // everything good
|
|
5706
|
-
}
|
|
5707
|
-
|
|
5708
5894
|
Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
|
|
5709
5895
|
FileMetaData** meta,
|
|
5710
5896
|
ColumnFamilyData** cfd) {
|
|
@@ -5745,11 +5931,13 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
|
|
|
5745
5931
|
assert(!cfd->ioptions()->cf_paths.empty());
|
|
5746
5932
|
filemetadata.db_path = cfd->ioptions()->cf_paths.back().path;
|
|
5747
5933
|
}
|
|
5934
|
+
filemetadata.directory = filemetadata.db_path;
|
|
5748
5935
|
const uint64_t file_number = file->fd.GetNumber();
|
|
5749
5936
|
filemetadata.name = MakeTableFileName("", file_number);
|
|
5937
|
+
filemetadata.relative_filename = filemetadata.name.substr(1);
|
|
5750
5938
|
filemetadata.file_number = file_number;
|
|
5751
5939
|
filemetadata.level = level;
|
|
5752
|
-
filemetadata.size =
|
|
5940
|
+
filemetadata.size = file->fd.GetFileSize();
|
|
5753
5941
|
filemetadata.smallestkey = file->smallest.user_key().ToString();
|
|
5754
5942
|
filemetadata.largestkey = file->largest.user_key().ToString();
|
|
5755
5943
|
filemetadata.smallest_seqno = file->fd.smallest_seqno;
|
|
@@ -5762,6 +5950,9 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
|
|
|
5762
5950
|
filemetadata.oldest_blob_file_number = file->oldest_blob_file_number;
|
|
5763
5951
|
filemetadata.file_checksum = file->file_checksum;
|
|
5764
5952
|
filemetadata.file_checksum_func_name = file->file_checksum_func_name;
|
|
5953
|
+
filemetadata.temperature = file->temperature;
|
|
5954
|
+
filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
|
|
5955
|
+
filemetadata.file_creation_time = file->TryGetFileCreationTime();
|
|
5765
5956
|
metadata->push_back(filemetadata);
|
|
5766
5957
|
}
|
|
5767
5958
|
}
|
|
@@ -5820,9 +6011,10 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
|
|
|
5820
6011
|
*new_cfd->GetLatestMutableCFOptions(), io_tracer_,
|
|
5821
6012
|
current_version_number_++);
|
|
5822
6013
|
|
|
5823
|
-
|
|
5824
|
-
|
|
5825
|
-
|
|
6014
|
+
constexpr bool update_stats = false;
|
|
6015
|
+
|
|
6016
|
+
v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats);
|
|
6017
|
+
|
|
5826
6018
|
AppendVersion(new_cfd, v);
|
|
5827
6019
|
// GetLatestMutableCFOptions() is safe here without mutex since the
|
|
5828
6020
|
// cfd is not available to client
|
|
@@ -5858,6 +6050,34 @@ uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
|
|
|
5858
6050
|
return total_files_size;
|
|
5859
6051
|
}
|
|
5860
6052
|
|
|
6053
|
+
uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
|
|
6054
|
+
std::unordered_set<uint64_t> unique_blob_files;
|
|
6055
|
+
|
|
6056
|
+
uint64_t all_versions_blob_file_size = 0;
|
|
6057
|
+
|
|
6058
|
+
for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
|
|
6059
|
+
// iterate all the versions
|
|
6060
|
+
const auto* vstorage = v->storage_info();
|
|
6061
|
+
assert(vstorage);
|
|
6062
|
+
|
|
6063
|
+
const auto& blob_files = vstorage->GetBlobFiles();
|
|
6064
|
+
|
|
6065
|
+
for (const auto& meta : blob_files) {
|
|
6066
|
+
assert(meta);
|
|
6067
|
+
|
|
6068
|
+
const uint64_t blob_file_number = meta->GetBlobFileNumber();
|
|
6069
|
+
|
|
6070
|
+
if (unique_blob_files.find(blob_file_number) == unique_blob_files.end()) {
|
|
6071
|
+
// find Blob file that has not been counted
|
|
6072
|
+
unique_blob_files.insert(blob_file_number);
|
|
6073
|
+
all_versions_blob_file_size += meta->GetBlobFileSize();
|
|
6074
|
+
}
|
|
6075
|
+
}
|
|
6076
|
+
}
|
|
6077
|
+
|
|
6078
|
+
return all_versions_blob_file_size;
|
|
6079
|
+
}
|
|
6080
|
+
|
|
5861
6081
|
Status VersionSet::VerifyFileMetadata(const std::string& fpath,
|
|
5862
6082
|
const FileMetaData& meta) const {
|
|
5863
6083
|
uint64_t fsize = 0;
|
|
@@ -5877,8 +6097,8 @@ ReactiveVersionSet::ReactiveVersionSet(
|
|
|
5877
6097
|
const std::shared_ptr<IOTracer>& io_tracer)
|
|
5878
6098
|
: VersionSet(dbname, _db_options, _file_options, table_cache,
|
|
5879
6099
|
write_buffer_manager, write_controller,
|
|
5880
|
-
/*block_cache_tracer=*/nullptr, io_tracer
|
|
5881
|
-
|
|
6100
|
+
/*block_cache_tracer=*/nullptr, io_tracer,
|
|
6101
|
+
/*db_session_id*/ "") {}
|
|
5882
6102
|
|
|
5883
6103
|
ReactiveVersionSet::~ReactiveVersionSet() {}
|
|
5884
6104
|
|
|
@@ -5891,443 +6111,124 @@ Status ReactiveVersionSet::Recover(
|
|
|
5891
6111
|
assert(manifest_reporter != nullptr);
|
|
5892
6112
|
assert(manifest_reader_status != nullptr);
|
|
5893
6113
|
|
|
5894
|
-
std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_options;
|
|
5895
|
-
for (const auto& cf : column_families) {
|
|
5896
|
-
cf_name_to_options.insert({cf.name, cf.options});
|
|
5897
|
-
}
|
|
5898
|
-
|
|
5899
|
-
// add default column family
|
|
5900
|
-
auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
|
|
5901
|
-
if (default_cf_iter == cf_name_to_options.end()) {
|
|
5902
|
-
return Status::InvalidArgument("Default column family not specified");
|
|
5903
|
-
}
|
|
5904
|
-
VersionEdit default_cf_edit;
|
|
5905
|
-
default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
|
|
5906
|
-
default_cf_edit.SetColumnFamily(0);
|
|
5907
|
-
ColumnFamilyData* default_cfd =
|
|
5908
|
-
CreateColumnFamily(default_cf_iter->second, &default_cf_edit);
|
|
5909
|
-
// In recovery, nobody else can access it, so it's fine to set it to be
|
|
5910
|
-
// initialized earlier.
|
|
5911
|
-
default_cfd->set_initialized();
|
|
5912
|
-
VersionBuilderMap builders;
|
|
5913
|
-
std::unordered_map<int, std::string> column_families_not_found;
|
|
5914
|
-
builders.insert(
|
|
5915
|
-
std::make_pair(0, std::unique_ptr<BaseReferencedVersionBuilder>(
|
|
5916
|
-
new BaseReferencedVersionBuilder(default_cfd))));
|
|
5917
|
-
|
|
5918
6114
|
manifest_reader_status->reset(new Status());
|
|
5919
6115
|
manifest_reporter->reset(new LogReporter());
|
|
5920
6116
|
static_cast_with_check<LogReporter>(manifest_reporter->get())->status =
|
|
5921
6117
|
manifest_reader_status->get();
|
|
5922
6118
|
Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader);
|
|
5923
|
-
|
|
5924
|
-
|
|
5925
|
-
int retry = 0;
|
|
5926
|
-
VersionEdit version_edit;
|
|
5927
|
-
while (s.ok() && retry < 1) {
|
|
5928
|
-
assert(reader != nullptr);
|
|
5929
|
-
s = ReadAndRecover(*reader, &read_buffer_, cf_name_to_options,
|
|
5930
|
-
column_families_not_found, builders,
|
|
5931
|
-
manifest_reader_status->get(), &version_edit);
|
|
5932
|
-
if (s.ok()) {
|
|
5933
|
-
bool enough = version_edit.has_next_file_number_ &&
|
|
5934
|
-
version_edit.has_log_number_ &&
|
|
5935
|
-
version_edit.has_last_sequence_;
|
|
5936
|
-
if (enough) {
|
|
5937
|
-
for (const auto& cf : column_families) {
|
|
5938
|
-
auto cfd = column_family_set_->GetColumnFamily(cf.name);
|
|
5939
|
-
if (cfd == nullptr) {
|
|
5940
|
-
enough = false;
|
|
5941
|
-
break;
|
|
5942
|
-
}
|
|
5943
|
-
}
|
|
5944
|
-
}
|
|
5945
|
-
if (enough) {
|
|
5946
|
-
for (const auto& cf : column_families) {
|
|
5947
|
-
auto cfd = column_family_set_->GetColumnFamily(cf.name);
|
|
5948
|
-
assert(cfd != nullptr);
|
|
5949
|
-
if (!cfd->IsDropped()) {
|
|
5950
|
-
auto builder_iter = builders.find(cfd->GetID());
|
|
5951
|
-
assert(builder_iter != builders.end());
|
|
5952
|
-
auto builder = builder_iter->second->version_builder();
|
|
5953
|
-
assert(builder != nullptr);
|
|
5954
|
-
s = builder->LoadTableHandlers(
|
|
5955
|
-
cfd->internal_stats(), db_options_->max_file_opening_threads,
|
|
5956
|
-
false /* prefetch_index_and_filter_in_cache */,
|
|
5957
|
-
true /* is_initial_load */,
|
|
5958
|
-
cfd->GetLatestMutableCFOptions()->prefix_extractor.get(),
|
|
5959
|
-
MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
|
|
5960
|
-
if (!s.ok()) {
|
|
5961
|
-
enough = false;
|
|
5962
|
-
if (s.IsPathNotFound()) {
|
|
5963
|
-
s = Status::OK();
|
|
5964
|
-
}
|
|
5965
|
-
break;
|
|
5966
|
-
}
|
|
5967
|
-
}
|
|
5968
|
-
}
|
|
5969
|
-
}
|
|
5970
|
-
if (enough) {
|
|
5971
|
-
break;
|
|
5972
|
-
}
|
|
5973
|
-
}
|
|
5974
|
-
++retry;
|
|
5975
|
-
}
|
|
5976
|
-
|
|
5977
|
-
if (s.ok()) {
|
|
5978
|
-
if (!version_edit.has_prev_log_number_) {
|
|
5979
|
-
version_edit.prev_log_number_ = 0;
|
|
5980
|
-
}
|
|
5981
|
-
column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_);
|
|
5982
|
-
|
|
5983
|
-
MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_);
|
|
5984
|
-
MarkFileNumberUsed(version_edit.prev_log_number_);
|
|
5985
|
-
MarkFileNumberUsed(version_edit.log_number_);
|
|
5986
|
-
|
|
5987
|
-
for (auto cfd : *column_family_set_) {
|
|
5988
|
-
assert(builders.count(cfd->GetID()) > 0);
|
|
5989
|
-
auto builder = builders[cfd->GetID()]->version_builder();
|
|
5990
|
-
if (!builder->CheckConsistencyForNumLevels()) {
|
|
5991
|
-
s = Status::InvalidArgument(
|
|
5992
|
-
"db has more levels than options.num_levels");
|
|
5993
|
-
break;
|
|
5994
|
-
}
|
|
5995
|
-
}
|
|
6119
|
+
if (!s.ok()) {
|
|
6120
|
+
return s;
|
|
5996
6121
|
}
|
|
6122
|
+
log::Reader* reader = manifest_reader->get();
|
|
6123
|
+
assert(reader);
|
|
5997
6124
|
|
|
5998
|
-
|
|
5999
|
-
|
|
6000
|
-
if (cfd->IsDropped()) {
|
|
6001
|
-
continue;
|
|
6002
|
-
}
|
|
6003
|
-
assert(cfd->initialized());
|
|
6004
|
-
auto builders_iter = builders.find(cfd->GetID());
|
|
6005
|
-
assert(builders_iter != builders.end());
|
|
6006
|
-
auto* builder = builders_iter->second->version_builder();
|
|
6125
|
+
manifest_tailer_.reset(new ManifestTailer(
|
|
6126
|
+
column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_));
|
|
6007
6127
|
|
|
6008
|
-
|
|
6009
|
-
*cfd->GetLatestMutableCFOptions(), io_tracer_,
|
|
6010
|
-
current_version_number_++);
|
|
6011
|
-
s = builder->SaveTo(v->storage_info());
|
|
6128
|
+
manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
|
|
6012
6129
|
|
|
6013
|
-
|
|
6014
|
-
// Install recovered version
|
|
6015
|
-
v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
|
|
6016
|
-
!(db_options_->skip_stats_update_on_db_open));
|
|
6017
|
-
AppendVersion(cfd, v);
|
|
6018
|
-
} else {
|
|
6019
|
-
ROCKS_LOG_ERROR(db_options_->info_log,
|
|
6020
|
-
"[%s]: inconsistent version: %s\n",
|
|
6021
|
-
cfd->GetName().c_str(), s.ToString().c_str());
|
|
6022
|
-
delete v;
|
|
6023
|
-
break;
|
|
6024
|
-
}
|
|
6025
|
-
}
|
|
6026
|
-
}
|
|
6027
|
-
if (s.ok()) {
|
|
6028
|
-
next_file_number_.store(version_edit.next_file_number_ + 1);
|
|
6029
|
-
last_allocated_sequence_ = version_edit.last_sequence_;
|
|
6030
|
-
last_published_sequence_ = version_edit.last_sequence_;
|
|
6031
|
-
last_sequence_ = version_edit.last_sequence_;
|
|
6032
|
-
prev_log_number_ = version_edit.prev_log_number_;
|
|
6033
|
-
for (auto cfd : *column_family_set_) {
|
|
6034
|
-
if (cfd->IsDropped()) {
|
|
6035
|
-
continue;
|
|
6036
|
-
}
|
|
6037
|
-
ROCKS_LOG_INFO(db_options_->info_log,
|
|
6038
|
-
"Column family [%s] (ID %u), log number is %" PRIu64 "\n",
|
|
6039
|
-
cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
|
|
6040
|
-
}
|
|
6041
|
-
}
|
|
6042
|
-
return s;
|
|
6130
|
+
return manifest_tailer_->status();
|
|
6043
6131
|
}
|
|
6044
6132
|
|
|
6045
6133
|
Status ReactiveVersionSet::ReadAndApply(
|
|
6046
6134
|
InstrumentedMutex* mu,
|
|
6047
6135
|
std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
|
|
6136
|
+
Status* manifest_read_status,
|
|
6048
6137
|
std::unordered_set<ColumnFamilyData*>* cfds_changed) {
|
|
6049
6138
|
assert(manifest_reader != nullptr);
|
|
6050
6139
|
assert(cfds_changed != nullptr);
|
|
6051
6140
|
mu->AssertHeld();
|
|
6052
6141
|
|
|
6053
6142
|
Status s;
|
|
6054
|
-
|
|
6055
|
-
|
|
6056
|
-
|
|
6057
|
-
|
|
6058
|
-
|
|
6059
|
-
std::string old_manifest_path = reader->file()->file_name();
|
|
6060
|
-
while (reader->ReadRecord(&record, &scratch)) {
|
|
6061
|
-
VersionEdit edit;
|
|
6062
|
-
s = edit.DecodeFrom(record);
|
|
6063
|
-
if (!s.ok()) {
|
|
6064
|
-
break;
|
|
6065
|
-
}
|
|
6066
|
-
|
|
6067
|
-
// Skip the first VersionEdits of each MANIFEST generated by
|
|
6068
|
-
// VersionSet::WriteCurrentStatetoManifest.
|
|
6069
|
-
if (number_of_edits_to_skip_ > 0) {
|
|
6070
|
-
ColumnFamilyData* cfd =
|
|
6071
|
-
column_family_set_->GetColumnFamily(edit.column_family_);
|
|
6072
|
-
if (cfd != nullptr && !cfd->IsDropped()) {
|
|
6073
|
-
--number_of_edits_to_skip_;
|
|
6074
|
-
}
|
|
6075
|
-
continue;
|
|
6076
|
-
}
|
|
6077
|
-
|
|
6078
|
-
s = read_buffer_.AddEdit(&edit);
|
|
6079
|
-
if (!s.ok()) {
|
|
6080
|
-
break;
|
|
6081
|
-
}
|
|
6082
|
-
VersionEdit temp_edit;
|
|
6083
|
-
if (edit.is_in_atomic_group_) {
|
|
6084
|
-
if (read_buffer_.IsFull()) {
|
|
6085
|
-
// Apply edits in an atomic group when we have read all edits in the
|
|
6086
|
-
// group.
|
|
6087
|
-
for (auto& e : read_buffer_.replay_buffer()) {
|
|
6088
|
-
s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit);
|
|
6089
|
-
if (!s.ok()) {
|
|
6090
|
-
break;
|
|
6091
|
-
}
|
|
6092
|
-
applied_edits++;
|
|
6093
|
-
}
|
|
6094
|
-
if (!s.ok()) {
|
|
6095
|
-
break;
|
|
6096
|
-
}
|
|
6097
|
-
read_buffer_.Clear();
|
|
6098
|
-
}
|
|
6099
|
-
} else {
|
|
6100
|
-
// Apply a normal edit immediately.
|
|
6101
|
-
s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit);
|
|
6102
|
-
if (s.ok()) {
|
|
6103
|
-
applied_edits++;
|
|
6104
|
-
} else {
|
|
6105
|
-
break;
|
|
6106
|
-
}
|
|
6107
|
-
}
|
|
6108
|
-
}
|
|
6109
|
-
if (!s.ok()) {
|
|
6110
|
-
// Clear the buffer if we fail to decode/apply an edit.
|
|
6111
|
-
read_buffer_.Clear();
|
|
6112
|
-
}
|
|
6113
|
-
// It's possible that:
|
|
6114
|
-
// 1) s.IsCorruption(), indicating the current MANIFEST is corrupted.
|
|
6115
|
-
// Or the version(s) rebuilt from tailing the MANIFEST is inconsistent.
|
|
6116
|
-
// 2) we have finished reading the current MANIFEST.
|
|
6117
|
-
// 3) we have encountered an IOError reading the current MANIFEST.
|
|
6118
|
-
// We need to look for the next MANIFEST and start from there. If we cannot
|
|
6119
|
-
// find the next MANIFEST, we should exit the loop.
|
|
6120
|
-
Status tmp_s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
|
|
6121
|
-
reader = manifest_reader->get();
|
|
6122
|
-
if (tmp_s.ok()) {
|
|
6123
|
-
if (reader->file()->file_name() == old_manifest_path) {
|
|
6124
|
-
// Still processing the same MANIFEST, thus no need to continue this
|
|
6125
|
-
// loop since no record is available if we have reached here.
|
|
6126
|
-
break;
|
|
6127
|
-
} else {
|
|
6128
|
-
// We have switched to a new MANIFEST whose first records have been
|
|
6129
|
-
// generated by VersionSet::WriteCurrentStatetoManifest. Since the
|
|
6130
|
-
// secondary instance has already finished recovering upon start, there
|
|
6131
|
-
// is no need for the secondary to process these records. Actually, if
|
|
6132
|
-
// the secondary were to replay these records, the secondary may end up
|
|
6133
|
-
// adding the same SST files AGAIN to each column family, causing
|
|
6134
|
-
// consistency checks done by VersionBuilder to fail. Therefore, we
|
|
6135
|
-
// record the number of records to skip at the beginning of the new
|
|
6136
|
-
// MANIFEST and ignore them.
|
|
6137
|
-
number_of_edits_to_skip_ = 0;
|
|
6138
|
-
for (auto* cfd : *column_family_set_) {
|
|
6139
|
-
if (cfd->IsDropped()) {
|
|
6140
|
-
continue;
|
|
6141
|
-
}
|
|
6142
|
-
// Increase number_of_edits_to_skip by 2 because
|
|
6143
|
-
// WriteCurrentStatetoManifest() writes 2 version edits for each
|
|
6144
|
-
// column family at the beginning of the newly-generated MANIFEST.
|
|
6145
|
-
// TODO(yanqin) remove hard-coded value.
|
|
6146
|
-
if (db_options_->write_dbid_to_manifest) {
|
|
6147
|
-
number_of_edits_to_skip_ += 3;
|
|
6148
|
-
} else {
|
|
6149
|
-
number_of_edits_to_skip_ += 2;
|
|
6150
|
-
}
|
|
6151
|
-
}
|
|
6152
|
-
s = tmp_s;
|
|
6153
|
-
}
|
|
6154
|
-
}
|
|
6143
|
+
log::Reader* reader = manifest_reader->get();
|
|
6144
|
+
assert(reader);
|
|
6145
|
+
s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
|
|
6146
|
+
if (!s.ok()) {
|
|
6147
|
+
return s;
|
|
6155
6148
|
}
|
|
6156
|
-
|
|
6149
|
+
manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status);
|
|
6150
|
+
s = manifest_tailer_->status();
|
|
6157
6151
|
if (s.ok()) {
|
|
6158
|
-
|
|
6159
|
-
auto builder_iter = active_version_builders_.find(cfd->GetID());
|
|
6160
|
-
if (builder_iter == active_version_builders_.end()) {
|
|
6161
|
-
continue;
|
|
6162
|
-
}
|
|
6163
|
-
auto builder = builder_iter->second->version_builder();
|
|
6164
|
-
if (!builder->CheckConsistencyForNumLevels()) {
|
|
6165
|
-
s = Status::InvalidArgument(
|
|
6166
|
-
"db has more levels than options.num_levels");
|
|
6167
|
-
break;
|
|
6168
|
-
}
|
|
6169
|
-
}
|
|
6152
|
+
*cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
|
|
6170
6153
|
}
|
|
6171
|
-
|
|
6172
|
-
&applied_edits);
|
|
6154
|
+
|
|
6173
6155
|
return s;
|
|
6174
6156
|
}
|
|
6175
6157
|
|
|
6176
|
-
Status ReactiveVersionSet::
|
|
6177
|
-
|
|
6178
|
-
|
|
6179
|
-
|
|
6180
|
-
|
|
6181
|
-
|
|
6182
|
-
|
|
6183
|
-
|
|
6184
|
-
|
|
6185
|
-
|
|
6186
|
-
if (nullptr == cfd) {
|
|
6187
|
-
return Status::OK();
|
|
6188
|
-
}
|
|
6189
|
-
if (active_version_builders_.find(edit.column_family_) ==
|
|
6190
|
-
active_version_builders_.end() &&
|
|
6191
|
-
!cfd->IsDropped()) {
|
|
6192
|
-
std::unique_ptr<BaseReferencedVersionBuilder> builder_guard(
|
|
6193
|
-
new BaseReferencedVersionBuilder(cfd));
|
|
6194
|
-
active_version_builders_.insert(
|
|
6195
|
-
std::make_pair(edit.column_family_, std::move(builder_guard)));
|
|
6158
|
+
Status ReactiveVersionSet::MaybeSwitchManifest(
|
|
6159
|
+
log::Reader::Reporter* reporter,
|
|
6160
|
+
std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
|
|
6161
|
+
assert(manifest_reader != nullptr);
|
|
6162
|
+
Status s;
|
|
6163
|
+
std::string manifest_path;
|
|
6164
|
+
s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
|
|
6165
|
+
&manifest_file_number_);
|
|
6166
|
+
if (!s.ok()) {
|
|
6167
|
+
return s;
|
|
6196
6168
|
}
|
|
6197
|
-
|
|
6198
|
-
|
|
6199
|
-
|
|
6200
|
-
|
|
6201
|
-
assert(builder != nullptr);
|
|
6202
|
-
|
|
6203
|
-
if (edit.is_column_family_add_) {
|
|
6204
|
-
// TODO (yanqin) for now the secondary ignores column families created
|
|
6205
|
-
// after Open. This also simplifies handling of switching to a new MANIFEST
|
|
6206
|
-
// and processing the snapshot of the system at the beginning of the
|
|
6169
|
+
std::unique_ptr<FSSequentialFile> manifest_file;
|
|
6170
|
+
if (manifest_reader->get() != nullptr &&
|
|
6171
|
+
manifest_reader->get()->file()->file_name() == manifest_path) {
|
|
6172
|
+
// CURRENT points to the same MANIFEST as before, no need to switch
|
|
6207
6173
|
// MANIFEST.
|
|
6208
|
-
} else if (edit.is_column_family_drop_) {
|
|
6209
|
-
// Drop the column family by setting it to be 'dropped' without destroying
|
|
6210
|
-
// the column family handle.
|
|
6211
|
-
// TODO (haoyu) figure out how to handle column faimly drop for
|
|
6212
|
-
// secondary instance. (Is it possible that the ref count for cfd is 0 but
|
|
6213
|
-
// the ref count for its versions is higher than 0?)
|
|
6214
|
-
cfd->SetDropped();
|
|
6215
|
-
if (cfd->UnrefAndTryDelete()) {
|
|
6216
|
-
cfd = nullptr;
|
|
6217
|
-
}
|
|
6218
|
-
active_version_builders_.erase(builder_iter);
|
|
6219
|
-
} else {
|
|
6220
|
-
Status s = builder->Apply(&edit);
|
|
6221
|
-
if (!s.ok()) {
|
|
6222
|
-
return s;
|
|
6223
|
-
}
|
|
6224
|
-
}
|
|
6225
|
-
Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit);
|
|
6226
|
-
if (!s.ok()) {
|
|
6227
6174
|
return s;
|
|
6228
6175
|
}
|
|
6229
|
-
|
|
6230
|
-
|
|
6231
|
-
|
|
6232
|
-
|
|
6233
|
-
|
|
6234
|
-
|
|
6235
|
-
|
|
6236
|
-
|
|
6237
|
-
|
|
6238
|
-
"ReactiveVersionSet::ApplyOneVersionEditToBuilder:"
|
|
6239
|
-
"AfterLoadTableHandlers",
|
|
6240
|
-
&s);
|
|
6241
|
-
|
|
6242
|
-
if (s.ok()) {
|
|
6243
|
-
auto version = new Version(cfd, this, file_options_,
|
|
6244
|
-
*cfd->GetLatestMutableCFOptions(), io_tracer_,
|
|
6245
|
-
current_version_number_++);
|
|
6246
|
-
s = builder->SaveTo(version->storage_info());
|
|
6247
|
-
if (s.ok()) {
|
|
6248
|
-
version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true);
|
|
6249
|
-
AppendVersion(cfd, version);
|
|
6250
|
-
active_version_builders_.erase(builder_iter);
|
|
6251
|
-
if (cfds_changed->count(cfd) == 0) {
|
|
6252
|
-
cfds_changed->insert(cfd);
|
|
6253
|
-
}
|
|
6254
|
-
} else {
|
|
6255
|
-
delete version;
|
|
6256
|
-
}
|
|
6257
|
-
} else if (s.IsPathNotFound()) {
|
|
6258
|
-
s = Status::OK();
|
|
6259
|
-
}
|
|
6260
|
-
// Some other error has occurred during LoadTableHandlers.
|
|
6176
|
+
assert(nullptr == manifest_reader->get() ||
|
|
6177
|
+
manifest_reader->get()->file()->file_name() != manifest_path);
|
|
6178
|
+
s = fs_->FileExists(manifest_path, IOOptions(), nullptr);
|
|
6179
|
+
if (s.IsNotFound()) {
|
|
6180
|
+
return Status::TryAgain(
|
|
6181
|
+
"The primary may have switched to a new MANIFEST and deleted the old "
|
|
6182
|
+
"one.");
|
|
6183
|
+
} else if (!s.ok()) {
|
|
6184
|
+
return s;
|
|
6261
6185
|
}
|
|
6262
|
-
|
|
6186
|
+
TEST_SYNC_POINT(
|
|
6187
|
+
"ReactiveVersionSet::MaybeSwitchManifest:"
|
|
6188
|
+
"AfterGetCurrentManifestPath:0");
|
|
6189
|
+
TEST_SYNC_POINT(
|
|
6190
|
+
"ReactiveVersionSet::MaybeSwitchManifest:"
|
|
6191
|
+
"AfterGetCurrentManifestPath:1");
|
|
6192
|
+
// The primary can also delete the MANIFEST while the secondary is reading
|
|
6193
|
+
// it. This is OK on POSIX. For other file systems, maybe create a hard link
|
|
6194
|
+
// to MANIFEST. The hard link should be cleaned up later by the secondary.
|
|
6195
|
+
s = fs_->NewSequentialFile(manifest_path,
|
|
6196
|
+
fs_->OptimizeForManifestRead(file_options_),
|
|
6197
|
+
&manifest_file, nullptr);
|
|
6198
|
+
std::unique_ptr<SequentialFileReader> manifest_file_reader;
|
|
6263
6199
|
if (s.ok()) {
|
|
6264
|
-
|
|
6265
|
-
|
|
6266
|
-
|
|
6267
|
-
|
|
6268
|
-
|
|
6269
|
-
|
|
6270
|
-
|
|
6271
|
-
|
|
6272
|
-
if (
|
|
6273
|
-
|
|
6274
|
-
|
|
6275
|
-
|
|
6276
|
-
if
|
|
6277
|
-
|
|
6278
|
-
|
|
6279
|
-
|
|
6280
|
-
|
|
6200
|
+
manifest_file_reader.reset(new SequentialFileReader(
|
|
6201
|
+
std::move(manifest_file), manifest_path,
|
|
6202
|
+
db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
|
|
6203
|
+
manifest_reader->reset(new log::FragmentBufferedReader(
|
|
6204
|
+
nullptr, std::move(manifest_file_reader), reporter, true /* checksum */,
|
|
6205
|
+
0 /* log_number */));
|
|
6206
|
+
ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
|
|
6207
|
+
manifest_path.c_str());
|
|
6208
|
+
if (manifest_tailer_) {
|
|
6209
|
+
manifest_tailer_->PrepareToReadNewManifest();
|
|
6210
|
+
}
|
|
6211
|
+
} else if (s.IsPathNotFound()) {
|
|
6212
|
+
// This can happen if the primary switches to a new MANIFEST after the
|
|
6213
|
+
// secondary reads the CURRENT file but before the secondary actually tries
|
|
6214
|
+
// to open the MANIFEST.
|
|
6215
|
+
s = Status::TryAgain(
|
|
6216
|
+
"The primary may have switched to a new MANIFEST and deleted the old "
|
|
6217
|
+
"one.");
|
|
6281
6218
|
}
|
|
6282
6219
|
return s;
|
|
6283
6220
|
}
|
|
6284
6221
|
|
|
6285
|
-
|
|
6286
|
-
|
|
6287
|
-
|
|
6288
|
-
|
|
6289
|
-
|
|
6290
|
-
|
|
6291
|
-
|
|
6292
|
-
|
|
6293
|
-
|
|
6294
|
-
|
|
6295
|
-
if (s.ok()) {
|
|
6296
|
-
if (nullptr == manifest_reader->get() ||
|
|
6297
|
-
manifest_reader->get()->file()->file_name() != manifest_path) {
|
|
6298
|
-
TEST_SYNC_POINT(
|
|
6299
|
-
"ReactiveVersionSet::MaybeSwitchManifest:"
|
|
6300
|
-
"AfterGetCurrentManifestPath:0");
|
|
6301
|
-
TEST_SYNC_POINT(
|
|
6302
|
-
"ReactiveVersionSet::MaybeSwitchManifest:"
|
|
6303
|
-
"AfterGetCurrentManifestPath:1");
|
|
6304
|
-
s = fs_->NewSequentialFile(manifest_path,
|
|
6305
|
-
env_->OptimizeForManifestRead(file_options_),
|
|
6306
|
-
&manifest_file, nullptr);
|
|
6307
|
-
} else {
|
|
6308
|
-
// No need to switch manifest.
|
|
6309
|
-
break;
|
|
6310
|
-
}
|
|
6311
|
-
}
|
|
6312
|
-
std::unique_ptr<SequentialFileReader> manifest_file_reader;
|
|
6313
|
-
if (s.ok()) {
|
|
6314
|
-
manifest_file_reader.reset(new SequentialFileReader(
|
|
6315
|
-
std::move(manifest_file), manifest_path,
|
|
6316
|
-
db_options_->log_readahead_size, io_tracer_));
|
|
6317
|
-
manifest_reader->reset(new log::FragmentBufferedReader(
|
|
6318
|
-
nullptr, std::move(manifest_file_reader), reporter,
|
|
6319
|
-
true /* checksum */, 0 /* log_number */));
|
|
6320
|
-
ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
|
|
6321
|
-
manifest_path.c_str());
|
|
6322
|
-
// TODO (yanqin) every time we switch to a new MANIFEST, we clear the
|
|
6323
|
-
// active_version_builders_ map because we choose to construct the
|
|
6324
|
-
// versions from scratch, thanks to the first part of each MANIFEST
|
|
6325
|
-
// written by VersionSet::WriteCurrentStatetoManifest. This is not
|
|
6326
|
-
// necessary, but we choose this at present for the sake of simplicity.
|
|
6327
|
-
active_version_builders_.clear();
|
|
6328
|
-
}
|
|
6329
|
-
} while (s.IsPathNotFound());
|
|
6330
|
-
return s;
|
|
6222
|
+
#ifndef NDEBUG
|
|
6223
|
+
uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const {
|
|
6224
|
+
assert(manifest_tailer_);
|
|
6225
|
+
return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group();
|
|
6226
|
+
}
|
|
6227
|
+
#endif // !NDEBUG
|
|
6228
|
+
|
|
6229
|
+
std::vector<VersionEdit>& ReactiveVersionSet::replay_buffer() {
|
|
6230
|
+
assert(manifest_tailer_);
|
|
6231
|
+
return manifest_tailer_->GetReadBuffer().replay_buffer();
|
|
6331
6232
|
}
|
|
6332
6233
|
|
|
6333
6234
|
} // namespace ROCKSDB_NAMESPACE
|