@nxtedition/rocksdb 13.5.7 → 13.5.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +248 -70
- package/binding.gyp +2 -2
- package/deps/rocksdb/rocksdb/BUCK +12 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -0
- package/deps/rocksdb/rocksdb/Makefile +28 -23
- package/deps/rocksdb/rocksdb/cache/cache.cc +0 -1
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +1 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +43 -39
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +0 -1
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +2 -3
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
- package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +1 -3
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +11 -1
- package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +13 -5
- package/deps/rocksdb/rocksdb/crash_test.mk +61 -15
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +136 -45
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +34 -16
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +10 -7
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +1 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +12 -9
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +2 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/builder.cc +22 -8
- package/deps/rocksdb/rocksdb/db/builder.h +5 -4
- package/deps/rocksdb/rocksdb/db/c.cc +556 -15
- package/deps/rocksdb/rocksdb/db/c_test.c +133 -12
- package/deps/rocksdb/rocksdb/db/column_family.cc +114 -50
- package/deps/rocksdb/rocksdb/db/column_family.h +53 -36
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +95 -70
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +71 -51
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -86
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +26 -68
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +0 -122
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +453 -258
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +117 -92
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +38 -38
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +24 -17
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +34 -45
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -31
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +10 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +82 -34
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +267 -179
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +4 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +273 -89
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +300 -14
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +4 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +28 -23
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +69 -51
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +522 -245
- package/deps/rocksdb/rocksdb/db/convenience.cc +15 -4
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +1 -3
- package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +196 -17
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +74 -62
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +48 -0
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +682 -250
- package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +11 -16
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +57 -0
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +540 -490
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +347 -188
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +584 -217
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +13 -9
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +5 -7
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +40 -36
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +751 -372
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +35 -32
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +24 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +125 -63
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +2 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +311 -196
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +15 -5
- package/deps/rocksdb/rocksdb/db/db_iter.cc +42 -29
- package/deps/rocksdb/rocksdb/db/db_iter.h +96 -31
- package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/db_iter_test.cc +168 -228
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +454 -0
- package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +8 -8
- package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +90 -0
- package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +60 -2
- package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +7 -3
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +85 -27
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +3 -1
- package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +114 -2
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +51 -3
- package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_test.cc +325 -18
- package/deps/rocksdb/rocksdb/db/db_test2.cc +644 -20
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
- package/deps/rocksdb/rocksdb/db/db_test_util.h +9 -0
- package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +64 -45
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +203 -14
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +259 -30
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +75 -1
- package/deps/rocksdb/rocksdb/db/dbformat.h +70 -6
- package/deps/rocksdb/rocksdb/db/deletefile_test.cc +0 -190
- package/deps/rocksdb/rocksdb/db/error_handler.cc +22 -7
- package/deps/rocksdb/rocksdb/db/error_handler.h +16 -1
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +41 -26
- package/deps/rocksdb/rocksdb/db/experimental.cc +4 -3
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +464 -78
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +166 -69
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +54 -25
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +1 -3
- package/deps/rocksdb/rocksdb/db/flush_job.cc +98 -81
- package/deps/rocksdb/rocksdb/db/flush_job.h +4 -9
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +80 -84
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +1 -1
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +2 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +12 -19
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +41 -15
- package/deps/rocksdb/rocksdb/db/internal_stats.h +63 -52
- package/deps/rocksdb/rocksdb/db/job_context.h +59 -24
- package/deps/rocksdb/rocksdb/db/listener_test.cc +69 -10
- package/deps/rocksdb/rocksdb/db/log_format.h +11 -2
- package/deps/rocksdb/rocksdb/db/log_reader.cc +147 -34
- package/deps/rocksdb/rocksdb/db/log_reader.h +40 -11
- package/deps/rocksdb/rocksdb/db/log_test.cc +16 -3
- package/deps/rocksdb/rocksdb/db/log_writer.cc +102 -55
- package/deps/rocksdb/rocksdb/db/log_writer.h +21 -2
- package/deps/rocksdb/rocksdb/db/malloc_stats.h +0 -2
- package/deps/rocksdb/rocksdb/db/memtable.cc +16 -47
- package/deps/rocksdb/rocksdb/db/memtable.h +76 -12
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +23 -20
- package/deps/rocksdb/rocksdb/db/memtable_list.h +9 -11
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +18 -37
- package/deps/rocksdb/rocksdb/db/merge_context.h +2 -1
- package/deps/rocksdb/rocksdb/db/merge_test.cc +8 -0
- package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +3 -5
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +15 -7
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +6 -3
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +22 -4
- package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +41 -1
- package/deps/rocksdb/rocksdb/db/prefix_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/repair.cc +29 -34
- package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +14 -15
- package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +1 -3
- package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +47 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +1 -3
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +2 -1
- package/deps/rocksdb/rocksdb/db/version_builder.cc +2 -2
- package/deps/rocksdb/rocksdb/db/version_edit.cc +8 -37
- package/deps/rocksdb/rocksdb/db/version_edit.h +32 -1
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +26 -18
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -5
- package/deps/rocksdb/rocksdb/db/version_set.cc +282 -197
- package/deps/rocksdb/rocksdb/db/version_set.h +54 -57
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +28 -35
- package/deps/rocksdb/rocksdb/db/version_util.h +2 -3
- package/deps/rocksdb/rocksdb/db/wal_manager.cc +3 -2
- package/deps/rocksdb/rocksdb/db/wal_manager.h +0 -1
- package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/wide/wide_columns.cc +1 -0
- package/deps/rocksdb/rocksdb/db/write_batch.cc +22 -8
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +5 -4
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +7 -6
- package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/write_thread.h +3 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +13 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +9 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +39 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +65 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +45 -22
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +7 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +22 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +28 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -38
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +80 -32
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +51 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +23 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +305 -15
- package/deps/rocksdb/rocksdb/env/env.cc +32 -2
- package/deps/rocksdb/rocksdb/env/env_encryption.cc +0 -2
- package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +2 -4
- package/deps/rocksdb/rocksdb/env/env_posix.cc +4 -2
- package/deps/rocksdb/rocksdb/env/env_test.cc +0 -1
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +20 -11
- package/deps/rocksdb/rocksdb/env/fs_readonly.h +0 -2
- package/deps/rocksdb/rocksdb/env/fs_remap.cc +0 -2
- package/deps/rocksdb/rocksdb/env/fs_remap.h +0 -2
- package/deps/rocksdb/rocksdb/env/io_posix.cc +6 -4
- package/deps/rocksdb/rocksdb/env/io_posix.h +3 -2
- package/deps/rocksdb/rocksdb/env/mock_env.cc +0 -1
- package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +2 -2
- package/deps/rocksdb/rocksdb/file/delete_scheduler.h +0 -2
- package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +0 -2
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +30 -21
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +16 -0
- package/deps/rocksdb/rocksdb/file/file_util.cc +32 -14
- package/deps/rocksdb/rocksdb/file/file_util.h +22 -5
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +229 -76
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +21 -12
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +10 -7
- package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +12 -8
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +1 -2
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +0 -2
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +598 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_iterator.h +36 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +70 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +232 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +149 -15
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +17 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +132 -34
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +158 -79
- package/deps/rocksdb/rocksdb/include/rocksdb/db_bench_tool.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +4 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +1 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +275 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +50 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +10 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +5 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +237 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +230 -39
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +15 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_level.h +31 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +41 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +5 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +19 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +124 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +26 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +55 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +3 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_mirror.h +0 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +1 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +96 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index_faiss.h +117 -0
- package/deps/rocksdb/rocksdb/{utilities/secondary_index/faiss_ivf_index.h → include/rocksdb/utilities/secondary_index_simple.h} +11 -14
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +26 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +16 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +0 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +63 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +28 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +3 -3
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +0 -2
- package/deps/rocksdb/rocksdb/logging/event_logger_test.cc +1 -2
- package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +1 -1
- package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +0 -1
- package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +0 -1
- package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +3 -1
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +2 -2
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +2 -4
- package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +69 -8
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +32 -9
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +58 -45
- package/deps/rocksdb/rocksdb/monitoring/histogram.h +1 -1
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -3
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +5 -0
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +1 -1
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +3 -2
- package/deps/rocksdb/rocksdb/options/cf_options.cc +44 -13
- package/deps/rocksdb/rocksdb/options/cf_options.h +21 -7
- package/deps/rocksdb/rocksdb/options/configurable.cc +5 -5
- package/deps/rocksdb/rocksdb/options/configurable_test.h +1 -2
- package/deps/rocksdb/rocksdb/options/customizable.cc +0 -1
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -11
- package/deps/rocksdb/rocksdb/options/db_options.cc +18 -15
- package/deps/rocksdb/rocksdb/options/db_options.h +2 -2
- package/deps/rocksdb/rocksdb/options/options.cc +296 -305
- package/deps/rocksdb/rocksdb/options/options_helper.cc +188 -62
- package/deps/rocksdb/rocksdb/options/options_helper.h +3 -3
- package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -4
- package/deps/rocksdb/rocksdb/options/options_parser.h +0 -1
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +17 -4
- package/deps/rocksdb/rocksdb/options/options_test.cc +101 -76
- package/deps/rocksdb/rocksdb/port/lang.h +2 -1
- package/deps/rocksdb/rocksdb/port/port_posix.cc +2 -1
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +5 -4
- package/deps/rocksdb/rocksdb/port/win/env_win.cc +3 -2
- package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +99 -1
- package/deps/rocksdb/rocksdb/port/win/xpress_win.h +6 -0
- package/deps/rocksdb/rocksdb/src.mk +17 -11
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +0 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1094 -929
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +6 -19
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +76 -22
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +221 -131
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +12 -9
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +23 -24
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -38
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +7 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +5 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +10 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +6 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +35 -43
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +0 -4
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +0 -1
- package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +4 -5
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +4 -4
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +37 -35
- package/deps/rocksdb/rocksdb/table/block_fetcher.h +11 -7
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +4 -3
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +31 -5
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +2 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +0 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +0 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +0 -1
- package/deps/rocksdb/rocksdb/table/external_table.cc +483 -0
- package/deps/rocksdb/rocksdb/table/format.cc +62 -44
- package/deps/rocksdb/rocksdb/table/format.h +35 -12
- package/deps/rocksdb/rocksdb/table/internal_iterator.h +3 -13
- package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +6 -0
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +150 -141
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +5 -0
- package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +8 -0
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +0 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +0 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +0 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +6 -6
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +86 -7
- package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +88 -2
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +0 -1
- package/deps/rocksdb/rocksdb/table/table_builder.h +10 -1
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +3 -2
- package/deps/rocksdb/rocksdb/table/table_test.cc +899 -22
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +3 -4
- package/deps/rocksdb/rocksdb/test_util/testutil.h +132 -1
- package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +0 -1
- package/deps/rocksdb/rocksdb/test_util/transaction_test_util.h +0 -2
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +163 -77
- package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +0 -2
- package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +120 -52
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +1 -0
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +0 -2
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +0 -2
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +2 -1
- package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +94 -0
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +0 -1
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +1 -1
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +2 -1
- package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +3 -5
- package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/util/async_file_reader.h +15 -8
- package/deps/rocksdb/rocksdb/util/auto_skip_compressor.cc +131 -0
- package/deps/rocksdb/rocksdb/util/auto_skip_compressor.h +90 -0
- package/deps/rocksdb/rocksdb/util/autovector.h +1 -1
- package/deps/rocksdb/rocksdb/util/autovector_test.cc +2 -2
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +0 -2
- package/deps/rocksdb/rocksdb/util/compression.cc +936 -4
- package/deps/rocksdb/rocksdb/util/compression.h +348 -232
- package/deps/rocksdb/rocksdb/util/compression_test.cc +229 -0
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +10 -10
- package/deps/rocksdb/rocksdb/util/crc32c_ppc.c +1 -0
- package/deps/rocksdb/rocksdb/util/data_structure.cc +2 -0
- package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +1 -3
- package/deps/rocksdb/rocksdb/util/ppc-opcode.h +5 -5
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +108 -0
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +67 -0
- package/deps/rocksdb/rocksdb/util/slice_test.cc +83 -0
- package/deps/rocksdb/rocksdb/util/string_util.cc +0 -2
- package/deps/rocksdb/rocksdb/util/string_util.h +10 -0
- package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -1
- package/deps/rocksdb/rocksdb/util/udt_util.cc +18 -5
- package/deps/rocksdb/rocksdb/util/udt_util.h +10 -7
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +650 -154
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +438 -144
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +16 -17
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +2 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +7 -8
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +4 -3
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +0 -48
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/debug.cc +7 -14
- package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/env_mirror_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/env_timed.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/env_timed_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +5 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +10 -9
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +1 -0
- package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/lrulist.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +183 -32
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +258 -12
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_helper.h +33 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_iterator.cc +99 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +280 -120
- package/deps/rocksdb/rocksdb/utilities/secondary_index/simple_secondary_index.cc +79 -0
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +52 -16
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h +10 -6
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +55 -0
- package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +37 -12
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +2 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +1 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +36 -10
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +4 -5
- package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +1 -4
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1118 -37
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +4 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +0 -3
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +125 -127
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +45 -23
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +54 -22
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +477 -58
- package/deps/rocksdb/rocksdb.gyp +9 -4
- package/index.js +50 -9
- package/package.json +8 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
|
@@ -35,8 +35,8 @@ Options SanitizeOptions(const std::string& dbname, const Options& src,
|
|
|
35
35
|
auto db_options =
|
|
36
36
|
SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
|
|
37
37
|
ImmutableDBOptions immutable_db_options(db_options);
|
|
38
|
-
auto cf_options =
|
|
39
|
-
|
|
38
|
+
auto cf_options = SanitizeCfOptions(immutable_db_options, read_only,
|
|
39
|
+
ColumnFamilyOptions(src));
|
|
40
40
|
return Options(db_options, cf_options);
|
|
41
41
|
}
|
|
42
42
|
|
|
@@ -224,6 +224,12 @@ Status DBImpl::ValidateOptions(
|
|
|
224
224
|
if (!s.ok()) {
|
|
225
225
|
return s;
|
|
226
226
|
}
|
|
227
|
+
if (cfd.name == kDefaultColumnFamilyName) {
|
|
228
|
+
if (cfd.options.disallow_memtable_writes) {
|
|
229
|
+
return Status::InvalidArgument(
|
|
230
|
+
"Default column family cannot use disallow_memtable_writes=true");
|
|
231
|
+
}
|
|
232
|
+
}
|
|
227
233
|
}
|
|
228
234
|
s = ValidateOptions(db_options);
|
|
229
235
|
return s;
|
|
@@ -575,7 +581,7 @@ Status DBImpl::Recover(
|
|
|
575
581
|
}
|
|
576
582
|
if (s.ok() && !read_only) {
|
|
577
583
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
578
|
-
auto& moptions =
|
|
584
|
+
const auto& moptions = cfd->GetLatestMutableCFOptions();
|
|
579
585
|
// Try to trivially move files down the LSM tree to start from bottommost
|
|
580
586
|
// level when level_compaction_dynamic_level_bytes is enabled. This should
|
|
581
587
|
// only be useful when user is migrating to turning on this option.
|
|
@@ -590,16 +596,16 @@ Status DBImpl::Recover(
|
|
|
590
596
|
// the user wants to partition SST files.
|
|
591
597
|
// Note that files moved in this step may not respect the compression
|
|
592
598
|
// option in target level.
|
|
593
|
-
if (cfd->ioptions()
|
|
599
|
+
if (cfd->ioptions().compaction_style ==
|
|
594
600
|
CompactionStyle::kCompactionStyleLevel &&
|
|
595
|
-
cfd->ioptions()
|
|
601
|
+
cfd->ioptions().level_compaction_dynamic_level_bytes &&
|
|
596
602
|
!moptions.disable_auto_compactions) {
|
|
597
|
-
int to_level = cfd->ioptions()
|
|
603
|
+
int to_level = cfd->ioptions().num_levels - 1;
|
|
598
604
|
// last level is reserved
|
|
599
605
|
// allow_ingest_behind does not support Level Compaction,
|
|
600
606
|
// and per_key_placement can have infinite compaction loop for Level
|
|
601
607
|
// Compaction. Adjust to_level here just to be safe.
|
|
602
|
-
if (cfd->ioptions()
|
|
608
|
+
if (cfd->ioptions().allow_ingest_behind ||
|
|
603
609
|
moptions.preclude_last_level_data_seconds > 0) {
|
|
604
610
|
to_level -= 1;
|
|
605
611
|
}
|
|
@@ -622,10 +628,10 @@ Status DBImpl::Recover(
|
|
|
622
628
|
// lsm_state will look like "[1,2,3,4,5,6,0]" for an LSM with
|
|
623
629
|
// 7 levels
|
|
624
630
|
std::string lsm_state = "[";
|
|
625
|
-
for (int i = 0; i < cfd->ioptions()
|
|
631
|
+
for (int i = 0; i < cfd->ioptions().num_levels; ++i) {
|
|
626
632
|
lsm_state += std::to_string(
|
|
627
633
|
cfd->current()->storage_info()->NumLevelFiles(i));
|
|
628
|
-
if (i < cfd->ioptions()
|
|
634
|
+
if (i < cfd->ioptions().num_levels - 1) {
|
|
629
635
|
lsm_state += ",";
|
|
630
636
|
}
|
|
631
637
|
}
|
|
@@ -708,9 +714,9 @@ Status DBImpl::Recover(
|
|
|
708
714
|
// may check this value to decide whether to flush.
|
|
709
715
|
max_total_in_memory_state_ = 0;
|
|
710
716
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
711
|
-
auto
|
|
712
|
-
max_total_in_memory_state_ += mutable_cf_options
|
|
713
|
-
mutable_cf_options
|
|
717
|
+
const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
|
|
718
|
+
max_total_in_memory_state_ += mutable_cf_options.write_buffer_size *
|
|
719
|
+
mutable_cf_options.max_write_buffer_number;
|
|
714
720
|
}
|
|
715
721
|
|
|
716
722
|
SequenceNumber next_sequence(kMaxSequenceNumber);
|
|
@@ -754,6 +760,11 @@ Status DBImpl::Recover(
|
|
|
754
760
|
}
|
|
755
761
|
}
|
|
756
762
|
|
|
763
|
+
if (immutable_db_options_.track_and_verify_wals && !is_new_db &&
|
|
764
|
+
!immutable_db_options_.best_efforts_recovery && wal_files.empty()) {
|
|
765
|
+
return Status::Corruption("Opening an existing DB with no WAL files");
|
|
766
|
+
}
|
|
767
|
+
|
|
757
768
|
if (immutable_db_options_.track_and_verify_wals_in_manifest) {
|
|
758
769
|
if (!immutable_db_options_.best_efforts_recovery) {
|
|
759
770
|
// Verify WALs in MANIFEST.
|
|
@@ -816,8 +827,7 @@ Status DBImpl::Recover(
|
|
|
816
827
|
if (!s.ok()) {
|
|
817
828
|
// Clear memtables if recovery failed
|
|
818
829
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
819
|
-
cfd->CreateNewMemtable(
|
|
820
|
-
kMaxSequenceNumber);
|
|
830
|
+
cfd->CreateNewMemtable(kMaxSequenceNumber);
|
|
821
831
|
}
|
|
822
832
|
}
|
|
823
833
|
}
|
|
@@ -983,8 +993,7 @@ Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
|
|
|
983
993
|
const ReadOptions read_options(Env::IOActivity::kDBOpen);
|
|
984
994
|
const WriteOptions write_options(Env::IOActivity::kDBOpen);
|
|
985
995
|
|
|
986
|
-
Status s = versions_->LogAndApply(recovery_ctx.cfds_,
|
|
987
|
-
recovery_ctx.mutable_cf_opts_, read_options,
|
|
996
|
+
Status s = versions_->LogAndApply(recovery_ctx.cfds_, read_options,
|
|
988
997
|
write_options, recovery_ctx.edit_lists_,
|
|
989
998
|
&mutex_, directories_.GetDbDir());
|
|
990
999
|
return s;
|
|
@@ -1103,50 +1112,64 @@ bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
|
|
|
1103
1112
|
return true;
|
|
1104
1113
|
}
|
|
1105
1114
|
|
|
1115
|
+
void DBOpenLogRecordReadReporter::Corruption(size_t bytes, const Status& s,
|
|
1116
|
+
uint64_t log_number) {
|
|
1117
|
+
ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
|
|
1118
|
+
(status == nullptr ? "(ignoring error) " : ""), fname,
|
|
1119
|
+
static_cast<int>(bytes), s.ToString().c_str());
|
|
1120
|
+
if (status != nullptr && status->ok()) {
|
|
1121
|
+
*status = s;
|
|
1122
|
+
corrupted_wal_number_ = log_number;
|
|
1123
|
+
}
|
|
1124
|
+
}
|
|
1125
|
+
|
|
1126
|
+
void DBOpenLogRecordReadReporter::OldLogRecord(size_t bytes) {
|
|
1127
|
+
if (old_log_record != nullptr) {
|
|
1128
|
+
*old_log_record = true;
|
|
1129
|
+
}
|
|
1130
|
+
ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes; possibly recycled", fname,
|
|
1131
|
+
static_cast<int>(bytes));
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1106
1134
|
// REQUIRES: wal_numbers are sorted in ascending order
|
|
1107
1135
|
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
1108
1136
|
SequenceNumber* next_sequence, bool read_only,
|
|
1109
1137
|
bool is_retry, bool* corrupted_wal_found,
|
|
1110
1138
|
RecoveryContext* recovery_ctx) {
|
|
1111
|
-
struct LogReporter : public log::Reader::Reporter {
|
|
1112
|
-
Env* env;
|
|
1113
|
-
Logger* info_log;
|
|
1114
|
-
const char* fname;
|
|
1115
|
-
Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
|
|
1116
|
-
bool* old_log_record;
|
|
1117
|
-
void Corruption(size_t bytes, const Status& s) override {
|
|
1118
|
-
ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
|
|
1119
|
-
(status == nullptr ? "(ignoring error) " : ""), fname,
|
|
1120
|
-
static_cast<int>(bytes), s.ToString().c_str());
|
|
1121
|
-
if (status != nullptr && status->ok()) {
|
|
1122
|
-
*status = s;
|
|
1123
|
-
}
|
|
1124
|
-
}
|
|
1125
|
-
|
|
1126
|
-
void OldLogRecord(size_t bytes) override {
|
|
1127
|
-
if (old_log_record != nullptr) {
|
|
1128
|
-
*old_log_record = true;
|
|
1129
|
-
}
|
|
1130
|
-
ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes; possibly recycled",
|
|
1131
|
-
fname, static_cast<int>(bytes));
|
|
1132
|
-
}
|
|
1133
|
-
};
|
|
1134
|
-
|
|
1135
1139
|
mutex_.AssertHeld();
|
|
1136
|
-
|
|
1137
|
-
bool old_log_record = false;
|
|
1140
|
+
|
|
1138
1141
|
std::unordered_map<int, VersionEdit> version_edits;
|
|
1139
|
-
|
|
1142
|
+
int job_id = 0;
|
|
1143
|
+
uint64_t min_wal_number = 0;
|
|
1144
|
+
SetupLogFilesRecovery(wal_numbers, &version_edits, &job_id, &min_wal_number);
|
|
1145
|
+
|
|
1146
|
+
Status status = ProcessLogFiles(
|
|
1147
|
+
wal_numbers, read_only, is_retry, min_wal_number, job_id, next_sequence,
|
|
1148
|
+
&version_edits, corrupted_wal_found, recovery_ctx);
|
|
1149
|
+
|
|
1150
|
+
FinishLogFilesRecovery(job_id, status);
|
|
1151
|
+
return status;
|
|
1152
|
+
}
|
|
1153
|
+
|
|
1154
|
+
void DBImpl::SetupLogFilesRecovery(
|
|
1155
|
+
const std::vector<uint64_t>& wal_numbers,
|
|
1156
|
+
std::unordered_map<int, VersionEdit>* version_edits, int* job_id,
|
|
1157
|
+
uint64_t* min_wal_number) {
|
|
1158
|
+
assert(version_edits);
|
|
1159
|
+
assert(job_id);
|
|
1160
|
+
assert(min_wal_number);
|
|
1161
|
+
// No need to refcount because iteration is under mutex
|
|
1140
1162
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
1141
1163
|
VersionEdit edit;
|
|
1142
1164
|
edit.SetColumnFamily(cfd->GetID());
|
|
1143
|
-
version_edits
|
|
1165
|
+
version_edits->insert({cfd->GetID(), edit});
|
|
1144
1166
|
}
|
|
1145
|
-
|
|
1167
|
+
|
|
1168
|
+
*job_id = next_job_id_.fetch_add(1);
|
|
1146
1169
|
{
|
|
1147
1170
|
auto stream = event_logger_.Log();
|
|
1148
|
-
stream << "job" << job_id
|
|
1149
|
-
|
|
1171
|
+
stream << "job" << *job_id;
|
|
1172
|
+
stream << "event" << "recovery_started";
|
|
1150
1173
|
stream << "wal_files";
|
|
1151
1174
|
stream.StartArray();
|
|
1152
1175
|
for (auto wal_number : wal_numbers) {
|
|
@@ -1158,265 +1181,538 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1158
1181
|
// No-op for immutable_db_options_.wal_filter == nullptr.
|
|
1159
1182
|
InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
|
|
1160
1183
|
|
|
1184
|
+
*min_wal_number = MinLogNumberToKeep();
|
|
1185
|
+
if (!allow_2pc()) {
|
|
1186
|
+
// In non-2pc mode, we skip WALs that do not back unflushed data.
|
|
1187
|
+
*min_wal_number =
|
|
1188
|
+
std::max(*min_wal_number, versions_->MinLogNumberWithUnflushedData());
|
|
1189
|
+
}
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
Status DBImpl::ProcessLogFiles(
|
|
1193
|
+
const std::vector<uint64_t>& wal_numbers, bool read_only, bool is_retry,
|
|
1194
|
+
uint64_t min_wal_number, int job_id, SequenceNumber* next_sequence,
|
|
1195
|
+
std::unordered_map<int, VersionEdit>* version_edits,
|
|
1196
|
+
bool* corrupted_wal_found, RecoveryContext* recovery_ctx) {
|
|
1197
|
+
Status status;
|
|
1198
|
+
|
|
1161
1199
|
bool stop_replay_by_wal_filter = false;
|
|
1162
1200
|
bool stop_replay_for_corruption = false;
|
|
1163
1201
|
bool flushed = false;
|
|
1164
1202
|
uint64_t corrupted_wal_number = kMaxSequenceNumber;
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
// In non-2pc mode, we skip WALs that do not back unflushed data.
|
|
1168
|
-
min_wal_number =
|
|
1169
|
-
std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData());
|
|
1170
|
-
}
|
|
1203
|
+
PredecessorWALInfo predecessor_wal_info;
|
|
1204
|
+
|
|
1171
1205
|
for (auto wal_number : wal_numbers) {
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1206
|
+
// Detecting early break on the next iteration after `wal_number` has been
|
|
1207
|
+
// advanced since this `wal_number` doesn't affect follow-up handling after
|
|
1208
|
+
// breaking out of the for loop.
|
|
1209
|
+
if (!status.ok()) {
|
|
1210
|
+
break;
|
|
1211
|
+
}
|
|
1212
|
+
SequenceNumber prev_next_sequence = *next_sequence;
|
|
1213
|
+
if (status.ok()) {
|
|
1214
|
+
status = ProcessLogFile(
|
|
1215
|
+
wal_number, min_wal_number, is_retry, read_only, job_id,
|
|
1216
|
+
next_sequence, &stop_replay_for_corruption,
|
|
1217
|
+
&stop_replay_by_wal_filter, &corrupted_wal_number,
|
|
1218
|
+
corrupted_wal_found, version_edits, &flushed, predecessor_wal_info);
|
|
1219
|
+
}
|
|
1220
|
+
if (status.ok()) {
|
|
1221
|
+
status = CheckSeqnoNotSetBackDuringRecovery(prev_next_sequence,
|
|
1222
|
+
*next_sequence);
|
|
1223
|
+
}
|
|
1224
|
+
}
|
|
1225
|
+
|
|
1226
|
+
if (status.ok()) {
|
|
1227
|
+
status = MaybeHandleStopReplayForCorruptionForInconsistency(
|
|
1228
|
+
stop_replay_for_corruption, corrupted_wal_number);
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
if (status.ok()) {
|
|
1232
|
+
status = MaybeFlushFinalMemtableOrRestoreActiveLogFiles(
|
|
1233
|
+
wal_numbers, read_only, job_id, flushed, version_edits, recovery_ctx);
|
|
1234
|
+
}
|
|
1235
|
+
return status;
|
|
1236
|
+
}
|
|
1237
|
+
|
|
1238
|
+
Status DBImpl::ProcessLogFile(
|
|
1239
|
+
uint64_t wal_number, uint64_t min_wal_number, bool is_retry, bool read_only,
|
|
1240
|
+
int job_id, SequenceNumber* next_sequence, bool* stop_replay_for_corruption,
|
|
1241
|
+
bool* stop_replay_by_wal_filter, uint64_t* corrupted_wal_number,
|
|
1242
|
+
bool* corrupted_wal_found,
|
|
1243
|
+
std::unordered_map<int, VersionEdit>* version_edits, bool* flushed,
|
|
1244
|
+
PredecessorWALInfo& predecessor_wal_info) {
|
|
1245
|
+
assert(stop_replay_by_wal_filter);
|
|
1246
|
+
|
|
1247
|
+
// Variable initialization starts
|
|
1248
|
+
Status status;
|
|
1249
|
+
bool old_log_record = false;
|
|
1250
|
+
|
|
1251
|
+
DBOpenLogRecordReadReporter reporter;
|
|
1252
|
+
std::unique_ptr<log::Reader> reader;
|
|
1253
|
+
|
|
1254
|
+
std::string fname =
|
|
1255
|
+
LogFileName(immutable_db_options_.GetWalDir(), wal_number);
|
|
1256
|
+
|
|
1257
|
+
auto logFileDropped = [this, &fname]() {
|
|
1258
|
+
uint64_t bytes;
|
|
1259
|
+
if (env_->GetFileSize(fname, &bytes).ok()) {
|
|
1260
|
+
auto info_log = immutable_db_options_.info_log.get();
|
|
1261
|
+
ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
|
|
1262
|
+
static_cast<int>(bytes));
|
|
1178
1263
|
}
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1264
|
+
};
|
|
1265
|
+
|
|
1266
|
+
std::string scratch;
|
|
1267
|
+
Slice record;
|
|
1268
|
+
uint64_t record_checksum;
|
|
1269
|
+
const UnorderedMap<uint32_t, size_t>& running_ts_sz =
|
|
1270
|
+
versions_->GetRunningColumnFamiliesTimestampSize();
|
|
1186
1271
|
|
|
1272
|
+
// We need to track `last_seqno_observed` in addition to `next_sequence` since
|
|
1273
|
+
// `last_seqno_observed != *next_sequence` when there are multiple key-value
|
|
1274
|
+
// pairs in one WAL entry
|
|
1275
|
+
SequenceNumber last_seqno_observed = 0;
|
|
1276
|
+
// Variable initialization ends
|
|
1277
|
+
|
|
1278
|
+
if (wal_number < min_wal_number) {
|
|
1187
1279
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
1188
|
-
"
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1280
|
+
"Skipping log #%" PRIu64
|
|
1281
|
+
" since it is older than min log to keep #%" PRIu64,
|
|
1282
|
+
wal_number, min_wal_number);
|
|
1283
|
+
assert(status.ok());
|
|
1284
|
+
return status;
|
|
1285
|
+
}
|
|
1286
|
+
|
|
1287
|
+
SetupLogFileProcessing(wal_number);
|
|
1288
|
+
|
|
1289
|
+
if (*stop_replay_by_wal_filter) {
|
|
1290
|
+
logFileDropped();
|
|
1291
|
+
assert(status.ok());
|
|
1292
|
+
return status;
|
|
1293
|
+
}
|
|
1294
|
+
|
|
1295
|
+
Status init_status = InitializeLogReader(
|
|
1296
|
+
wal_number, is_retry, fname, *stop_replay_for_corruption, min_wal_number,
|
|
1297
|
+
predecessor_wal_info, &old_log_record, &status, &reporter, reader);
|
|
1298
|
+
|
|
1299
|
+
// FIXME(hx235): Consolidate `!init_status.ok()` and `reader == nullptr` cases
|
|
1300
|
+
if (!init_status.ok()) {
|
|
1301
|
+
assert(status.ok());
|
|
1302
|
+
status.PermitUncheckedError();
|
|
1303
|
+
return init_status;
|
|
1304
|
+
} else if (reader == nullptr) {
|
|
1305
|
+
// TODO(hx235): remove this case since it's confusing
|
|
1306
|
+
assert(status.ok());
|
|
1307
|
+
// Fail initializing log reader for one log file with an ok status.
|
|
1308
|
+
// Try next one.
|
|
1309
|
+
return status;
|
|
1310
|
+
}
|
|
1311
|
+
|
|
1312
|
+
TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
|
|
1313
|
+
/*cb_arg=*/nullptr);
|
|
1314
|
+
while (true) {
|
|
1315
|
+
if (*stop_replay_by_wal_filter) {
|
|
1316
|
+
break;
|
|
1201
1317
|
}
|
|
1202
1318
|
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
status = fs_->NewSequentialFile(
|
|
1207
|
-
fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
|
|
1208
|
-
if (!status.ok()) {
|
|
1209
|
-
MaybeIgnoreError(&status);
|
|
1210
|
-
if (!status.ok()) {
|
|
1211
|
-
return status;
|
|
1212
|
-
} else {
|
|
1213
|
-
// Fail with one log file, but that's ok.
|
|
1214
|
-
// Try next one.
|
|
1215
|
-
continue;
|
|
1216
|
-
}
|
|
1217
|
-
}
|
|
1218
|
-
file_reader.reset(new SequentialFileReader(
|
|
1219
|
-
std::move(file), fname, immutable_db_options_.log_readahead_size,
|
|
1220
|
-
io_tracer_, /*listeners=*/{}, /*rate_limiter=*/nullptr, is_retry));
|
|
1221
|
-
}
|
|
1222
|
-
|
|
1223
|
-
// Create the log reader.
|
|
1224
|
-
LogReporter reporter;
|
|
1225
|
-
reporter.env = env_;
|
|
1226
|
-
reporter.info_log = immutable_db_options_.info_log.get();
|
|
1227
|
-
reporter.fname = fname.c_str();
|
|
1228
|
-
reporter.old_log_record = &old_log_record;
|
|
1229
|
-
if (!immutable_db_options_.paranoid_checks ||
|
|
1230
|
-
immutable_db_options_.wal_recovery_mode ==
|
|
1231
|
-
WALRecoveryMode::kSkipAnyCorruptedRecords) {
|
|
1232
|
-
reporter.status = nullptr;
|
|
1233
|
-
} else {
|
|
1234
|
-
reporter.status = &status;
|
|
1235
|
-
}
|
|
1236
|
-
// We intentially make log::Reader do checksumming even if
|
|
1237
|
-
// paranoid_checks==false so that corruptions cause entire commits
|
|
1238
|
-
// to be skipped instead of propagating bad information (like overly
|
|
1239
|
-
// large sequence numbers).
|
|
1240
|
-
log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
|
|
1241
|
-
&reporter, true /*checksum*/, wal_number);
|
|
1242
|
-
|
|
1243
|
-
// Determine if we should tolerate incomplete records at the tail end of the
|
|
1244
|
-
// Read all the records and add to a memtable
|
|
1245
|
-
std::string scratch;
|
|
1246
|
-
Slice record;
|
|
1247
|
-
|
|
1248
|
-
const UnorderedMap<uint32_t, size_t>& running_ts_sz =
|
|
1249
|
-
versions_->GetRunningColumnFamiliesTimestampSize();
|
|
1250
|
-
|
|
1251
|
-
TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
|
|
1252
|
-
/*arg=*/nullptr);
|
|
1253
|
-
uint64_t record_checksum;
|
|
1254
|
-
while (!stop_replay_by_wal_filter &&
|
|
1255
|
-
reader.ReadRecord(&record, &scratch,
|
|
1256
|
-
immutable_db_options_.wal_recovery_mode,
|
|
1257
|
-
&record_checksum) &&
|
|
1258
|
-
status.ok()) {
|
|
1259
|
-
if (record.size() < WriteBatchInternal::kHeader) {
|
|
1260
|
-
reporter.Corruption(record.size(),
|
|
1261
|
-
Status::Corruption("log record too small"));
|
|
1262
|
-
continue;
|
|
1263
|
-
}
|
|
1264
|
-
// We create a new batch and initialize with a valid prot_info_ to store
|
|
1265
|
-
// the data checksums
|
|
1266
|
-
WriteBatch batch;
|
|
1267
|
-
std::unique_ptr<WriteBatch> new_batch;
|
|
1319
|
+
bool read_record = reader->ReadRecord(
|
|
1320
|
+
&record, &scratch, immutable_db_options_.wal_recovery_mode,
|
|
1321
|
+
&record_checksum);
|
|
1268
1322
|
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1323
|
+
// `reader->ReadRecord` will change `status` through reporter in `reader`
|
|
1324
|
+
// when a corruption is encountered
|
|
1325
|
+
// FIXME(hx235): consolidate `read_record` and `status`
|
|
1326
|
+
if (!read_record || !status.ok()) {
|
|
1327
|
+
break;
|
|
1328
|
+
}
|
|
1273
1329
|
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1330
|
+
// FIXME(hx235): consolidate `process_status` and `status`
|
|
1331
|
+
SequenceNumber prev_next_sequence = *next_sequence;
|
|
1332
|
+
Status process_status = ProcessLogRecord(
|
|
1333
|
+
record, reader, running_ts_sz, wal_number, fname, read_only, job_id,
|
|
1334
|
+
logFileDropped, &reporter, &record_checksum, &last_seqno_observed,
|
|
1335
|
+
next_sequence, stop_replay_for_corruption, &status,
|
|
1336
|
+
stop_replay_by_wal_filter, version_edits, flushed);
|
|
1337
|
+
|
|
1338
|
+
if (!process_status.ok()) {
|
|
1339
|
+
return process_status;
|
|
1340
|
+
} else if (Status seqno_check_status = CheckSeqnoNotSetBackDuringRecovery(
|
|
1341
|
+
prev_next_sequence, *next_sequence);
|
|
1342
|
+
!seqno_check_status.ok()) {
|
|
1343
|
+
// Sequence number being set back indicates a serious software bug, the DB
|
|
1344
|
+
// should not be opened in this case.
|
|
1345
|
+
return seqno_check_status;
|
|
1346
|
+
} else if (*stop_replay_for_corruption) {
|
|
1347
|
+
break;
|
|
1348
|
+
}
|
|
1349
|
+
}
|
|
1283
1350
|
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch",
|
|
1288
|
-
batch_to_use);
|
|
1289
|
-
TEST_SYNC_POINT_CALLBACK(
|
|
1290
|
-
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
|
|
1291
|
-
&record_checksum);
|
|
1292
|
-
status = WriteBatchInternal::UpdateProtectionInfo(
|
|
1293
|
-
batch_to_use, 8 /* bytes_per_key */,
|
|
1294
|
-
batch_updated ? nullptr : &record_checksum);
|
|
1295
|
-
if (!status.ok()) {
|
|
1296
|
-
return status;
|
|
1297
|
-
}
|
|
1351
|
+
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
1352
|
+
"Recovered to log #%" PRIu64 " next seq #%" PRIu64, wal_number,
|
|
1353
|
+
*next_sequence);
|
|
1298
1354
|
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
Status::Corruption("sequence " + std::to_string(sequence) +
|
|
1304
|
-
" is too large"));
|
|
1305
|
-
continue;
|
|
1306
|
-
}
|
|
1355
|
+
if (status.ok()) {
|
|
1356
|
+
status = UpdatePredecessorWALInfo(wal_number, last_seqno_observed, fname,
|
|
1357
|
+
predecessor_wal_info);
|
|
1358
|
+
}
|
|
1307
1359
|
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
// will start from the last sequence id we recovered.
|
|
1314
|
-
if (sequence == *next_sequence) {
|
|
1315
|
-
stop_replay_for_corruption = false;
|
|
1316
|
-
}
|
|
1317
|
-
if (stop_replay_for_corruption) {
|
|
1318
|
-
logFileDropped();
|
|
1319
|
-
break;
|
|
1320
|
-
}
|
|
1321
|
-
}
|
|
1360
|
+
if (!status.ok() || old_log_record) {
|
|
1361
|
+
status = HandleNonOkStatusOrOldLogRecord(
|
|
1362
|
+
wal_number, next_sequence, status, reporter, &old_log_record,
|
|
1363
|
+
stop_replay_for_corruption, corrupted_wal_number, corrupted_wal_found);
|
|
1364
|
+
}
|
|
1322
1365
|
|
|
1323
|
-
|
|
1324
|
-
// and returns true.
|
|
1325
|
-
if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, reporter,
|
|
1326
|
-
status, stop_replay_by_wal_filter,
|
|
1327
|
-
*batch_to_use)) {
|
|
1328
|
-
continue;
|
|
1329
|
-
}
|
|
1366
|
+
FinishLogFileProcessing(status, next_sequence);
|
|
1330
1367
|
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
// insert. We don't want to fail the whole write batch in that case --
|
|
1334
|
-
// we just ignore the update.
|
|
1335
|
-
// That's why we set ignore missing column families to true
|
|
1336
|
-
bool has_valid_writes = false;
|
|
1337
|
-
status = WriteBatchInternal::InsertInto(
|
|
1338
|
-
batch_to_use, column_family_memtables_.get(), &flush_scheduler_,
|
|
1339
|
-
&trim_history_scheduler_, true, wal_number, this,
|
|
1340
|
-
false /* concurrent_memtable_writes */, next_sequence,
|
|
1341
|
-
&has_valid_writes, seq_per_batch_, batch_per_txn_);
|
|
1342
|
-
MaybeIgnoreError(&status);
|
|
1343
|
-
if (!status.ok()) {
|
|
1344
|
-
// We are treating this as a failure while reading since we read valid
|
|
1345
|
-
// blocks that do not form coherent data
|
|
1346
|
-
reporter.Corruption(record.size(), status);
|
|
1347
|
-
continue;
|
|
1348
|
-
}
|
|
1368
|
+
return status;
|
|
1369
|
+
}
|
|
1349
1370
|
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
|
|
1356
|
-
cfd->UnrefAndTryDelete();
|
|
1357
|
-
// If this asserts, it means that InsertInto failed in
|
|
1358
|
-
// filtering updates to already-flushed column families
|
|
1359
|
-
assert(cfd->GetLogNumber() <= wal_number);
|
|
1360
|
-
auto iter = version_edits.find(cfd->GetID());
|
|
1361
|
-
assert(iter != version_edits.end());
|
|
1362
|
-
VersionEdit* edit = &iter->second;
|
|
1363
|
-
status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
|
|
1364
|
-
if (!status.ok()) {
|
|
1365
|
-
// Reflect errors immediately so that conditions like full
|
|
1366
|
-
// file-systems cause the DB::Open() to fail.
|
|
1367
|
-
return status;
|
|
1368
|
-
}
|
|
1369
|
-
flushed = true;
|
|
1371
|
+
void DBImpl::SetupLogFileProcessing(uint64_t wal_number) {
|
|
1372
|
+
// The previous incarnation may not have written any MANIFEST
|
|
1373
|
+
// records after allocating this log number. So we manually
|
|
1374
|
+
// update the file number allocation counter in VersionSet.
|
|
1375
|
+
versions_->MarkFileNumberUsed(wal_number);
|
|
1370
1376
|
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1377
|
+
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
1378
|
+
"Recovering log #%" PRIu64 " mode %d", wal_number,
|
|
1379
|
+
static_cast<int>(immutable_db_options_.wal_recovery_mode));
|
|
1380
|
+
}
|
|
1381
|
+
|
|
1382
|
+
Status DBImpl::InitializeLogReader(
|
|
1383
|
+
uint64_t wal_number, bool is_retry, std::string& fname,
|
|
1384
|
+
bool stop_replay_for_corruption, uint64_t min_wal_number,
|
|
1385
|
+
const PredecessorWALInfo& predecessor_wal_info, bool* const old_log_record,
|
|
1386
|
+
Status* const reporter_status, DBOpenLogRecordReadReporter* reporter,
|
|
1387
|
+
std::unique_ptr<log::Reader>& reader) {
|
|
1388
|
+
assert(old_log_record);
|
|
1389
|
+
assert(reporter_status);
|
|
1390
|
+
assert(reporter);
|
|
1391
|
+
|
|
1392
|
+
Status status;
|
|
1393
|
+
|
|
1394
|
+
std::unique_ptr<SequentialFileReader> file_reader;
|
|
1395
|
+
{
|
|
1396
|
+
std::unique_ptr<FSSequentialFile> file;
|
|
1397
|
+
status = fs_->NewSequentialFile(
|
|
1398
|
+
fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
|
|
1399
|
+
if (!status.ok()) {
|
|
1400
|
+
MaybeIgnoreError(&status);
|
|
1401
|
+
return status;
|
|
1375
1402
|
}
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1403
|
+
file_reader.reset(new SequentialFileReader(
|
|
1404
|
+
std::move(file), fname, immutable_db_options_.log_readahead_size,
|
|
1405
|
+
io_tracer_, /*listeners=*/{}, /*rate_limiter=*/nullptr,
|
|
1406
|
+
/*verify_and_reconstruct_read=*/is_retry));
|
|
1407
|
+
}
|
|
1408
|
+
|
|
1409
|
+
// Create the log reader.
|
|
1410
|
+
reporter->env = env_;
|
|
1411
|
+
reporter->info_log = immutable_db_options_.info_log.get();
|
|
1412
|
+
reporter->fname = fname.c_str();
|
|
1413
|
+
reporter->old_log_record = old_log_record;
|
|
1414
|
+
if (!immutable_db_options_.paranoid_checks ||
|
|
1415
|
+
immutable_db_options_.wal_recovery_mode ==
|
|
1388
1416
|
WALRecoveryMode::kSkipAnyCorruptedRecords) {
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1417
|
+
reporter->status = nullptr;
|
|
1418
|
+
} else {
|
|
1419
|
+
reporter->status = reporter_status;
|
|
1420
|
+
}
|
|
1421
|
+
// We intentially make log::Reader do checksumming even if
|
|
1422
|
+
// paranoid_checks==false so that corruptions cause entire commits
|
|
1423
|
+
// to be skipped instead of propagating bad information (like overly
|
|
1424
|
+
// large sequence numbers).
|
|
1425
|
+
reader.reset(new log::Reader(
|
|
1426
|
+
immutable_db_options_.info_log, std::move(file_reader), reporter,
|
|
1427
|
+
true /*checksum*/, wal_number,
|
|
1428
|
+
immutable_db_options_.track_and_verify_wals, stop_replay_for_corruption,
|
|
1429
|
+
min_wal_number, predecessor_wal_info));
|
|
1430
|
+
return status;
|
|
1431
|
+
}
|
|
1432
|
+
|
|
1433
|
+
Status DBImpl::ProcessLogRecord(
|
|
1434
|
+
Slice record, const std::unique_ptr<log::Reader>& reader,
|
|
1435
|
+
const UnorderedMap<uint32_t, size_t>& running_ts_sz, uint64_t wal_number,
|
|
1436
|
+
const std::string& fname, bool read_only, int job_id,
|
|
1437
|
+
const std::function<void()>& logFileDropped,
|
|
1438
|
+
DBOpenLogRecordReadReporter* reporter, uint64_t* record_checksum,
|
|
1439
|
+
SequenceNumber* last_seqno_observed, SequenceNumber* next_sequence,
|
|
1440
|
+
bool* stop_replay_for_corruption, Status* status,
|
|
1441
|
+
bool* stop_replay_by_wal_filter,
|
|
1442
|
+
std::unordered_map<int, VersionEdit>* version_edits, bool* flushed) {
|
|
1443
|
+
assert(reporter);
|
|
1444
|
+
assert(last_seqno_observed);
|
|
1445
|
+
assert(stop_replay_for_corruption);
|
|
1446
|
+
assert(status);
|
|
1447
|
+
assert(stop_replay_by_wal_filter);
|
|
1448
|
+
|
|
1449
|
+
Status process_status;
|
|
1450
|
+
bool has_valid_writes = false;
|
|
1451
|
+
WriteBatch batch;
|
|
1452
|
+
std::unique_ptr<WriteBatch> new_batch;
|
|
1453
|
+
WriteBatch* batch_to_use = nullptr;
|
|
1454
|
+
|
|
1455
|
+
if (record.size() < WriteBatchInternal::kHeader) {
|
|
1456
|
+
reporter->Corruption(record.size(),
|
|
1457
|
+
Status::Corruption("log record too small"));
|
|
1458
|
+
assert(process_status.ok());
|
|
1459
|
+
return process_status;
|
|
1460
|
+
}
|
|
1461
|
+
|
|
1462
|
+
process_status = InitializeWriteBatchForLogRecord(
|
|
1463
|
+
record, reader, running_ts_sz, &batch, new_batch, batch_to_use,
|
|
1464
|
+
record_checksum);
|
|
1465
|
+
if (!process_status.ok()) {
|
|
1466
|
+
return process_status;
|
|
1467
|
+
}
|
|
1468
|
+
assert(batch_to_use);
|
|
1469
|
+
|
|
1470
|
+
*last_seqno_observed = WriteBatchInternal::Sequence(batch_to_use);
|
|
1471
|
+
|
|
1472
|
+
if (*last_seqno_observed > kMaxSequenceNumber) {
|
|
1473
|
+
reporter->Corruption(
|
|
1474
|
+
record.size(),
|
|
1475
|
+
Status::Corruption("sequence " + std::to_string(*last_seqno_observed) +
|
|
1476
|
+
" is too large"));
|
|
1477
|
+
assert(process_status.ok());
|
|
1478
|
+
return process_status;
|
|
1479
|
+
}
|
|
1480
|
+
|
|
1481
|
+
MaybeReviseStopReplayForCorruption(*last_seqno_observed, next_sequence,
|
|
1482
|
+
stop_replay_for_corruption);
|
|
1483
|
+
if (*stop_replay_for_corruption) {
|
|
1484
|
+
logFileDropped();
|
|
1485
|
+
assert(process_status.ok());
|
|
1486
|
+
return process_status;
|
|
1487
|
+
}
|
|
1488
|
+
|
|
1489
|
+
// For the default case of wal_filter == nullptr, always performs no-op
|
|
1490
|
+
// and returns true.
|
|
1491
|
+
if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, *reporter, *status,
|
|
1492
|
+
*stop_replay_by_wal_filter,
|
|
1493
|
+
*batch_to_use)) {
|
|
1494
|
+
assert(process_status.ok());
|
|
1495
|
+
return process_status;
|
|
1496
|
+
} else {
|
|
1497
|
+
// FIXME(hx235): Handle the potential non-okay `status` when
|
|
1498
|
+
// `InvokeWalFilterIfNeededOnWalRecord()` returns true
|
|
1499
|
+
status->PermitUncheckedError();
|
|
1500
|
+
}
|
|
1501
|
+
|
|
1502
|
+
assert(process_status.ok());
|
|
1503
|
+
process_status = InsertLogRecordToMemtable(batch_to_use, wal_number,
|
|
1504
|
+
next_sequence, &has_valid_writes);
|
|
1505
|
+
MaybeIgnoreError(&process_status);
|
|
1506
|
+
// We are treating this as a failure while reading since we read valid
|
|
1507
|
+
// blocks that do not form coherent data
|
|
1508
|
+
if (!process_status.ok()) {
|
|
1509
|
+
// FIXME(hx235): `reporter->Corruption()` will override the non-ok status
|
|
1510
|
+
// set in `InvokeWalFilterIfNeededOnWalRecord` through passing `*status`
|
|
1511
|
+
reporter->Corruption(record.size(), process_status);
|
|
1512
|
+
process_status = Status::OK();
|
|
1513
|
+
return process_status;
|
|
1514
|
+
}
|
|
1515
|
+
|
|
1516
|
+
process_status = MaybeWriteLevel0TableForRecovery(
|
|
1517
|
+
has_valid_writes, read_only, wal_number, job_id, next_sequence,
|
|
1518
|
+
version_edits, flushed);
|
|
1519
|
+
|
|
1520
|
+
return process_status;
|
|
1521
|
+
}
|
|
1522
|
+
|
|
1523
|
+
// We create a new batch and initialize with a valid prot_info_ to store
|
|
1524
|
+
// the data checksum
|
|
1525
|
+
Status DBImpl::InitializeWriteBatchForLogRecord(
|
|
1526
|
+
Slice record, const std::unique_ptr<log::Reader>& reader,
|
|
1527
|
+
const UnorderedMap<uint32_t, size_t>& running_ts_sz, WriteBatch* batch,
|
|
1528
|
+
std::unique_ptr<WriteBatch>& new_batch, WriteBatch*& batch_to_use,
|
|
1529
|
+
uint64_t* record_checksum) {
|
|
1530
|
+
assert(batch);
|
|
1531
|
+
assert(record_checksum);
|
|
1532
|
+
|
|
1533
|
+
Status status = WriteBatchInternal::SetContents(batch, record);
|
|
1534
|
+
if (!status.ok()) {
|
|
1535
|
+
return status;
|
|
1536
|
+
}
|
|
1537
|
+
|
|
1538
|
+
const UnorderedMap<uint32_t, size_t>& record_ts_sz =
|
|
1539
|
+
reader->GetRecordedTimestampSize();
|
|
1540
|
+
status = HandleWriteBatchTimestampSizeDifference(
|
|
1541
|
+
batch, running_ts_sz, record_ts_sz,
|
|
1542
|
+
TimestampSizeConsistencyMode::kReconcileInconsistency, seq_per_batch_,
|
|
1543
|
+
batch_per_txn_, &new_batch);
|
|
1544
|
+
if (!status.ok()) {
|
|
1545
|
+
return status;
|
|
1546
|
+
}
|
|
1547
|
+
|
|
1548
|
+
bool batch_updated = new_batch != nullptr;
|
|
1549
|
+
batch_to_use = batch_updated ? new_batch.get() : batch;
|
|
1550
|
+
TEST_SYNC_POINT_CALLBACK(
|
|
1551
|
+
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", batch_to_use);
|
|
1552
|
+
TEST_SYNC_POINT_CALLBACK(
|
|
1553
|
+
"DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
|
|
1554
|
+
record_checksum);
|
|
1555
|
+
status = WriteBatchInternal::UpdateProtectionInfo(
|
|
1556
|
+
batch_to_use, 8 /* bytes_per_key */,
|
|
1557
|
+
batch_updated ? nullptr : record_checksum);
|
|
1558
|
+
|
|
1559
|
+
return status;
|
|
1560
|
+
}
|
|
1561
|
+
|
|
1562
|
+
// In kPointInTimeRecovery mode, clears `*stop_replay_for_corruption` when
// `sequence` equals `*next_sequence`. In every other WAL recovery mode this
// is a no-op and the pointer arguments are not touched.
void DBImpl::MaybeReviseStopReplayForCorruption(
    SequenceNumber sequence, SequenceNumber const* const next_sequence,
    bool* stop_replay_for_corruption) {
  if (immutable_db_options_.wal_recovery_mode !=
      WALRecoveryMode::kPointInTimeRecovery) {
    return;
  }

  assert(next_sequence);
  assert(stop_replay_for_corruption);

  // In point-in-time recovery mode, if sequence id of log files are
  // consecutive, we continue recovery despite corruption. This could
  // happen when we open and write to a corrupted DB, where sequence id
  // will start from the last sequence id we recovered.
  const bool seqno_is_consecutive = (sequence == *next_sequence);
  if (seqno_is_consecutive) {
    *stop_replay_for_corruption = false;
  }
}
|
|
1578
|
+
|
|
1579
|
+
// Replays `batch_to_use` into the memtables through
// WriteBatchInternal::InsertInto() and returns its status.
// `next_sequence` and `has_valid_writes` are forwarded to InsertInto() as
// pointer arguments.
Status DBImpl::InsertLogRecordToMemtable(WriteBatch* batch_to_use,
                                         uint64_t wal_number,
                                         SequenceNumber* next_sequence,
                                         bool* has_valid_writes) {
  assert(batch_to_use);
  assert(has_valid_writes);

  // If column family was not found, it might mean that the WAL write
  // batch references to the column family that was dropped after the
  // insert. We don't want to fail the whole write batch in that case --
  // we just ignore the update.
  // That's why we set ignore missing column families to true
  constexpr bool kIgnoreMissingColumnFamilies = true;
  constexpr bool kConcurrentMemtableWrites = false;

  return WriteBatchInternal::InsertInto(
      batch_to_use, column_family_memtables_.get(), &flush_scheduler_,
      &trim_history_scheduler_, kIgnoreMissingColumnFamilies, wal_number, this,
      kConcurrentMemtableWrites, next_sequence, has_valid_writes,
      seq_per_batch_, batch_per_txn_);
}
|
|
1597
|
+
|
|
1598
|
+
// When the replayed record produced valid writes and the DB is not opened
// read-only, drains `flush_scheduler_` and writes a level-0 table for each
// column family it yields. After each successful write, `*flushed` is set and
// the column family gets a new memtable created with `*next_sequence - 1`.
//
// Returns OK when there is nothing to do or every write succeeded; otherwise
// returns the first failing status and stops draining.
Status DBImpl::MaybeWriteLevel0TableForRecovery(
    bool has_valid_writes, bool read_only, uint64_t wal_number, int job_id,
    SequenceNumber const* const next_sequence,
    std::unordered_map<int, VersionEdit>* version_edits, bool* flushed) {
  assert(next_sequence);
  assert(version_edits);
  assert(flushed);

  if (!has_valid_writes || read_only) {
    return Status::OK();
  }

  // we can do this because this is called before client has access to the
  // DB and there is only a single thread operating on DB
  for (ColumnFamilyData* cfd = flush_scheduler_.TakeNextColumnFamily();
       cfd != nullptr; cfd = flush_scheduler_.TakeNextColumnFamily()) {
    cfd->UnrefAndTryDelete();
    // If this asserts, it means that InsertInto failed in
    // filtering updates to already-flushed column families
    assert(cfd->GetLogNumber() <= wal_number);
    (void)wal_number;

    const auto edit_it = version_edits->find(cfd->GetID());
    assert(edit_it != version_edits->end());

    Status flush_status =
        WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), &edit_it->second);
    if (!flush_status.ok()) {
      // Reflect errors immediately so that conditions like full
      // file-systems cause the DB::Open() to fail.
      return flush_status;
    }
    *flushed = true;

    cfd->CreateNewMemtable(*next_sequence - 1);
  }
  return Status::OK();
}
|
|
1634
|
+
|
|
1635
|
+
// Decides how WAL replay should react to a non-OK `status` or an old log
// record, based on `immutable_db_options_.wal_recovery_mode`.
//
// Preconditions: `status` is non-OK or `*old_log_record` is true.
// `next_sequence`, `old_log_record`, `stop_replay_for_corruption` and
// `corrupted_wal_number` must be non-null; `corrupted_wal_found` may be null.
//
// Behavior:
//   - NotSupported status: returned unchanged in every mode.
//   - kSkipAnyCorruptedRecords: returns OK without touching the out-params.
//   - kPointInTimeRecovery: an IOError is logged and returned. Anything else
//     clears `*old_log_record`, sets `*stop_replay_for_corruption`, records
//     the corrupted WAL number (the reporter's value when it has one, else
//     `wal_number`), sets `*corrupted_wal_found` when provided, and returns
//     OK.
//   - kTolerateCorruptedTailRecords / kAbsoluteConsistency: returns `status`
//     unchanged.
Status DBImpl::HandleNonOkStatusOrOldLogRecord(
    uint64_t wal_number, SequenceNumber const* const next_sequence,
    Status status, const DBOpenLogRecordReadReporter& reporter,
    bool* old_log_record, bool* stop_replay_for_corruption,
    uint64_t* corrupted_wal_number, bool* corrupted_wal_found) {
  // Validate the pointers first: the precondition check below dereferences
  // `old_log_record`, so it must come after the null check.
  assert(next_sequence);
  assert(old_log_record);
  assert(stop_replay_for_corruption);
  assert(corrupted_wal_number);

  assert(!status.ok() || *old_log_record);

  if (status.IsNotSupported()) {
    // We should not treat NotSupported as corruption. It is rather a clear
    // sign that we are processing a WAL that is produced by an incompatible
    // version of the code.
    return status;
  }

  if (immutable_db_options_.wal_recovery_mode ==
      WALRecoveryMode::kSkipAnyCorruptedRecords) {
    // We should ignore all errors unconditionally
    return Status::OK();
  } else if (immutable_db_options_.wal_recovery_mode ==
             WALRecoveryMode::kPointInTimeRecovery) {
    if (status.IsIOError()) {
      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
                      "IOError during point-in-time reading log #%" PRIu64
                      " seq #%" PRIu64
                      ". %s. This likely mean loss of synced WAL, "
                      "thus recovery fails.",
                      wal_number, *next_sequence, status.ToString().c_str());
      return status;
    }
    // We should ignore the error but not continue replaying
    *old_log_record = false;
    *stop_replay_for_corruption = true;
    // TODO(hx235): have a single source of corrupted WAL number once we
    // consolidate the statuses
    uint64_t reporter_corrupted_wal_number = reporter.GetCorruptedLogNumber();
    // kMaxSequenceNumber is compared against as the reporter's "no corrupted
    // WAL recorded" value; fall back to the WAL being processed in that case.
    *corrupted_wal_number = reporter_corrupted_wal_number != kMaxSequenceNumber
                                ? reporter_corrupted_wal_number
                                : wal_number;
    if (corrupted_wal_found != nullptr) {
      *corrupted_wal_found = true;
    }
    return Status::OK();
  } else {
    assert(immutable_db_options_.wal_recovery_mode ==
               WALRecoveryMode::kTolerateCorruptedTailRecords ||
           immutable_db_options_.wal_recovery_mode ==
               WALRecoveryMode::kAbsoluteConsistency);
    return status;
  }
}
|
|
1690
|
+
|
|
1691
|
+
// Rebuilds `predecessor_wal_info` for the WAL `wal_number` stored at `fname`,
// using the file size reported by `env_->GetFileSize()` and
// `last_seqno_observed`.
//
// The "DBImpl::UpdatePredecessorWALInfo" sync point receives a pointer to a
// (wal_number, SequenceNumber*) pair. If a callback writes a value other than
// kMaxSequenceNumber through that pointer, it replaces `last_seqno_observed`.
//
// Returns the GetFileSize() status; on failure `predecessor_wal_info` is left
// untouched.
Status DBImpl::UpdatePredecessorWALInfo(
    uint64_t wal_number, const SequenceNumber last_seqno_observed,
    const std::string& fname, PredecessorWALInfo& predecessor_wal_info) {
  uint64_t wal_size_bytes;

  Status s = env_->GetFileSize(fname, &wal_size_bytes);
  if (s.ok()) {
    // kMaxSequenceNumber means "no override supplied by the sync point".
    SequenceNumber seqno_override = kMaxSequenceNumber;
    [[maybe_unused]] std::pair<uint64_t, SequenceNumber*> sync_point_arg(
        wal_number, &seqno_override);
    TEST_SYNC_POINT_CALLBACK("DBImpl::UpdatePredecessorWALInfo",
                             &sync_point_arg);

    const SequenceNumber last_seqno_recorded =
        seqno_override == kMaxSequenceNumber ? last_seqno_observed
                                             : seqno_override;
    predecessor_wal_info =
        PredecessorWALInfo(wal_number, wal_size_bytes, last_seqno_recorded);
  }
  return s;
}
|
|
1711
|
+
|
|
1712
|
+
void DBImpl::FinishLogFileProcessing(const Status& status,
|
|
1713
|
+
const SequenceNumber* next_sequence) {
|
|
1714
|
+
if (status.ok()) {
|
|
1715
|
+
assert(next_sequence);
|
|
1420
1716
|
flush_scheduler_.Clear();
|
|
1421
1717
|
trim_history_scheduler_.Clear();
|
|
1422
1718
|
auto last_sequence = *next_sequence - 1;
|
|
@@ -1427,6 +1723,12 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1427
1723
|
versions_->SetLastSequence(last_sequence);
|
|
1428
1724
|
}
|
|
1429
1725
|
}
|
|
1726
|
+
}
|
|
1727
|
+
|
|
1728
|
+
Status DBImpl::MaybeHandleStopReplayForCorruptionForInconsistency(
|
|
1729
|
+
bool stop_replay_for_corruption, uint64_t corrupted_wal_number) {
|
|
1730
|
+
Status status;
|
|
1731
|
+
|
|
1430
1732
|
// Compare the corrupted log number to all columnfamily's current log number.
|
|
1431
1733
|
// Abort Open() if any column family's log number is greater than
|
|
1432
1734
|
// the corrupted log number, which means CF contains data beyond the point of
|
|
@@ -1462,12 +1764,22 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1462
1764
|
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
|
|
1463
1765
|
"Column family inconsistency: SST file contains data"
|
|
1464
1766
|
" beyond the point of corruption.");
|
|
1465
|
-
|
|
1466
|
-
|
|
1767
|
+
status = Status::Corruption("SST file is ahead of WALs in CF " +
|
|
1768
|
+
cfd->GetName());
|
|
1769
|
+
return status;
|
|
1467
1770
|
}
|
|
1468
1771
|
}
|
|
1469
1772
|
}
|
|
1773
|
+
return status;
|
|
1774
|
+
}
|
|
1775
|
+
|
|
1776
|
+
Status DBImpl::MaybeFlushFinalMemtableOrRestoreActiveLogFiles(
|
|
1777
|
+
const std::vector<uint64_t>& wal_numbers, bool read_only, int job_id,
|
|
1778
|
+
bool flushed, std::unordered_map<int, VersionEdit>* version_edits,
|
|
1779
|
+
RecoveryContext* recovery_ctx) {
|
|
1780
|
+
assert(version_edits);
|
|
1470
1781
|
|
|
1782
|
+
Status status;
|
|
1471
1783
|
// True if there's any data in the WALs; if not, we can skip re-processing
|
|
1472
1784
|
// them later
|
|
1473
1785
|
bool data_seen = false;
|
|
@@ -1476,8 +1788,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1476
1788
|
// to the DB and can not drop column families while we iterate
|
|
1477
1789
|
const WalNumber max_wal_number = wal_numbers.back();
|
|
1478
1790
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
1479
|
-
auto iter = version_edits
|
|
1480
|
-
assert(iter != version_edits
|
|
1791
|
+
auto iter = version_edits->find(cfd->GetID());
|
|
1792
|
+
assert(iter != version_edits->end());
|
|
1481
1793
|
VersionEdit* edit = &iter->second;
|
|
1482
1794
|
|
|
1483
1795
|
if (cfd->GetLogNumber() > max_wal_number) {
|
|
@@ -1506,8 +1818,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1506
1818
|
}
|
|
1507
1819
|
flushed = true;
|
|
1508
1820
|
|
|
1509
|
-
cfd->CreateNewMemtable(
|
|
1510
|
-
versions_->LastSequence());
|
|
1821
|
+
cfd->CreateNewMemtable(versions_->LastSequence());
|
|
1511
1822
|
}
|
|
1512
1823
|
data_seen = true;
|
|
1513
1824
|
}
|
|
@@ -1533,8 +1844,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1533
1844
|
assert(recovery_ctx != nullptr);
|
|
1534
1845
|
|
|
1535
1846
|
for (auto* cfd : *versions_->GetColumnFamilySet()) {
|
|
1536
|
-
auto iter = version_edits
|
|
1537
|
-
assert(iter != version_edits
|
|
1847
|
+
auto iter = version_edits->find(cfd->GetID());
|
|
1848
|
+
assert(iter != version_edits->end());
|
|
1538
1849
|
recovery_ctx->UpdateVersionEdits(cfd, iter->second);
|
|
1539
1850
|
}
|
|
1540
1851
|
|
|
@@ -1567,16 +1878,32 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|
|
1567
1878
|
.PermitUncheckedError();
|
|
1568
1879
|
}
|
|
1569
1880
|
}
|
|
1881
|
+
return status;
|
|
1882
|
+
}
|
|
1883
|
+
|
|
1884
|
+
// Checks that the next sequence number did not decrease during recovery.
//
// Returns OK when `prev_next_seqno` is kMaxSequenceNumber (compared against
// as a sentinel that skips the check) or when it does not exceed
// `current_next_seqno`. Otherwise returns Corruption with a message that
// carries both values.
Status DBImpl::CheckSeqnoNotSetBackDuringRecovery(
    SequenceNumber prev_next_seqno, SequenceNumber current_next_seqno) {
  const bool has_prev_seqno = (prev_next_seqno != kMaxSequenceNumber);
  if (has_prev_seqno && prev_next_seqno > current_next_seqno) {
    std::string msg =
        "Sequence number is being set backwards during recovery, this is likely "
        "a software bug or a data corruption. Prev next seqno: ";
    msg += std::to_string(prev_next_seqno);
    msg += " , current next seqno: ";
    msg += std::to_string(current_next_seqno);
    return Status::Corruption(msg);
  }
  return Status::OK();
}
|
|
1570
1897
|
|
|
1898
|
+
void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) {
|
|
1571
1899
|
event_logger_.Log() << "job" << job_id << "event"
|
|
1572
|
-
<< "recovery_finished"
|
|
1573
|
-
|
|
1574
|
-
return status;
|
|
1900
|
+
<< (status.ok() ? "recovery_finished" : "recovery_failed")
|
|
1901
|
+
<< "status" << status.ToString();
|
|
1575
1902
|
}
|
|
1576
1903
|
|
|
1577
1904
|
Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
|
|
1578
|
-
|
|
1579
|
-
|
|
1905
|
+
WalFileNumberSize* log_ptr) {
|
|
1906
|
+
WalFileNumberSize log(wal_number);
|
|
1580
1907
|
std::string fname =
|
|
1581
1908
|
LogFileName(immutable_db_options_.GetWalDir(), wal_number);
|
|
1582
1909
|
Status s;
|
|
@@ -1619,27 +1946,27 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
|
|
|
1619
1946
|
assert(immutable_db_options_.avoid_flush_during_recovery);
|
|
1620
1947
|
// Mark these as alive so they'll be considered for deletion later by
|
|
1621
1948
|
// FindObsoleteFiles()
|
|
1622
|
-
|
|
1623
|
-
|
|
1949
|
+
wals_total_size_.StoreRelaxed(0);
|
|
1950
|
+
wal_empty_ = false;
|
|
1624
1951
|
uint64_t min_wal_with_unflushed_data =
|
|
1625
1952
|
versions_->MinLogNumberWithUnflushedData();
|
|
1626
1953
|
for (auto wal_number : wal_numbers) {
|
|
1627
1954
|
if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
|
|
1628
1955
|
// In non-2pc mode, the WAL files not backing unflushed data are not
|
|
1629
|
-
// alive, thus should not be added to the
|
|
1956
|
+
// alive, thus should not be added to the alive_wal_files_.
|
|
1630
1957
|
continue;
|
|
1631
1958
|
}
|
|
1632
1959
|
// We preallocate space for wals, but then after a crash and restart, those
|
|
1633
1960
|
// preallocated space are not needed anymore. It is likely only the last
|
|
1634
1961
|
// log has such preallocated space, so we only truncate for the last log.
|
|
1635
|
-
|
|
1962
|
+
WalFileNumberSize log;
|
|
1636
1963
|
s = GetLogSizeAndMaybeTruncate(
|
|
1637
1964
|
wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
|
|
1638
1965
|
if (!s.ok()) {
|
|
1639
1966
|
break;
|
|
1640
1967
|
}
|
|
1641
|
-
|
|
1642
|
-
|
|
1968
|
+
wals_total_size_.FetchAddRelaxed(log.size);
|
|
1969
|
+
alive_wal_files_.push_back(log);
|
|
1643
1970
|
}
|
|
1644
1971
|
return s;
|
|
1645
1972
|
}
|
|
@@ -1672,7 +1999,10 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1672
1999
|
assert(ucmp);
|
|
1673
2000
|
const size_t ts_sz = ucmp->timestamp_size();
|
|
1674
2001
|
const bool logical_strip_timestamp =
|
|
1675
|
-
ts_sz > 0 && !cfd->ioptions()
|
|
2002
|
+
ts_sz > 0 && !cfd->ioptions().persist_user_defined_timestamps;
|
|
2003
|
+
// Note that here we treat flush as level 0 compaction in internal stats
|
|
2004
|
+
InternalStats::CompactionStats flush_stats(CompactionReason::kFlush,
|
|
2005
|
+
1 /* count */);
|
|
1676
2006
|
{
|
|
1677
2007
|
ScopedArenaPtr<InternalIterator> iter(
|
|
1678
2008
|
logical_strip_timestamp
|
|
@@ -1688,10 +2018,10 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1688
2018
|
cfd->GetName().c_str(), meta.fd.GetNumber());
|
|
1689
2019
|
|
|
1690
2020
|
// Get the latest mutable cf options while the mutex is still locked
|
|
1691
|
-
const MutableCFOptions
|
|
1692
|
-
|
|
2021
|
+
const MutableCFOptions mutable_cf_options_copy =
|
|
2022
|
+
cfd->GetLatestMutableCFOptions();
|
|
1693
2023
|
bool paranoid_file_checks =
|
|
1694
|
-
cfd->GetLatestMutableCFOptions()
|
|
2024
|
+
cfd->GetLatestMutableCFOptions().paranoid_file_checks;
|
|
1695
2025
|
|
|
1696
2026
|
int64_t _current_time = 0;
|
|
1697
2027
|
immutable_db_options_.clock->GetCurrentTime(&_current_time)
|
|
@@ -1700,8 +2030,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1700
2030
|
meta.oldest_ancester_time = current_time;
|
|
1701
2031
|
meta.epoch_number = cfd->NewEpochNumber();
|
|
1702
2032
|
{
|
|
1703
|
-
auto write_hint =
|
|
1704
|
-
|
|
2033
|
+
auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint(
|
|
2034
|
+
/*level=*/0,
|
|
2035
|
+
immutable_db_options_.calculate_sst_write_lifetime_hint_set);
|
|
1705
2036
|
mutex_.Unlock();
|
|
1706
2037
|
|
|
1707
2038
|
SequenceNumber earliest_write_conflict_snapshot;
|
|
@@ -1733,30 +2064,31 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1733
2064
|
const WriteOptions write_option(Env::IO_HIGH, Env::IOActivity::kDBOpen);
|
|
1734
2065
|
|
|
1735
2066
|
TableBuilderOptions tboptions(
|
|
1736
|
-
|
|
2067
|
+
cfd->ioptions(), mutable_cf_options_copy, read_option, write_option,
|
|
1737
2068
|
cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(),
|
|
1738
|
-
GetCompressionFlush(
|
|
1739
|
-
|
|
1740
|
-
0 /* level */, current_time /* newest_key_time */,
|
|
2069
|
+
GetCompressionFlush(cfd->ioptions(), mutable_cf_options_copy),
|
|
2070
|
+
mutable_cf_options_copy.compression_opts, cfd->GetID(),
|
|
2071
|
+
cfd->GetName(), 0 /* level */, current_time /* newest_key_time */,
|
|
1741
2072
|
false /* is_bottommost */, TableFileCreationReason::kRecovery,
|
|
1742
2073
|
0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_,
|
|
1743
2074
|
db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber(),
|
|
1744
2075
|
kMaxSequenceNumber);
|
|
1745
2076
|
Version* version = cfd->current();
|
|
1746
2077
|
version->Ref();
|
|
1747
|
-
|
|
1748
|
-
s = BuildTable(
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
2078
|
+
TableProperties temp_table_proerties;
|
|
2079
|
+
s = BuildTable(
|
|
2080
|
+
dbname_, versions_.get(), immutable_db_options_, tboptions,
|
|
2081
|
+
file_options_for_compaction_, cfd->table_cache(), iter.get(),
|
|
2082
|
+
std::move(range_del_iters), &meta, &blob_file_additions,
|
|
2083
|
+
snapshot_seqs, earliest_snapshot, earliest_write_conflict_snapshot,
|
|
2084
|
+
kMaxSequenceNumber, snapshot_checker, paranoid_file_checks,
|
|
2085
|
+
cfd->internal_stats(), &io_s, io_tracer_,
|
|
2086
|
+
BlobFileCreationReason::kRecovery,
|
|
2087
|
+
nullptr /* seqno_to_time_mapping */, &event_logger_, job_id,
|
|
2088
|
+
&temp_table_proerties /* table_properties */, write_hint,
|
|
2089
|
+
nullptr /*full_history_ts_low*/, &blob_callback_, version,
|
|
2090
|
+
nullptr /* memtable_payload_bytes */,
|
|
2091
|
+
nullptr /* memtable_garbage_bytes */, &flush_stats);
|
|
1760
2092
|
version->Unref();
|
|
1761
2093
|
LogFlush(immutable_db_options_.info_log);
|
|
1762
2094
|
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
|
|
@@ -1772,10 +2104,31 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1772
2104
|
}
|
|
1773
2105
|
|
|
1774
2106
|
uint64_t total_num_entries = mem->NumEntries();
|
|
1775
|
-
if (s.ok() && total_num_entries !=
|
|
2107
|
+
if (s.ok() && total_num_entries != flush_stats.num_input_records) {
|
|
1776
2108
|
std::string msg = "Expected " + std::to_string(total_num_entries) +
|
|
1777
2109
|
" entries in memtable, but read " +
|
|
1778
|
-
std::to_string(
|
|
2110
|
+
std::to_string(flush_stats.num_input_records);
|
|
2111
|
+
ROCKS_LOG_WARN(immutable_db_options_.info_log,
|
|
2112
|
+
"[%s] [JOB %d] Level-0 flush during recover: %s",
|
|
2113
|
+
cfd->GetName().c_str(), job_id, msg.c_str());
|
|
2114
|
+
if (immutable_db_options_.flush_verify_memtable_count) {
|
|
2115
|
+
s = Status::Corruption(msg);
|
|
2116
|
+
}
|
|
2117
|
+
}
|
|
2118
|
+
// Only verify on table with format collects table properties
|
|
2119
|
+
const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
|
|
2120
|
+
if (s.ok() &&
|
|
2121
|
+
(mutable_cf_options.table_factory->IsInstanceOf(
|
|
2122
|
+
TableFactory::kBlockBasedTableName()) ||
|
|
2123
|
+
mutable_cf_options.table_factory->IsInstanceOf(
|
|
2124
|
+
TableFactory::kPlainTableName())) &&
|
|
2125
|
+
flush_stats.num_output_records != temp_table_proerties.num_entries) {
|
|
2126
|
+
std::string msg =
|
|
2127
|
+
"Number of keys in flush output SST files does not match "
|
|
2128
|
+
"number of keys added to the table. Expected " +
|
|
2129
|
+
std::to_string(flush_stats.num_output_records) + " but there are " +
|
|
2130
|
+
std::to_string(temp_table_proerties.num_entries) +
|
|
2131
|
+
" in output SST files";
|
|
1779
2132
|
ROCKS_LOG_WARN(immutable_db_options_.info_log,
|
|
1780
2133
|
"[%s] [JOB %d] Level-0 flush during recover: %s",
|
|
1781
2134
|
cfd->GetName().c_str(), job_id, msg.c_str());
|
|
@@ -1823,30 +2176,31 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
1823
2176
|
}
|
|
1824
2177
|
}
|
|
1825
2178
|
|
|
1826
|
-
|
|
1827
|
-
stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
|
|
2179
|
+
flush_stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
|
|
1828
2180
|
|
|
1829
2181
|
if (has_output) {
|
|
1830
|
-
|
|
1831
|
-
|
|
2182
|
+
flush_stats.bytes_written = meta.fd.GetFileSize();
|
|
2183
|
+
flush_stats.num_output_files = 1;
|
|
1832
2184
|
}
|
|
1833
2185
|
|
|
1834
2186
|
const auto& blobs = edit->GetBlobFileAdditions();
|
|
1835
2187
|
for (const auto& blob : blobs) {
|
|
1836
|
-
|
|
2188
|
+
flush_stats.bytes_written_blob += blob.GetTotalBlobBytes();
|
|
1837
2189
|
}
|
|
1838
2190
|
|
|
1839
|
-
|
|
2191
|
+
flush_stats.num_output_files_blob = static_cast<int>(blobs.size());
|
|
1840
2192
|
|
|
1841
|
-
cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER,
|
|
2193
|
+
cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER,
|
|
2194
|
+
flush_stats);
|
|
1842
2195
|
cfd->internal_stats()->AddCFStats(
|
|
1843
2196
|
InternalStats::BYTES_FLUSHED,
|
|
1844
|
-
|
|
2197
|
+
flush_stats.bytes_written + flush_stats.bytes_written_blob);
|
|
1845
2198
|
RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
|
|
1846
2199
|
return s;
|
|
1847
2200
|
}
|
|
1848
2201
|
|
|
1849
|
-
Status DB::Open(const Options& options, const std::string& dbname,
|
|
2202
|
+
Status DB::Open(const Options& options, const std::string& dbname,
|
|
2203
|
+
std::unique_ptr<DB>* dbptr) {
|
|
1850
2204
|
DBOptions db_options(options);
|
|
1851
2205
|
ColumnFamilyOptions cf_options(options);
|
|
1852
2206
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
@@ -1874,7 +2228,8 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
|
|
|
1874
2228
|
|
|
1875
2229
|
Status DB::Open(const DBOptions& db_options, const std::string& dbname,
|
|
1876
2230
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
1877
|
-
std::vector<ColumnFamilyHandle*>* handles,
|
|
2231
|
+
std::vector<ColumnFamilyHandle*>* handles,
|
|
2232
|
+
std::unique_ptr<DB>* dbptr) {
|
|
1878
2233
|
const bool kSeqPerBatch = true;
|
|
1879
2234
|
const bool kBatchPerTxn = true;
|
|
1880
2235
|
ThreadStatusUtil::SetEnableTracking(db_options.enable_thread_tracking);
|
|
@@ -1896,7 +2251,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
1896
2251
|
Status DB::OpenAndTrimHistory(
|
|
1897
2252
|
const DBOptions& db_options, const std::string& dbname,
|
|
1898
2253
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
1899
|
-
std::vector<ColumnFamilyHandle*>* handles, DB
|
|
2254
|
+
std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr,
|
|
1900
2255
|
std::string trim_ts) {
|
|
1901
2256
|
assert(dbptr != nullptr);
|
|
1902
2257
|
assert(handles != nullptr);
|
|
@@ -1951,13 +2306,14 @@ Status DB::OpenAndTrimHistory(
|
|
|
1951
2306
|
return s;
|
|
1952
2307
|
}
|
|
1953
2308
|
|
|
1954
|
-
|
|
2309
|
+
dbptr->reset(db);
|
|
1955
2310
|
return s;
|
|
1956
2311
|
}
|
|
1957
2312
|
|
|
1958
2313
|
IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
|
|
1959
2314
|
uint64_t log_file_num, uint64_t recycle_log_number,
|
|
1960
2315
|
size_t preallocate_block_size,
|
|
2316
|
+
const PredecessorWALInfo& predecessor_wal_info,
|
|
1961
2317
|
log::Writer** new_log) {
|
|
1962
2318
|
IOStatus io_s;
|
|
1963
2319
|
std::unique_ptr<FSWritableFile> lfile;
|
|
@@ -1966,6 +2322,7 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
|
|
|
1966
2322
|
BuildDBOptions(immutable_db_options_, mutable_db_options_);
|
|
1967
2323
|
FileOptions opt_file_options =
|
|
1968
2324
|
fs_->OptimizeForLogWrite(file_options_, db_options);
|
|
2325
|
+
opt_file_options.write_hint = CalculateWALWriteHint();
|
|
1969
2326
|
// DB option takes precedence when not kUnknown
|
|
1970
2327
|
if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
|
|
1971
2328
|
opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
|
|
@@ -1987,7 +2344,9 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
|
|
|
1987
2344
|
}
|
|
1988
2345
|
|
|
1989
2346
|
if (io_s.ok()) {
|
|
1990
|
-
|
|
2347
|
+
// Subsequent attempts to override the hint via SetWriteLifeTimeHint
|
|
2348
|
+
// with the very same value will be ignored by the fs.
|
|
2349
|
+
lfile->SetWriteLifeTimeHint(opt_file_options.write_hint);
|
|
1991
2350
|
lfile->SetPreallocationBlockSize(preallocate_block_size);
|
|
1992
2351
|
|
|
1993
2352
|
const auto& listeners = immutable_db_options_.listeners;
|
|
@@ -2001,9 +2360,15 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
|
|
|
2001
2360
|
*new_log = new log::Writer(std::move(file_writer), log_file_num,
|
|
2002
2361
|
immutable_db_options_.recycle_log_file_num > 0,
|
|
2003
2362
|
immutable_db_options_.manual_wal_flush,
|
|
2004
|
-
immutable_db_options_.wal_compression
|
|
2363
|
+
immutable_db_options_.wal_compression,
|
|
2364
|
+
immutable_db_options_.track_and_verify_wals);
|
|
2005
2365
|
io_s = (*new_log)->AddCompressionTypeRecord(write_options);
|
|
2366
|
+
if (io_s.ok()) {
|
|
2367
|
+
io_s = (*new_log)->MaybeAddPredecessorWALInfo(write_options,
|
|
2368
|
+
predecessor_wal_info);
|
|
2369
|
+
}
|
|
2006
2370
|
}
|
|
2371
|
+
|
|
2007
2372
|
return io_s;
|
|
2008
2373
|
}
|
|
2009
2374
|
|
|
@@ -2014,9 +2379,10 @@ void DBImpl::TrackExistingDataFiles(
|
|
|
2014
2379
|
|
|
2015
2380
|
Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
2016
2381
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
2017
|
-
std::vector<ColumnFamilyHandle*>* handles,
|
|
2018
|
-
|
|
2019
|
-
const bool
|
|
2382
|
+
std::vector<ColumnFamilyHandle*>* handles,
|
|
2383
|
+
std::unique_ptr<DB>* dbptr, const bool seq_per_batch,
|
|
2384
|
+
const bool batch_per_txn, const bool is_retry,
|
|
2385
|
+
bool* can_retry) {
|
|
2020
2386
|
const WriteOptions write_options(Env::IOActivity::kDBOpen);
|
|
2021
2387
|
const ReadOptions read_options(Env::IOActivity::kDBOpen);
|
|
2022
2388
|
|
|
@@ -2035,15 +2401,17 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2035
2401
|
handles->clear();
|
|
2036
2402
|
|
|
2037
2403
|
size_t max_write_buffer_size = 0;
|
|
2404
|
+
MinAndMaxPreserveSeconds preserve_info;
|
|
2038
2405
|
for (const auto& cf : column_families) {
|
|
2039
2406
|
max_write_buffer_size =
|
|
2040
2407
|
std::max(max_write_buffer_size, cf.options.write_buffer_size);
|
|
2408
|
+
preserve_info.Combine(cf.options);
|
|
2041
2409
|
}
|
|
2042
2410
|
|
|
2043
|
-
|
|
2411
|
+
auto impl = std::make_unique<DBImpl>(db_options, dbname, seq_per_batch,
|
|
2412
|
+
batch_per_txn);
|
|
2044
2413
|
if (!impl->immutable_db_options_.info_log) {
|
|
2045
2414
|
s = impl->init_logger_creation_s_;
|
|
2046
|
-
delete impl;
|
|
2047
2415
|
return s;
|
|
2048
2416
|
} else {
|
|
2049
2417
|
assert(impl->init_logger_creation_s_.ok());
|
|
@@ -2076,7 +2444,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2076
2444
|
s = impl->CreateArchivalDirectory();
|
|
2077
2445
|
}
|
|
2078
2446
|
if (!s.ok()) {
|
|
2079
|
-
delete impl;
|
|
2080
2447
|
return s;
|
|
2081
2448
|
}
|
|
2082
2449
|
|
|
@@ -2096,23 +2463,29 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2096
2463
|
log::Writer* new_log = nullptr;
|
|
2097
2464
|
const size_t preallocate_block_size =
|
|
2098
2465
|
impl->GetWalPreallocateBlockSize(max_write_buffer_size);
|
|
2466
|
+
// TODO(hx235): Pass in the correct `predecessor_wal_info` for the first WAL
|
|
2467
|
+
// created during DB open with predecessor WALs from previous DB session due
|
|
2468
|
+
// to `avoid_flush_during_recovery == true`. This can protect the last WAL
|
|
2469
|
+
// recovered.
|
|
2099
2470
|
s = impl->CreateWAL(write_options, new_log_number, 0 /*recycle_log_number*/,
|
|
2100
|
-
preallocate_block_size,
|
|
2471
|
+
preallocate_block_size,
|
|
2472
|
+
PredecessorWALInfo() /* predecessor_wal_info */,
|
|
2473
|
+
&new_log);
|
|
2101
2474
|
if (s.ok()) {
|
|
2102
2475
|
// Prevent log files created by previous instance from being recycled.
|
|
2103
2476
|
// They might be in alive_log_file_, and might get recycled otherwise.
|
|
2104
|
-
impl->
|
|
2477
|
+
impl->min_wal_number_to_recycle_ = new_log_number;
|
|
2105
2478
|
}
|
|
2106
2479
|
if (s.ok()) {
|
|
2107
|
-
InstrumentedMutexLock wl(&impl->
|
|
2108
|
-
impl->
|
|
2480
|
+
InstrumentedMutexLock wl(&impl->wal_write_mutex_);
|
|
2481
|
+
impl->cur_wal_number_ = new_log_number;
|
|
2109
2482
|
assert(new_log != nullptr);
|
|
2110
2483
|
assert(impl->logs_.empty());
|
|
2111
2484
|
impl->logs_.emplace_back(new_log_number, new_log);
|
|
2112
2485
|
}
|
|
2113
2486
|
|
|
2114
2487
|
if (s.ok()) {
|
|
2115
|
-
impl->
|
|
2488
|
+
impl->alive_wal_files_.emplace_back(impl->cur_wal_number_);
|
|
2116
2489
|
// In WritePrepared there could be gap in sequence numbers. This breaks
|
|
2117
2490
|
// the trick we use in kPointInTimeRecovery which assumes the first seq in
|
|
2118
2491
|
// the log right after the corrupted log is one larger than the last seq
|
|
@@ -2125,14 +2498,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2125
2498
|
if (recovered_seq != kMaxSequenceNumber) {
|
|
2126
2499
|
WriteBatch empty_batch;
|
|
2127
2500
|
WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
|
|
2128
|
-
uint64_t
|
|
2501
|
+
uint64_t wal_used, log_size;
|
|
2129
2502
|
log::Writer* log_writer = impl->logs_.back().writer;
|
|
2130
|
-
|
|
2503
|
+
WalFileNumberSize& wal_file_number_size = impl->alive_wal_files_.back();
|
|
2131
2504
|
|
|
2132
|
-
assert(log_writer->get_log_number() ==
|
|
2505
|
+
assert(log_writer->get_log_number() == wal_file_number_size.number);
|
|
2133
2506
|
impl->mutex_.AssertHeld();
|
|
2134
|
-
s = impl->WriteToWAL(empty_batch, write_options, log_writer, &
|
|
2135
|
-
&log_size,
|
|
2507
|
+
s = impl->WriteToWAL(empty_batch, write_options, log_writer, &wal_used,
|
|
2508
|
+
&log_size, wal_file_number_size, recovered_seq);
|
|
2136
2509
|
if (s.ok()) {
|
|
2137
2510
|
// Need to fsync, otherwise it might get lost after a power reset.
|
|
2138
2511
|
s = impl->FlushWAL(write_options, false);
|
|
@@ -2165,6 +2538,12 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2165
2538
|
s = impl->InitPersistStatsColumnFamily();
|
|
2166
2539
|
}
|
|
2167
2540
|
|
|
2541
|
+
// After reaching the post-recovery seqno but before creating SuperVersions
|
|
2542
|
+
// ensure seqno to time mapping is pre-populated as needed.
|
|
2543
|
+
if (s.ok() && recovery_ctx.is_new_db_ && preserve_info.IsEnabled()) {
|
|
2544
|
+
impl->PrepopulateSeqnoToTimeMapping(preserve_info);
|
|
2545
|
+
}
|
|
2546
|
+
|
|
2168
2547
|
if (s.ok()) {
|
|
2169
2548
|
// set column family handles
|
|
2170
2549
|
for (const auto& cf : column_families) {
|
|
@@ -2172,8 +2551,11 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2172
2551
|
impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
|
|
2173
2552
|
if (cfd != nullptr) {
|
|
2174
2553
|
handles->push_back(
|
|
2175
|
-
new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
|
|
2554
|
+
new ColumnFamilyHandleImpl(cfd, impl.get(), &impl->mutex_));
|
|
2176
2555
|
impl->NewThreadStatusCfInfo(cfd);
|
|
2556
|
+
SuperVersionContext sv_context(/* create_superversion */ true);
|
|
2557
|
+
impl->InstallSuperVersionForConfigChange(cfd, &sv_context);
|
|
2558
|
+
sv_context.Clean();
|
|
2177
2559
|
} else {
|
|
2178
2560
|
if (db_options.create_missing_column_families) {
|
|
2179
2561
|
// missing column family, create it
|
|
@@ -2181,6 +2563,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2181
2563
|
impl->mutex_.Unlock();
|
|
2182
2564
|
// NOTE: the work normally done in WrapUpCreateColumnFamilies will
|
|
2183
2565
|
// be done separately below.
|
|
2566
|
+
// This includes InstallSuperVersionForConfigChange.
|
|
2184
2567
|
s = impl->CreateColumnFamilyImpl(read_options, write_options,
|
|
2185
2568
|
cf.options, cf.name, &handle);
|
|
2186
2569
|
impl->mutex_.Lock();
|
|
@@ -2197,16 +2580,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2197
2580
|
}
|
|
2198
2581
|
}
|
|
2199
2582
|
|
|
2200
|
-
if (s.ok()) {
|
|
2583
|
+
if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
|
|
2584
|
+
// Install SuperVersion for hidden column family
|
|
2585
|
+
assert(impl->persist_stats_cf_handle_);
|
|
2586
|
+
assert(impl->persist_stats_cf_handle_->cfd());
|
|
2201
2587
|
SuperVersionContext sv_context(/* create_superversion */ true);
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
|
|
2205
|
-
}
|
|
2588
|
+
impl->InstallSuperVersionForConfigChange(
|
|
2589
|
+
impl->persist_stats_cf_handle_->cfd(), &sv_context);
|
|
2206
2590
|
sv_context.Clean();
|
|
2207
|
-
}
|
|
2208
|
-
|
|
2209
|
-
if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
|
|
2210
2591
|
// try to read format version
|
|
2211
2592
|
s = impl->PersistentStatsProcessFormatVersion();
|
|
2212
2593
|
}
|
|
@@ -2216,7 +2597,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2216
2597
|
if (!cfd->mem()->IsSnapshotSupported()) {
|
|
2217
2598
|
impl->is_snapshot_supported_ = false;
|
|
2218
2599
|
}
|
|
2219
|
-
if (cfd->ioptions()
|
|
2600
|
+
if (cfd->ioptions().merge_operator != nullptr &&
|
|
2220
2601
|
!cfd->mem()->IsMergeOperatorSupported()) {
|
|
2221
2602
|
s = Status::InvalidArgument(
|
|
2222
2603
|
"The memtable of column family %s does not support merge operator "
|
|
@@ -2235,7 +2616,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2235
2616
|
// The WriteOptionsFile() will release and lock the mutex internally.
|
|
2236
2617
|
persist_options_status =
|
|
2237
2618
|
impl->WriteOptionsFile(write_options, true /*db_mutex_already_held*/);
|
|
2238
|
-
*dbptr = impl;
|
|
2239
2619
|
impl->opened_successfully_ = true;
|
|
2240
2620
|
} else {
|
|
2241
2621
|
persist_options_status.PermitUncheckedError();
|
|
@@ -2286,7 +2666,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2286
2666
|
|
|
2287
2667
|
if (s.ok()) {
|
|
2288
2668
|
ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
|
|
2289
|
-
impl);
|
|
2669
|
+
impl.get());
|
|
2290
2670
|
LogFlush(impl->immutable_db_options_.info_log);
|
|
2291
2671
|
if (!impl->WALBufferIsEmpty()) {
|
|
2292
2672
|
s = impl->FlushWAL(write_options, false);
|
|
@@ -2316,17 +2696,16 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
2316
2696
|
s = impl->StartPeriodicTaskScheduler();
|
|
2317
2697
|
}
|
|
2318
2698
|
if (s.ok()) {
|
|
2319
|
-
s = impl->RegisterRecordSeqnoTimeWorker(
|
|
2320
|
-
recovery_ctx.is_new_db_);
|
|
2699
|
+
s = impl->RegisterRecordSeqnoTimeWorker();
|
|
2321
2700
|
}
|
|
2322
2701
|
impl->options_mutex_.Unlock();
|
|
2323
|
-
if (
|
|
2702
|
+
if (s.ok()) {
|
|
2703
|
+
*dbptr = std::move(impl);
|
|
2704
|
+
} else {
|
|
2324
2705
|
for (auto* h : *handles) {
|
|
2325
2706
|
delete h;
|
|
2326
2707
|
}
|
|
2327
2708
|
handles->clear();
|
|
2328
|
-
delete impl;
|
|
2329
|
-
*dbptr = nullptr;
|
|
2330
2709
|
}
|
|
2331
2710
|
return s;
|
|
2332
2711
|
}
|