@nxtedition/rocksdb 13.5.7 → 13.5.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +248 -70
- package/binding.gyp +2 -2
- package/deps/rocksdb/rocksdb/BUCK +12 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -0
- package/deps/rocksdb/rocksdb/Makefile +28 -23
- package/deps/rocksdb/rocksdb/cache/cache.cc +0 -1
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +1 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +43 -39
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +0 -1
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +2 -3
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
- package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +1 -3
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +11 -1
- package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +13 -5
- package/deps/rocksdb/rocksdb/crash_test.mk +61 -15
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +136 -45
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +34 -16
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +10 -7
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +1 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +12 -9
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +2 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/builder.cc +22 -8
- package/deps/rocksdb/rocksdb/db/builder.h +5 -4
- package/deps/rocksdb/rocksdb/db/c.cc +556 -15
- package/deps/rocksdb/rocksdb/db/c_test.c +133 -12
- package/deps/rocksdb/rocksdb/db/column_family.cc +114 -50
- package/deps/rocksdb/rocksdb/db/column_family.h +53 -36
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +95 -70
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +71 -51
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -86
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +26 -68
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +0 -122
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +453 -258
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +117 -92
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +38 -38
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +24 -17
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +34 -45
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -31
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +10 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +82 -34
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +267 -179
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +4 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +273 -89
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +300 -14
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +4 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +28 -23
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +69 -51
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +522 -245
- package/deps/rocksdb/rocksdb/db/convenience.cc +15 -4
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +1 -3
- package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +196 -17
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +74 -62
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +48 -0
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +682 -250
- package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +11 -16
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +57 -0
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +540 -490
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +347 -188
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +584 -217
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +13 -9
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +5 -7
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +40 -36
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +751 -372
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +35 -32
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +24 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +125 -63
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +2 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +311 -196
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +15 -5
- package/deps/rocksdb/rocksdb/db/db_iter.cc +42 -29
- package/deps/rocksdb/rocksdb/db/db_iter.h +96 -31
- package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/db_iter_test.cc +168 -228
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +454 -0
- package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +8 -8
- package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +90 -0
- package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +60 -2
- package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +7 -3
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +85 -27
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +3 -1
- package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +114 -2
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +51 -3
- package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_test.cc +325 -18
- package/deps/rocksdb/rocksdb/db/db_test2.cc +644 -20
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
- package/deps/rocksdb/rocksdb/db/db_test_util.h +9 -0
- package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +64 -45
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +203 -14
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +259 -30
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +75 -1
- package/deps/rocksdb/rocksdb/db/dbformat.h +70 -6
- package/deps/rocksdb/rocksdb/db/deletefile_test.cc +0 -190
- package/deps/rocksdb/rocksdb/db/error_handler.cc +22 -7
- package/deps/rocksdb/rocksdb/db/error_handler.h +16 -1
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +41 -26
- package/deps/rocksdb/rocksdb/db/experimental.cc +4 -3
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +464 -78
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +166 -69
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +54 -25
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +1 -3
- package/deps/rocksdb/rocksdb/db/flush_job.cc +98 -81
- package/deps/rocksdb/rocksdb/db/flush_job.h +4 -9
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +80 -84
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +1 -1
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +2 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +12 -19
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +41 -15
- package/deps/rocksdb/rocksdb/db/internal_stats.h +63 -52
- package/deps/rocksdb/rocksdb/db/job_context.h +59 -24
- package/deps/rocksdb/rocksdb/db/listener_test.cc +69 -10
- package/deps/rocksdb/rocksdb/db/log_format.h +11 -2
- package/deps/rocksdb/rocksdb/db/log_reader.cc +147 -34
- package/deps/rocksdb/rocksdb/db/log_reader.h +40 -11
- package/deps/rocksdb/rocksdb/db/log_test.cc +16 -3
- package/deps/rocksdb/rocksdb/db/log_writer.cc +102 -55
- package/deps/rocksdb/rocksdb/db/log_writer.h +21 -2
- package/deps/rocksdb/rocksdb/db/malloc_stats.h +0 -2
- package/deps/rocksdb/rocksdb/db/memtable.cc +16 -47
- package/deps/rocksdb/rocksdb/db/memtable.h +76 -12
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +23 -20
- package/deps/rocksdb/rocksdb/db/memtable_list.h +9 -11
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +18 -37
- package/deps/rocksdb/rocksdb/db/merge_context.h +2 -1
- package/deps/rocksdb/rocksdb/db/merge_test.cc +8 -0
- package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +3 -5
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +15 -7
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +6 -3
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +22 -4
- package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +41 -1
- package/deps/rocksdb/rocksdb/db/prefix_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/repair.cc +29 -34
- package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +14 -15
- package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +1 -3
- package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +47 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +1 -3
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +2 -1
- package/deps/rocksdb/rocksdb/db/version_builder.cc +2 -2
- package/deps/rocksdb/rocksdb/db/version_edit.cc +8 -37
- package/deps/rocksdb/rocksdb/db/version_edit.h +32 -1
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +26 -18
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -5
- package/deps/rocksdb/rocksdb/db/version_set.cc +282 -197
- package/deps/rocksdb/rocksdb/db/version_set.h +54 -57
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +28 -35
- package/deps/rocksdb/rocksdb/db/version_util.h +2 -3
- package/deps/rocksdb/rocksdb/db/wal_manager.cc +3 -2
- package/deps/rocksdb/rocksdb/db/wal_manager.h +0 -1
- package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/wide/wide_columns.cc +1 -0
- package/deps/rocksdb/rocksdb/db/write_batch.cc +22 -8
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +5 -4
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +7 -6
- package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/write_thread.h +3 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +13 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +9 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +39 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +65 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +45 -22
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +7 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +22 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +28 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -38
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +80 -32
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +51 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +23 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +305 -15
- package/deps/rocksdb/rocksdb/env/env.cc +32 -2
- package/deps/rocksdb/rocksdb/env/env_encryption.cc +0 -2
- package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +2 -4
- package/deps/rocksdb/rocksdb/env/env_posix.cc +4 -2
- package/deps/rocksdb/rocksdb/env/env_test.cc +0 -1
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +20 -11
- package/deps/rocksdb/rocksdb/env/fs_readonly.h +0 -2
- package/deps/rocksdb/rocksdb/env/fs_remap.cc +0 -2
- package/deps/rocksdb/rocksdb/env/fs_remap.h +0 -2
- package/deps/rocksdb/rocksdb/env/io_posix.cc +6 -4
- package/deps/rocksdb/rocksdb/env/io_posix.h +3 -2
- package/deps/rocksdb/rocksdb/env/mock_env.cc +0 -1
- package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +2 -2
- package/deps/rocksdb/rocksdb/file/delete_scheduler.h +0 -2
- package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +0 -2
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +30 -21
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +16 -0
- package/deps/rocksdb/rocksdb/file/file_util.cc +32 -14
- package/deps/rocksdb/rocksdb/file/file_util.h +22 -5
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +229 -76
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +21 -12
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +10 -7
- package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +12 -8
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +1 -2
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +0 -2
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +598 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_iterator.h +36 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +70 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +232 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +149 -15
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +17 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +132 -34
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +158 -79
- package/deps/rocksdb/rocksdb/include/rocksdb/db_bench_tool.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +4 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +1 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +275 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +50 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +10 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +5 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +237 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +230 -39
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +15 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_level.h +31 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +41 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +5 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +19 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +124 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +26 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +55 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +3 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_mirror.h +0 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +1 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +96 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index_faiss.h +117 -0
- package/deps/rocksdb/rocksdb/{utilities/secondary_index/faiss_ivf_index.h → include/rocksdb/utilities/secondary_index_simple.h} +11 -14
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +26 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +16 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +0 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +63 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +28 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +3 -3
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +0 -2
- package/deps/rocksdb/rocksdb/logging/event_logger_test.cc +1 -2
- package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +1 -1
- package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +0 -1
- package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +0 -1
- package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +3 -1
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +2 -2
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +2 -4
- package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +69 -8
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +32 -9
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +58 -45
- package/deps/rocksdb/rocksdb/monitoring/histogram.h +1 -1
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -3
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +5 -0
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +1 -1
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +3 -2
- package/deps/rocksdb/rocksdb/options/cf_options.cc +44 -13
- package/deps/rocksdb/rocksdb/options/cf_options.h +21 -7
- package/deps/rocksdb/rocksdb/options/configurable.cc +5 -5
- package/deps/rocksdb/rocksdb/options/configurable_test.h +1 -2
- package/deps/rocksdb/rocksdb/options/customizable.cc +0 -1
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -11
- package/deps/rocksdb/rocksdb/options/db_options.cc +18 -15
- package/deps/rocksdb/rocksdb/options/db_options.h +2 -2
- package/deps/rocksdb/rocksdb/options/options.cc +296 -305
- package/deps/rocksdb/rocksdb/options/options_helper.cc +188 -62
- package/deps/rocksdb/rocksdb/options/options_helper.h +3 -3
- package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -4
- package/deps/rocksdb/rocksdb/options/options_parser.h +0 -1
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +17 -4
- package/deps/rocksdb/rocksdb/options/options_test.cc +101 -76
- package/deps/rocksdb/rocksdb/port/lang.h +2 -1
- package/deps/rocksdb/rocksdb/port/port_posix.cc +2 -1
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +5 -4
- package/deps/rocksdb/rocksdb/port/win/env_win.cc +3 -2
- package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +99 -1
- package/deps/rocksdb/rocksdb/port/win/xpress_win.h +6 -0
- package/deps/rocksdb/rocksdb/src.mk +17 -11
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +0 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1094 -929
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +6 -19
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +76 -22
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +221 -131
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +12 -9
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +23 -24
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -38
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +7 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +5 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +10 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +6 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +35 -43
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +0 -4
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +0 -1
- package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +4 -5
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +4 -4
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +37 -35
- package/deps/rocksdb/rocksdb/table/block_fetcher.h +11 -7
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +4 -3
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +31 -5
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +2 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +0 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +0 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +0 -1
- package/deps/rocksdb/rocksdb/table/external_table.cc +483 -0
- package/deps/rocksdb/rocksdb/table/format.cc +62 -44
- package/deps/rocksdb/rocksdb/table/format.h +35 -12
- package/deps/rocksdb/rocksdb/table/internal_iterator.h +3 -13
- package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +6 -0
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +150 -141
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +5 -0
- package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +8 -0
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +0 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +0 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +0 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +6 -6
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +86 -7
- package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +88 -2
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +0 -1
- package/deps/rocksdb/rocksdb/table/table_builder.h +10 -1
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +3 -2
- package/deps/rocksdb/rocksdb/table/table_test.cc +899 -22
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +3 -4
- package/deps/rocksdb/rocksdb/test_util/testutil.h +132 -1
- package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +0 -1
- package/deps/rocksdb/rocksdb/test_util/transaction_test_util.h +0 -2
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +163 -77
- package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +0 -2
- package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +120 -52
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +1 -0
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +0 -2
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +0 -2
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +2 -1
- package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +94 -0
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +0 -1
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +1 -1
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +2 -1
- package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +3 -5
- package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/util/async_file_reader.h +15 -8
- package/deps/rocksdb/rocksdb/util/auto_skip_compressor.cc +131 -0
- package/deps/rocksdb/rocksdb/util/auto_skip_compressor.h +90 -0
- package/deps/rocksdb/rocksdb/util/autovector.h +1 -1
- package/deps/rocksdb/rocksdb/util/autovector_test.cc +2 -2
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +0 -2
- package/deps/rocksdb/rocksdb/util/compression.cc +936 -4
- package/deps/rocksdb/rocksdb/util/compression.h +348 -232
- package/deps/rocksdb/rocksdb/util/compression_test.cc +229 -0
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +10 -10
- package/deps/rocksdb/rocksdb/util/crc32c_ppc.c +1 -0
- package/deps/rocksdb/rocksdb/util/data_structure.cc +2 -0
- package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +1 -3
- package/deps/rocksdb/rocksdb/util/ppc-opcode.h +5 -5
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +108 -0
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +67 -0
- package/deps/rocksdb/rocksdb/util/slice_test.cc +83 -0
- package/deps/rocksdb/rocksdb/util/string_util.cc +0 -2
- package/deps/rocksdb/rocksdb/util/string_util.h +10 -0
- package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -1
- package/deps/rocksdb/rocksdb/util/udt_util.cc +18 -5
- package/deps/rocksdb/rocksdb/util/udt_util.h +10 -7
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +650 -154
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +438 -144
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +16 -17
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +2 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +7 -8
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +4 -3
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +0 -48
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/debug.cc +7 -14
- package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/env_mirror_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/env_timed.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/env_timed_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +5 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +10 -9
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +1 -0
- package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/lrulist.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +183 -32
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +258 -12
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_helper.h +33 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_iterator.cc +99 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +280 -120
- package/deps/rocksdb/rocksdb/utilities/secondary_index/simple_secondary_index.cc +79 -0
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +52 -16
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h +10 -6
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +55 -0
- package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +37 -12
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +2 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +1 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +36 -10
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +4 -5
- package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +1 -4
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1118 -37
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +4 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +0 -3
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +125 -127
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +45 -23
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +54 -22
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +477 -58
- package/deps/rocksdb/rocksdb.gyp +9 -4
- package/index.js +50 -9
- package/package.json +8 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
|
@@ -74,12 +74,14 @@
|
|
|
74
74
|
#include "options/cf_options.h"
|
|
75
75
|
#include "options/options_helper.h"
|
|
76
76
|
#include "options/options_parser.h"
|
|
77
|
+
#include "util/udt_util.h"
|
|
77
78
|
#ifdef ROCKSDB_JEMALLOC
|
|
78
79
|
#include "port/jemalloc_helper.h"
|
|
79
80
|
#endif
|
|
80
81
|
#include "port/port.h"
|
|
81
82
|
#include "rocksdb/cache.h"
|
|
82
83
|
#include "rocksdb/compaction_filter.h"
|
|
84
|
+
#include "rocksdb/convenience.h"
|
|
83
85
|
#include "rocksdb/db.h"
|
|
84
86
|
#include "rocksdb/env.h"
|
|
85
87
|
#include "rocksdb/merge_operator.h"
|
|
@@ -168,7 +170,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
|
|
168
170
|
bool read_only)
|
|
169
171
|
: dbname_(dbname),
|
|
170
172
|
own_info_log_(options.info_log == nullptr),
|
|
171
|
-
init_logger_creation_s_(),
|
|
172
173
|
initial_db_options_(SanitizeOptions(dbname, options, read_only,
|
|
173
174
|
&init_logger_creation_s_)),
|
|
174
175
|
env_(initial_db_options_.env),
|
|
@@ -184,7 +185,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
|
|
184
185
|
mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
|
|
185
186
|
immutable_db_options_.use_adaptive_mutex),
|
|
186
187
|
#endif // COERCE_CONTEXT_SWITCH
|
|
187
|
-
default_cf_handle_(nullptr),
|
|
188
188
|
error_handler_(this, immutable_db_options_, &mutex_),
|
|
189
189
|
event_logger_(immutable_db_options_.info_log.get()),
|
|
190
190
|
max_total_in_memory_state_(0),
|
|
@@ -193,45 +193,15 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
|
|
193
193
|
file_options_, immutable_db_options_)),
|
|
194
194
|
seq_per_batch_(seq_per_batch),
|
|
195
195
|
batch_per_txn_(batch_per_txn),
|
|
196
|
-
next_job_id_(1),
|
|
197
|
-
shutting_down_(false),
|
|
198
|
-
reject_new_background_jobs_(false),
|
|
199
|
-
db_lock_(nullptr),
|
|
200
|
-
manual_compaction_paused_(false),
|
|
201
196
|
bg_cv_(&mutex_),
|
|
202
|
-
|
|
203
|
-
log_dir_synced_(false),
|
|
204
|
-
log_empty_(true),
|
|
205
|
-
persist_stats_cf_handle_(nullptr),
|
|
206
|
-
log_sync_cv_(&log_write_mutex_),
|
|
207
|
-
total_log_size_(0),
|
|
208
|
-
is_snapshot_supported_(true),
|
|
197
|
+
wal_sync_cv_(&wal_write_mutex_),
|
|
209
198
|
write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
|
|
210
199
|
write_thread_(immutable_db_options_),
|
|
211
200
|
nonmem_write_thread_(immutable_db_options_),
|
|
212
201
|
write_controller_(mutable_db_options_.delayed_write_rate),
|
|
213
|
-
last_batch_group_size_(0),
|
|
214
|
-
unscheduled_flushes_(0),
|
|
215
|
-
unscheduled_compactions_(0),
|
|
216
|
-
bg_bottom_compaction_scheduled_(0),
|
|
217
|
-
bg_compaction_scheduled_(0),
|
|
218
|
-
num_running_compactions_(0),
|
|
219
|
-
bg_flush_scheduled_(0),
|
|
220
|
-
num_running_flushes_(0),
|
|
221
|
-
bg_purge_scheduled_(0),
|
|
222
|
-
disable_delete_obsolete_files_(0),
|
|
223
|
-
pending_purge_obsolete_files_(0),
|
|
224
202
|
delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
|
|
225
|
-
has_unpersisted_data_(false),
|
|
226
|
-
unable_to_release_oldest_log_(false),
|
|
227
|
-
num_running_ingest_file_(0),
|
|
228
203
|
wal_manager_(immutable_db_options_, file_options_, io_tracer_,
|
|
229
204
|
seq_per_batch),
|
|
230
|
-
bg_work_paused_(0),
|
|
231
|
-
bg_compaction_paused_(0),
|
|
232
|
-
refitting_level_(false),
|
|
233
|
-
opened_successfully_(false),
|
|
234
|
-
periodic_task_scheduler_(),
|
|
235
205
|
two_write_queues_(options.two_write_queues),
|
|
236
206
|
manual_wal_flush_(options.manual_wal_flush),
|
|
237
207
|
// last_sequencee_ is always maintained by the main queue that also writes
|
|
@@ -249,14 +219,11 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
|
|
249
219
|
// requires a custom gc for compaction, we use that to set use_custom_gc_
|
|
250
220
|
// as well.
|
|
251
221
|
use_custom_gc_(seq_per_batch),
|
|
252
|
-
shutdown_initiated_(false),
|
|
253
222
|
own_sfm_(options.sst_file_manager == nullptr),
|
|
254
|
-
closed_(false),
|
|
255
223
|
atomic_flush_install_cv_(&mutex_),
|
|
256
224
|
blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
|
|
257
225
|
&error_handler_, &event_logger_,
|
|
258
|
-
immutable_db_options_.listeners, dbname_)
|
|
259
|
-
lock_wal_count_(0) {
|
|
226
|
+
immutable_db_options_.listeners, dbname_) {
|
|
260
227
|
// !batch_per_trx_ implies seq_per_batch_ because it is only unset for
|
|
261
228
|
// WriteUnprepared, which should use seq_per_batch_.
|
|
262
229
|
assert(batch_per_txn_ || seq_per_batch_);
|
|
@@ -284,9 +251,11 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
|
|
284
251
|
periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
|
|
285
252
|
[this]() { this->FlushInfoLog(); });
|
|
286
253
|
periodic_task_functions_.emplace(
|
|
287
|
-
PeriodicTaskType::kRecordSeqnoTime,
|
|
288
|
-
|
|
289
|
-
|
|
254
|
+
PeriodicTaskType::kRecordSeqnoTime,
|
|
255
|
+
[this]() { this->RecordSeqnoToTimeMapping(); });
|
|
256
|
+
periodic_task_functions_.emplace(
|
|
257
|
+
PeriodicTaskType::kTriggerCompaction,
|
|
258
|
+
[this]() { this->TriggerPeriodicCompaction(); });
|
|
290
259
|
|
|
291
260
|
versions_.reset(new VersionSet(
|
|
292
261
|
dbname_, &immutable_db_options_, file_options_, table_cache_.get(),
|
|
@@ -386,9 +355,8 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
|
|
|
386
355
|
static_cast_with_check<ColumnFamilyHandleImpl>(default_cf_handle_);
|
|
387
356
|
assert(cfh);
|
|
388
357
|
ColumnFamilyData* cfd = cfh->cfd();
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
&edit, &mutex_, directories_.GetDbDir());
|
|
358
|
+
s = versions_->LogAndApply(cfd, read_options, write_options, &edit,
|
|
359
|
+
&mutex_, directories_.GetDbDir());
|
|
392
360
|
if (!s.ok()) {
|
|
393
361
|
io_s = versions_->io_status();
|
|
394
362
|
if (!io_s.ok()) {
|
|
@@ -418,26 +386,26 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
|
|
|
418
386
|
}
|
|
419
387
|
}
|
|
420
388
|
|
|
421
|
-
if (s.ok()) {
|
|
422
|
-
// This will notify and unblock threads waiting for error recovery to
|
|
423
|
-
// finish. Those previouly waiting threads can now proceed, which may
|
|
424
|
-
// include closing the db.
|
|
425
|
-
s = error_handler_.ClearBGError();
|
|
426
|
-
} else {
|
|
427
|
-
// NOTE: this is needed to pass ASSERT_STATUS_CHECKED
|
|
428
|
-
// in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
|
|
429
|
-
// See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
|
|
430
|
-
error_handler_.GetRecoveryError().PermitUncheckedError();
|
|
431
|
-
}
|
|
432
|
-
|
|
433
389
|
JobContext job_context(0);
|
|
434
390
|
FindObsoleteFiles(&job_context, true);
|
|
435
391
|
mutex_.Unlock();
|
|
392
|
+
// If DB shutdown initiated here, it will wait for this ongoing recovery.
|
|
436
393
|
job_context.manifest_file_number = 1;
|
|
437
394
|
if (job_context.HaveSomethingToDelete()) {
|
|
438
395
|
PurgeObsoleteFiles(job_context);
|
|
439
396
|
}
|
|
440
397
|
job_context.Clean();
|
|
398
|
+
mutex_.Lock();
|
|
399
|
+
|
|
400
|
+
if (s.ok()) {
|
|
401
|
+
// Will notify and unblock threads waiting for error recovery to finish.
|
|
402
|
+
s = error_handler_.ClearBGError();
|
|
403
|
+
} else {
|
|
404
|
+
// NOTE: this is needed to pass ASSERT_STATUS_CHECKED
|
|
405
|
+
// in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
|
|
406
|
+
// See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
|
|
407
|
+
error_handler_.GetRecoveryError().PermitUncheckedError();
|
|
408
|
+
}
|
|
441
409
|
|
|
442
410
|
if (s.ok()) {
|
|
443
411
|
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
|
|
@@ -446,7 +414,6 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
|
|
|
446
414
|
s.ToString().c_str());
|
|
447
415
|
}
|
|
448
416
|
|
|
449
|
-
mutex_.Lock();
|
|
450
417
|
// Check for shutdown again before scheduling further compactions,
|
|
451
418
|
// since we released and re-acquired the lock above
|
|
452
419
|
if (shutdown_initiated_) {
|
|
@@ -509,6 +476,11 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
|
|
|
509
476
|
s.PermitUncheckedError(); //**TODO: What to do on error?
|
|
510
477
|
}
|
|
511
478
|
|
|
479
|
+
// Cancel awaiting remote compactions
|
|
480
|
+
if (immutable_db_options_.compaction_service) {
|
|
481
|
+
immutable_db_options_.compaction_service->CancelAwaitingJobs();
|
|
482
|
+
}
|
|
483
|
+
|
|
512
484
|
shutting_down_.store(true, std::memory_order_release);
|
|
513
485
|
bg_cv_.SignalAll();
|
|
514
486
|
if (!wait) {
|
|
@@ -540,8 +512,8 @@ Status DBImpl::CloseHelper() {
|
|
|
540
512
|
// continuing with the shutdown
|
|
541
513
|
mutex_.Lock();
|
|
542
514
|
shutdown_initiated_ = true;
|
|
543
|
-
error_handler_.
|
|
544
|
-
while (error_handler_.
|
|
515
|
+
error_handler_.CancelErrorRecoveryForShutDown();
|
|
516
|
+
while (!error_handler_.ReadyForShutdown()) {
|
|
545
517
|
bg_cv_.Wait();
|
|
546
518
|
}
|
|
547
519
|
mutex_.Unlock();
|
|
@@ -633,8 +605,8 @@ Status DBImpl::CloseHelper() {
|
|
|
633
605
|
mutex_.Lock();
|
|
634
606
|
}
|
|
635
607
|
{
|
|
636
|
-
InstrumentedMutexLock lock(&
|
|
637
|
-
for (auto l :
|
|
608
|
+
InstrumentedMutexLock lock(&wal_write_mutex_);
|
|
609
|
+
for (auto l : wals_to_free_) {
|
|
638
610
|
delete l;
|
|
639
611
|
}
|
|
640
612
|
for (auto& log : logs_) {
|
|
@@ -818,7 +790,8 @@ Status DBImpl::StartPeriodicTaskScheduler() {
|
|
|
818
790
|
Status s = periodic_task_scheduler_.Register(
|
|
819
791
|
PeriodicTaskType::kDumpStats,
|
|
820
792
|
periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
|
|
821
|
-
mutable_db_options_.stats_dump_period_sec
|
|
793
|
+
mutable_db_options_.stats_dump_period_sec,
|
|
794
|
+
/*run_immediately=*/true);
|
|
822
795
|
if (!s.ok()) {
|
|
823
796
|
return s;
|
|
824
797
|
}
|
|
@@ -827,7 +800,8 @@ Status DBImpl::StartPeriodicTaskScheduler() {
|
|
|
827
800
|
Status s = periodic_task_scheduler_.Register(
|
|
828
801
|
PeriodicTaskType::kPersistStats,
|
|
829
802
|
periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
|
|
830
|
-
mutable_db_options_.stats_persist_period_sec
|
|
803
|
+
mutable_db_options_.stats_persist_period_sec,
|
|
804
|
+
/*run_immediately=*/true);
|
|
831
805
|
if (!s.ok()) {
|
|
832
806
|
return s;
|
|
833
807
|
}
|
|
@@ -835,64 +809,55 @@ Status DBImpl::StartPeriodicTaskScheduler() {
|
|
|
835
809
|
|
|
836
810
|
Status s = periodic_task_scheduler_.Register(
|
|
837
811
|
PeriodicTaskType::kFlushInfoLog,
|
|
838
|
-
periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog)
|
|
812
|
+
periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog),
|
|
813
|
+
/*run_immediately=*/true);
|
|
814
|
+
|
|
815
|
+
if (s.ok()) {
|
|
816
|
+
s = periodic_task_scheduler_.Register(
|
|
817
|
+
PeriodicTaskType::kTriggerCompaction,
|
|
818
|
+
periodic_task_functions_.at(PeriodicTaskType::kTriggerCompaction),
|
|
819
|
+
/*run_immediately=*/false);
|
|
820
|
+
}
|
|
839
821
|
|
|
840
822
|
return s;
|
|
841
823
|
}
|
|
842
824
|
|
|
843
|
-
Status DBImpl::RegisterRecordSeqnoTimeWorker(
|
|
844
|
-
const WriteOptions& write_options,
|
|
845
|
-
bool is_new_db) {
|
|
825
|
+
Status DBImpl::RegisterRecordSeqnoTimeWorker() {
|
|
846
826
|
options_mutex_.AssertHeld();
|
|
847
827
|
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
828
|
+
// We assume InstallSuperVersionForConfigChange has already ensured suitable
|
|
829
|
+
// mappings are present for each relevant CF. We just need to be sure the DB's
|
|
830
|
+
// seqno_to_time_mapping_ and worker scheduler are appropriate for the
|
|
831
|
+
// combination of CF settings.
|
|
832
|
+
|
|
833
|
+
MinAndMaxPreserveSeconds preserve_info;
|
|
834
|
+
uint64_t seqno_time_cadence;
|
|
851
835
|
{
|
|
852
836
|
InstrumentedMutexLock l(&mutex_);
|
|
853
837
|
|
|
854
838
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
855
|
-
auto& mopts =
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
std::max(mopts.preserve_internal_time_seconds,
|
|
859
|
-
mopts.preclude_last_level_data_seconds);
|
|
860
|
-
if (!cfd->IsDropped() && preserve_seconds > 0) {
|
|
861
|
-
min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds);
|
|
862
|
-
max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds);
|
|
839
|
+
auto& mopts = cfd->GetLatestMutableCFOptions();
|
|
840
|
+
if (!cfd->IsDropped()) {
|
|
841
|
+
preserve_info.Combine(mopts);
|
|
863
842
|
}
|
|
864
843
|
}
|
|
865
|
-
|
|
866
|
-
if (
|
|
867
|
-
//
|
|
844
|
+
seqno_time_cadence = preserve_info.GetRecodingCadence();
|
|
845
|
+
if (seqno_time_cadence == 0) {
|
|
846
|
+
// To return as much as possible to the feature being disabled,
|
|
847
|
+
// clear the existing mapping
|
|
868
848
|
seqno_to_time_mapping_.SetCapacity(0);
|
|
869
849
|
seqno_to_time_mapping_.SetMaxTimeSpan(UINT64_MAX);
|
|
850
|
+
assert(seqno_to_time_mapping_.Empty());
|
|
870
851
|
} else {
|
|
871
852
|
uint64_t cap = std::min(kMaxSeqnoToTimeEntries,
|
|
872
|
-
max_preserve_seconds *
|
|
873
|
-
|
|
853
|
+
preserve_info.max_preserve_seconds *
|
|
854
|
+
kMaxSeqnoTimePairsPerCF /
|
|
855
|
+
preserve_info.min_preserve_seconds);
|
|
874
856
|
seqno_to_time_mapping_.SetCapacity(cap);
|
|
875
|
-
seqno_to_time_mapping_.SetMaxTimeSpan(max_preserve_seconds);
|
|
876
|
-
}
|
|
877
|
-
if (old_mapping_size != seqno_to_time_mapping_.Size()) {
|
|
878
|
-
InstallSeqnoToTimeMappingInSV(&sv_contexts);
|
|
857
|
+
seqno_to_time_mapping_.SetMaxTimeSpan(preserve_info.max_preserve_seconds);
|
|
879
858
|
}
|
|
880
859
|
}
|
|
881
860
|
|
|
882
|
-
// clean up outside db mutex
|
|
883
|
-
for (SuperVersionContext& sv_context : sv_contexts) {
|
|
884
|
-
sv_context.Clean();
|
|
885
|
-
}
|
|
886
|
-
sv_contexts.clear();
|
|
887
|
-
|
|
888
|
-
uint64_t seqno_time_cadence = 0;
|
|
889
|
-
if (min_preserve_seconds != std::numeric_limits<uint64_t>::max()) {
|
|
890
|
-
// round up to 1 when the time_duration is smaller than
|
|
891
|
-
// kMaxSeqnoTimePairsPerCF
|
|
892
|
-
seqno_time_cadence = (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) /
|
|
893
|
-
kMaxSeqnoTimePairsPerCF;
|
|
894
|
-
}
|
|
895
|
-
|
|
896
861
|
TEST_SYNC_POINT_CALLBACK(
|
|
897
862
|
"DBImpl::RegisterRecordSeqnoTimeWorker:BeforePeriodicTaskType", nullptr);
|
|
898
863
|
|
|
@@ -900,68 +865,10 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
|
|
|
900
865
|
if (seqno_time_cadence == 0) {
|
|
901
866
|
s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime);
|
|
902
867
|
} else {
|
|
903
|
-
// Before registering the periodic task, we need to be sure to fulfill two
|
|
904
|
-
// promises:
|
|
905
|
-
// 1) Any DB created with preserve/preclude options set from the beginning
|
|
906
|
-
// will get pre-allocated seqnos with pre-populated time mappings back to
|
|
907
|
-
// the times we are interested in. (This will enable future import of data
|
|
908
|
-
// while preserving rough write time. We can only do this reliably from
|
|
909
|
-
// DB::Open, as otherwise there could be a race between CreateColumnFamily
|
|
910
|
-
// and the first Write to the DB, and seqno-to-time mappings need to be
|
|
911
|
-
// monotonic.
|
|
912
|
-
// 2) In any DB, any data written after setting preserve/preclude options
|
|
913
|
-
// must have a reasonable time estimate (so that we can accurately place
|
|
914
|
-
// the data), which means at least one entry in seqno_to_time_mapping_.
|
|
915
|
-
//
|
|
916
|
-
// FIXME: We don't currently guarantee that if the first column family with
|
|
917
|
-
// that setting is added or configured after initial DB::Open but before
|
|
918
|
-
// the first user Write. Fixing this causes complications with the crash
|
|
919
|
-
// test because if DB starts without preserve/preclude option, does some
|
|
920
|
-
// user writes but all those writes are lost in crash, then re-opens with
|
|
921
|
-
// preserve/preclude option, it sees seqno==1 which looks like one of the
|
|
922
|
-
// user writes was recovered, when actually it was not.
|
|
923
|
-
bool last_seqno_zero = GetLatestSequenceNumber() == 0;
|
|
924
|
-
assert(!is_new_db || last_seqno_zero);
|
|
925
|
-
if (is_new_db && last_seqno_zero) {
|
|
926
|
-
// Pre-allocate seqnos and pre-populate historical mapping
|
|
927
|
-
// We can simply modify these, before writes are allowed
|
|
928
|
-
constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST;
|
|
929
|
-
versions_->SetLastAllocatedSequence(kMax);
|
|
930
|
-
versions_->SetLastPublishedSequence(kMax);
|
|
931
|
-
versions_->SetLastSequence(kMax);
|
|
932
|
-
|
|
933
|
-
// And record in manifest, to avoid going backwards in seqno on re-open
|
|
934
|
-
// (potentially with different options). Concurrency is simple because we
|
|
935
|
-
// are in DB::Open
|
|
936
|
-
{
|
|
937
|
-
InstrumentedMutexLock l(&mutex_);
|
|
938
|
-
VersionEdit edit;
|
|
939
|
-
edit.SetLastSequence(kMax);
|
|
940
|
-
s = versions_->LogAndApplyToDefaultColumnFamily(
|
|
941
|
-
read_options, write_options, &edit, &mutex_,
|
|
942
|
-
directories_.GetDbDir());
|
|
943
|
-
if (!s.ok() && versions_->io_status().IsIOError()) {
|
|
944
|
-
error_handler_.SetBGError(versions_->io_status(),
|
|
945
|
-
BackgroundErrorReason::kManifestWrite);
|
|
946
|
-
}
|
|
947
|
-
}
|
|
948
|
-
|
|
949
|
-
// Pre-populate mappings for reserved sequence numbers.
|
|
950
|
-
RecordSeqnoToTimeMapping(max_preserve_seconds);
|
|
951
|
-
} else {
|
|
952
|
-
if (!last_seqno_zero) {
|
|
953
|
-
// Ensure at least one mapping (or log a warning), and
|
|
954
|
-
// an updated entry whenever relevant SetOptions is called
|
|
955
|
-
RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0);
|
|
956
|
-
} else {
|
|
957
|
-
// FIXME (see limitation described above)
|
|
958
|
-
}
|
|
959
|
-
}
|
|
960
|
-
|
|
961
868
|
s = periodic_task_scheduler_.Register(
|
|
962
869
|
PeriodicTaskType::kRecordSeqnoTime,
|
|
963
870
|
periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
|
|
964
|
-
seqno_time_cadence);
|
|
871
|
+
seqno_time_cadence, /*run_immediately=*/true);
|
|
965
872
|
}
|
|
966
873
|
|
|
967
874
|
return s;
|
|
@@ -1167,7 +1074,7 @@ void DBImpl::DumpStats() {
|
|
|
1167
1074
|
}
|
|
1168
1075
|
|
|
1169
1076
|
auto* table_factory =
|
|
1170
|
-
cfd->GetCurrentMutableCFOptions()
|
|
1077
|
+
cfd->GetCurrentMutableCFOptions().table_factory.get();
|
|
1171
1078
|
assert(table_factory != nullptr);
|
|
1172
1079
|
// FIXME: need to a shared_ptr if/when block_cache is going to be mutable
|
|
1173
1080
|
Cache* cache =
|
|
@@ -1252,11 +1159,11 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
|
|
|
1252
1159
|
|
|
1253
1160
|
void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
|
|
1254
1161
|
mutex_.AssertHeld();
|
|
1255
|
-
if (!job_context->
|
|
1256
|
-
for (auto l : job_context->
|
|
1162
|
+
if (!job_context->wals_to_free.empty()) {
|
|
1163
|
+
for (auto l : job_context->wals_to_free) {
|
|
1257
1164
|
AddToLogsToFreeQueue(l);
|
|
1258
1165
|
}
|
|
1259
|
-
job_context->
|
|
1166
|
+
job_context->wals_to_free.clear();
|
|
1260
1167
|
}
|
|
1261
1168
|
}
|
|
1262
1169
|
|
|
@@ -1286,36 +1193,72 @@ Status DBImpl::SetOptions(
|
|
|
1286
1193
|
}
|
|
1287
1194
|
|
|
1288
1195
|
InstrumentedMutexLock ol(&options_mutex_);
|
|
1289
|
-
MutableCFOptions
|
|
1196
|
+
MutableCFOptions new_options_copy; // For logging outside of DB mutex
|
|
1290
1197
|
Status s;
|
|
1291
1198
|
Status persist_options_status;
|
|
1292
1199
|
SuperVersionContext sv_context(/* create_superversion */ true);
|
|
1293
1200
|
{
|
|
1294
1201
|
auto db_options = GetDBOptions();
|
|
1295
1202
|
InstrumentedMutexLock l(&mutex_);
|
|
1296
|
-
|
|
1203
|
+
// Manifest writers + Version appenders like flush and compaction use
|
|
1204
|
+
// LogAndApply, which releases DB mutex to wait for other manifest writers
|
|
1205
|
+
// and for the manifest write. We need to append a Version for the options
|
|
1206
|
+
// to take full effect (e.g. compaction scores), but we don't want to
|
|
1207
|
+
// interleave with other callers of LogAndApply, which could at least
|
|
1208
|
+
// temporarily roll back option changes. Thus, we use a special call to
|
|
1209
|
+
// LogAndApply that allows us to
|
|
1210
|
+
//
|
|
1211
|
+
// (a) Apply the options update when we know we are the exclusive version
|
|
1212
|
+
// appender + (fake) manifest writer, and
|
|
1213
|
+
//
|
|
1214
|
+
// (b) Append a new Version without manifest write nor DB mutex release
|
|
1215
|
+
//
|
|
1216
|
+
// Thus aren't releasing the DB mutex from LogAndApply calling pre_cb,
|
|
1217
|
+
// through installing the new Version until the end of this block, after
|
|
1218
|
+
// installing the new SuperVersion.
|
|
1219
|
+
auto pre_cb = [&]() -> Status {
|
|
1220
|
+
Status cb_s = cfd->SetOptions(db_options, options_map);
|
|
1221
|
+
if (cb_s.ok()) {
|
|
1222
|
+
new_options_copy = cfd->GetLatestMutableCFOptions();
|
|
1223
|
+
}
|
|
1224
|
+
return cb_s;
|
|
1225
|
+
};
|
|
1226
|
+
VersionEdit dummy_edit;
|
|
1227
|
+
dummy_edit.MarkNoManifestWriteDummy();
|
|
1228
|
+
TEST_SYNC_POINT_CALLBACK("DBImpl::SetOptions:dummy_edit", &dummy_edit);
|
|
1229
|
+
s = versions_->LogAndApply(
|
|
1230
|
+
cfd, read_options, write_options, &dummy_edit, &mutex_,
|
|
1231
|
+
directories_.GetDbDir(), false /*new_descriptor_log=*/,
|
|
1232
|
+
nullptr /*new_opts*/, {} /*manifest_wcb*/, pre_cb);
|
|
1233
|
+
if (!versions_->io_status().ok()) {
|
|
1234
|
+
assert(!s.ok());
|
|
1235
|
+
error_handler_.SetBGError(versions_->io_status(),
|
|
1236
|
+
BackgroundErrorReason::kManifestWrite);
|
|
1237
|
+
}
|
|
1238
|
+
|
|
1297
1239
|
if (s.ok()) {
|
|
1298
|
-
new_options = *cfd->GetLatestMutableCFOptions();
|
|
1299
|
-
// Append new version to recompute compaction score.
|
|
1300
|
-
VersionEdit dummy_edit;
|
|
1301
|
-
s = versions_->LogAndApply(cfd, new_options, read_options, write_options,
|
|
1302
|
-
&dummy_edit, &mutex_, directories_.GetDbDir());
|
|
1303
1240
|
// Trigger possible flush/compactions. This has to be before we persist
|
|
1304
1241
|
// options to file, otherwise there will be a deadlock with writer
|
|
1305
1242
|
// thread.
|
|
1306
|
-
|
|
1307
|
-
|
|
1243
|
+
InstallSuperVersionForConfigChange(cfd, &sv_context);
|
|
1308
1244
|
persist_options_status =
|
|
1309
1245
|
WriteOptionsFile(write_options, true /*db_mutex_already_held*/);
|
|
1310
1246
|
bg_cv_.SignalAll();
|
|
1247
|
+
|
|
1248
|
+
#if __cplusplus >= 202002L
|
|
1249
|
+
assert(new_options_copy == cfd->GetLatestMutableCFOptions());
|
|
1250
|
+
assert(cfd->GetLatestMutableCFOptions() ==
|
|
1251
|
+
cfd->GetCurrentMutableCFOptions());
|
|
1252
|
+
assert(cfd->GetCurrentMutableCFOptions() ==
|
|
1253
|
+
cfd->current()->GetMutableCFOptions());
|
|
1254
|
+
#endif
|
|
1311
1255
|
}
|
|
1312
1256
|
}
|
|
1313
1257
|
sv_context.Clean();
|
|
1314
1258
|
|
|
1315
1259
|
if (s.ok() && (options_map.count("preserve_internal_time_seconds") > 0 ||
|
|
1316
1260
|
options_map.count("preclude_last_level_data_seconds") > 0)) {
|
|
1317
|
-
s = RegisterRecordSeqnoTimeWorker(
|
|
1318
|
-
false /* is_new_db*/);
|
|
1261
|
+
s = RegisterRecordSeqnoTimeWorker();
|
|
1319
1262
|
}
|
|
1320
1263
|
|
|
1321
1264
|
ROCKS_LOG_INFO(
|
|
@@ -1328,7 +1271,7 @@ Status DBImpl::SetOptions(
|
|
|
1328
1271
|
if (s.ok()) {
|
|
1329
1272
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
1330
1273
|
"[%s] SetOptions() succeeded", cfd->GetName().c_str());
|
|
1331
|
-
|
|
1274
|
+
new_options_copy.Dump(immutable_db_options_.info_log.get());
|
|
1332
1275
|
if (!persist_options_status.ok()) {
|
|
1333
1276
|
// NOTE: WriteOptionsFile already logs on failure
|
|
1334
1277
|
s = persist_options_status;
|
|
@@ -1435,7 +1378,7 @@ Status DBImpl::SetDBOptions(
|
|
|
1435
1378
|
s = periodic_task_scheduler_.Register(
|
|
1436
1379
|
PeriodicTaskType::kDumpStats,
|
|
1437
1380
|
periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
|
|
1438
|
-
new_options.stats_dump_period_sec);
|
|
1381
|
+
new_options.stats_dump_period_sec, /*run_immediately=*/true);
|
|
1439
1382
|
}
|
|
1440
1383
|
if (new_options.max_total_wal_size !=
|
|
1441
1384
|
mutable_db_options_.max_total_wal_size) {
|
|
@@ -1450,7 +1393,7 @@ Status DBImpl::SetDBOptions(
|
|
|
1450
1393
|
s = periodic_task_scheduler_.Register(
|
|
1451
1394
|
PeriodicTaskType::kPersistStats,
|
|
1452
1395
|
periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
|
|
1453
|
-
new_options.stats_persist_period_sec);
|
|
1396
|
+
new_options.stats_persist_period_sec, /*run_immediately=*/true);
|
|
1454
1397
|
}
|
|
1455
1398
|
}
|
|
1456
1399
|
mutex_.Lock();
|
|
@@ -1479,7 +1422,7 @@ Status DBImpl::SetDBOptions(
|
|
|
1479
1422
|
WriteThread::Writer w;
|
|
1480
1423
|
write_thread_.EnterUnbatched(&w, &mutex_);
|
|
1481
1424
|
if (wal_other_option_changed ||
|
|
1482
|
-
|
|
1425
|
+
wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize()) {
|
|
1483
1426
|
Status purge_wal_status = SwitchWAL(&write_context);
|
|
1484
1427
|
if (!purge_wal_status.ok()) {
|
|
1485
1428
|
ROCKS_LOG_WARN(immutable_db_options_.info_log,
|
|
@@ -1506,14 +1449,9 @@ Status DBImpl::SetDBOptions(
|
|
|
1506
1449
|
ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
|
|
1507
1450
|
new_options.Dump(immutable_db_options_.info_log.get());
|
|
1508
1451
|
if (!persist_options_status.ok()) {
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
|
|
1512
|
-
persist_options_status.ToString());
|
|
1513
|
-
}
|
|
1514
|
-
ROCKS_LOG_WARN(immutable_db_options_.info_log,
|
|
1515
|
-
"Unable to persist options in SetDBOptions() -- %s",
|
|
1516
|
-
persist_options_status.ToString().c_str());
|
|
1452
|
+
s = Status::IOError(
|
|
1453
|
+
"SetDBOptions() succeeded, but unable to persist options",
|
|
1454
|
+
persist_options_status.ToString());
|
|
1517
1455
|
}
|
|
1518
1456
|
} else {
|
|
1519
1457
|
ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
|
|
@@ -1548,8 +1486,8 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
|
|
|
1548
1486
|
if (manual_wal_flush_) {
|
|
1549
1487
|
IOStatus io_s;
|
|
1550
1488
|
{
|
|
1551
|
-
// We need to lock
|
|
1552
|
-
InstrumentedMutexLock wl(&
|
|
1489
|
+
// We need to lock wal_write_mutex_ since logs_ might change concurrently
|
|
1490
|
+
InstrumentedMutexLock wl(&wal_write_mutex_);
|
|
1553
1491
|
log::Writer* cur_log_writer = logs_.back().writer;
|
|
1554
1492
|
io_s = cur_log_writer->WriteBuffer(write_options);
|
|
1555
1493
|
}
|
|
@@ -1576,7 +1514,7 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
|
|
|
1576
1514
|
}
|
|
1577
1515
|
|
|
1578
1516
|
bool DBImpl::WALBufferIsEmpty() {
|
|
1579
|
-
InstrumentedMutexLock l(&
|
|
1517
|
+
InstrumentedMutexLock l(&wal_write_mutex_);
|
|
1580
1518
|
log::Writer* cur_log_writer = logs_.back().writer;
|
|
1581
1519
|
auto res = cur_log_writer->BufferIsEmpty();
|
|
1582
1520
|
return res;
|
|
@@ -1584,7 +1522,7 @@ bool DBImpl::WALBufferIsEmpty() {
|
|
|
1584
1522
|
|
|
1585
1523
|
Status DBImpl::GetOpenWalSizes(std::map<uint64_t, uint64_t>& number_to_size) {
|
|
1586
1524
|
assert(number_to_size.empty());
|
|
1587
|
-
InstrumentedMutexLock l(&
|
|
1525
|
+
InstrumentedMutexLock l(&wal_write_mutex_);
|
|
1588
1526
|
for (auto& log : logs_) {
|
|
1589
1527
|
auto* open_file = log.writer->file();
|
|
1590
1528
|
if (open_file) {
|
|
@@ -1626,15 +1564,15 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
|
|
|
1626
1564
|
uint64_t up_to_number;
|
|
1627
1565
|
|
|
1628
1566
|
{
|
|
1629
|
-
InstrumentedMutexLock l(&
|
|
1567
|
+
InstrumentedMutexLock l(&wal_write_mutex_);
|
|
1630
1568
|
assert(!logs_.empty());
|
|
1631
1569
|
|
|
1632
|
-
maybe_active_number =
|
|
1570
|
+
maybe_active_number = cur_wal_number_;
|
|
1633
1571
|
up_to_number =
|
|
1634
1572
|
include_current_wal ? maybe_active_number : maybe_active_number - 1;
|
|
1635
1573
|
|
|
1636
1574
|
while (logs_.front().number <= up_to_number && logs_.front().IsSyncing()) {
|
|
1637
|
-
|
|
1575
|
+
wal_sync_cv_.Wait();
|
|
1638
1576
|
}
|
|
1639
1577
|
// First check that logs are safe to sync in background.
|
|
1640
1578
|
if (include_current_wal &&
|
|
@@ -1658,7 +1596,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
|
|
|
1658
1596
|
}
|
|
1659
1597
|
}
|
|
1660
1598
|
|
|
1661
|
-
need_wal_dir_sync = !
|
|
1599
|
+
need_wal_dir_sync = !wal_dir_synced_;
|
|
1662
1600
|
}
|
|
1663
1601
|
|
|
1664
1602
|
if (include_current_wal) {
|
|
@@ -1731,7 +1669,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
|
|
|
1731
1669
|
/*arg=*/nullptr);
|
|
1732
1670
|
}
|
|
1733
1671
|
{
|
|
1734
|
-
InstrumentedMutexLock l(&
|
|
1672
|
+
InstrumentedMutexLock l(&wal_write_mutex_);
|
|
1735
1673
|
for (auto* wal : wals_internally_closed) {
|
|
1736
1674
|
// We can only modify the state of log::Writer under the mutex
|
|
1737
1675
|
bool was_closed = wal->PublishIfClosed();
|
|
@@ -1848,9 +1786,9 @@ Status DBImpl::UnlockWAL() {
|
|
|
1848
1786
|
|
|
1849
1787
|
void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
|
|
1850
1788
|
VersionEdit* synced_wals) {
|
|
1851
|
-
|
|
1852
|
-
if (synced_dir &&
|
|
1853
|
-
|
|
1789
|
+
wal_write_mutex_.AssertHeld();
|
|
1790
|
+
if (synced_dir && cur_wal_number_ == up_to) {
|
|
1791
|
+
wal_dir_synced_ = true;
|
|
1854
1792
|
}
|
|
1855
1793
|
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
|
|
1856
1794
|
auto& wal = *it;
|
|
@@ -1872,7 +1810,7 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
|
|
|
1872
1810
|
(immutable_db_options_.background_close_inactive_wals &&
|
|
1873
1811
|
wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize())) {
|
|
1874
1812
|
// Fully synced
|
|
1875
|
-
|
|
1813
|
+
wals_to_free_.push_back(wal.ReleaseWriter());
|
|
1876
1814
|
it = logs_.erase(it);
|
|
1877
1815
|
} else {
|
|
1878
1816
|
wal.FinishSync();
|
|
@@ -1885,17 +1823,17 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
|
|
|
1885
1823
|
++it;
|
|
1886
1824
|
}
|
|
1887
1825
|
}
|
|
1888
|
-
|
|
1826
|
+
wal_sync_cv_.SignalAll();
|
|
1889
1827
|
}
|
|
1890
1828
|
|
|
1891
1829
|
void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
|
|
1892
|
-
|
|
1830
|
+
wal_write_mutex_.AssertHeld();
|
|
1893
1831
|
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
|
|
1894
1832
|
++it) {
|
|
1895
1833
|
auto& wal = *it;
|
|
1896
1834
|
wal.FinishSync();
|
|
1897
1835
|
}
|
|
1898
|
-
|
|
1836
|
+
wal_sync_cv_.SignalAll();
|
|
1899
1837
|
}
|
|
1900
1838
|
|
|
1901
1839
|
SequenceNumber DBImpl::GetLatestSequenceNumber() const {
|
|
@@ -1931,6 +1869,69 @@ Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
|
|
|
1931
1869
|
return Status::OK();
|
|
1932
1870
|
}
|
|
1933
1871
|
|
|
1872
|
+
Status DBImpl::GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family,
|
|
1873
|
+
std::string* newest_timestamp) {
|
|
1874
|
+
if (newest_timestamp == nullptr) {
|
|
1875
|
+
return Status::InvalidArgument("newest_timestamp is nullptr");
|
|
1876
|
+
}
|
|
1877
|
+
ColumnFamilyData* cfd = nullptr;
|
|
1878
|
+
if (column_family == nullptr) {
|
|
1879
|
+
cfd = default_cf_handle_->cfd();
|
|
1880
|
+
} else {
|
|
1881
|
+
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
1882
|
+
assert(cfh != nullptr);
|
|
1883
|
+
cfd = cfh->cfd();
|
|
1884
|
+
}
|
|
1885
|
+
assert(cfd != nullptr && cfd->user_comparator() != nullptr);
|
|
1886
|
+
if (cfd->user_comparator()->timestamp_size() == 0) {
|
|
1887
|
+
return Status::InvalidArgument(
|
|
1888
|
+
"Timestamp is not enabled in this column family");
|
|
1889
|
+
}
|
|
1890
|
+
if (cfd->ioptions().persist_user_defined_timestamps) {
|
|
1891
|
+
return Status::NotSupported(
|
|
1892
|
+
"GetNewestUserDefinedTimestamp doesn't support the case when user"
|
|
1893
|
+
"defined timestamps are persisted.");
|
|
1894
|
+
}
|
|
1895
|
+
|
|
1896
|
+
Status status;
|
|
1897
|
+
// Acquire SuperVersion
|
|
1898
|
+
SuperVersion* sv = GetAndRefSuperVersion(cfd);
|
|
1899
|
+
{
|
|
1900
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1901
|
+
bool enter_write_thread = sv->mem == cfd->mem();
|
|
1902
|
+
WriteThread::Writer w;
|
|
1903
|
+
// Enter write thread to read the mutable memtable to avoid racing access
|
|
1904
|
+
// with concurrent writes. No need to enter nonmem_write_thread_ since this
|
|
1905
|
+
// call only care about memtable writes, not WAL writes.
|
|
1906
|
+
if (enter_write_thread) {
|
|
1907
|
+
write_thread_.EnterUnbatched(&w, &mutex_);
|
|
1908
|
+
WaitForPendingWrites();
|
|
1909
|
+
}
|
|
1910
|
+
*newest_timestamp = sv->mem->GetNewestUDT().ToString();
|
|
1911
|
+
assert(!newest_timestamp->empty() || sv->mem->IsEmpty());
|
|
1912
|
+
if (enter_write_thread) {
|
|
1913
|
+
write_thread_.ExitUnbatched(&w);
|
|
1914
|
+
}
|
|
1915
|
+
}
|
|
1916
|
+
// Read from immutable memtables if nothing found in mutable memtable.
|
|
1917
|
+
if (newest_timestamp->empty()) {
|
|
1918
|
+
*newest_timestamp = sv->imm->GetNewestUDT().ToString();
|
|
1919
|
+
}
|
|
1920
|
+
// Read from SST files if no result can be found in memtables.
|
|
1921
|
+
if (newest_timestamp->empty() && sv->current->GetSstFilesSize() != 0) {
|
|
1922
|
+
// full_history_ts_low is used to track the exclusive upperbound of
|
|
1923
|
+
// flushed user defined timestamp. So we can use it to deduce the newest
|
|
1924
|
+
// timestamp in the SST files that the column family has seen.
|
|
1925
|
+
Slice full_history_ts_low = sv->full_history_ts_low;
|
|
1926
|
+
if (!full_history_ts_low.empty()) {
|
|
1927
|
+
GetU64CutoffTsFromFullHistoryTsLow(&full_history_ts_low,
|
|
1928
|
+
newest_timestamp);
|
|
1929
|
+
}
|
|
1930
|
+
}
|
|
1931
|
+
ReturnAndCleanupSuperVersion(cfd, sv);
|
|
1932
|
+
return status;
|
|
1933
|
+
}
|
|
1934
|
+
|
|
1934
1935
|
InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
|
|
1935
1936
|
Arena* arena,
|
|
1936
1937
|
SequenceNumber sequence,
|
|
@@ -1964,10 +1965,10 @@ void DBImpl::BackgroundCallPurge() {
|
|
|
1964
1965
|
TEST_SYNC_POINT("DBImpl::BackgroundCallPurge:beforeMutexLock");
|
|
1965
1966
|
mutex_.Lock();
|
|
1966
1967
|
|
|
1967
|
-
while (!
|
|
1968
|
-
assert(!
|
|
1969
|
-
log::Writer* log_writer = *(
|
|
1970
|
-
|
|
1968
|
+
while (!wals_to_free_queue_.empty()) {
|
|
1969
|
+
assert(!wals_to_free_queue_.empty());
|
|
1970
|
+
log::Writer* log_writer = *(wals_to_free_queue_.begin());
|
|
1971
|
+
wals_to_free_queue_.pop_front();
|
|
1971
1972
|
mutex_.Unlock();
|
|
1972
1973
|
delete log_writer;
|
|
1973
1974
|
mutex_.Lock();
|
|
@@ -2110,7 +2111,7 @@ InternalIterator* DBImpl::NewInternalIterator(
|
|
|
2110
2111
|
} else {
|
|
2111
2112
|
mem_tombstone_iter = std::make_unique<TruncatedRangeDelIterator>(
|
|
2112
2113
|
std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
|
|
2113
|
-
&cfd->ioptions()
|
|
2114
|
+
&cfd->ioptions().internal_comparator, nullptr /* smallest */,
|
|
2114
2115
|
nullptr /* largest */);
|
|
2115
2116
|
}
|
|
2116
2117
|
merge_iter_builder.AddPointAndTombstoneIterator(
|
|
@@ -2559,6 +2560,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
|
|
|
2559
2560
|
// Return all merge operands for get_impl_options.key
|
|
2560
2561
|
*get_impl_options.number_of_operands =
|
|
2561
2562
|
static_cast<int>(merge_context.GetNumOperands());
|
|
2563
|
+
// OK status is returned, some merge operand is found.
|
|
2564
|
+
assert(*get_impl_options.number_of_operands > 0);
|
|
2562
2565
|
if (*get_impl_options.number_of_operands >
|
|
2563
2566
|
get_impl_options.get_merge_operands_options
|
|
2564
2567
|
->expected_max_number_of_operands) {
|
|
@@ -2663,7 +2666,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
|
|
|
2663
2666
|
}
|
|
2664
2667
|
};
|
|
2665
2668
|
|
|
2666
|
-
bool
|
|
2669
|
+
bool acquire_mutex = false;
|
|
2667
2670
|
if (cf_list->size() == 1) {
|
|
2668
2671
|
// Fast path for a single column family. We can simply get the thread local
|
|
2669
2672
|
// super version
|
|
@@ -2712,29 +2715,32 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
|
|
|
2712
2715
|
// sure.
|
|
2713
2716
|
constexpr int num_retries = 3;
|
|
2714
2717
|
for (int i = 0; i < num_retries; ++i) {
|
|
2715
|
-
|
|
2718
|
+
// When reading from kPersistedTier, we want a consistent view into CFs.
|
|
2719
|
+
// So we take mutex to prevent any SV change in any CF.
|
|
2720
|
+
acquire_mutex = ((i == num_retries - 1) && !read_options.snapshot) ||
|
|
2721
|
+
read_options.read_tier == kPersistedTier;
|
|
2716
2722
|
bool retry = false;
|
|
2717
2723
|
|
|
2718
2724
|
if (i > 0) {
|
|
2719
2725
|
sv_cleanup_func();
|
|
2720
2726
|
}
|
|
2721
2727
|
if (read_options.snapshot == nullptr) {
|
|
2722
|
-
if (last_try) {
|
|
2723
|
-
TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry");
|
|
2724
|
-
// We're close to max number of retries. For the last retry,
|
|
2725
|
-
// acquire the lock so we're sure to succeed
|
|
2726
|
-
mutex_.Lock();
|
|
2727
|
-
}
|
|
2728
2728
|
*snapshot = GetLastPublishedSequence();
|
|
2729
2729
|
} else {
|
|
2730
2730
|
*snapshot =
|
|
2731
2731
|
static_cast_with_check<const SnapshotImpl>(read_options.snapshot)
|
|
2732
2732
|
->number_;
|
|
2733
2733
|
}
|
|
2734
|
+
if (acquire_mutex) {
|
|
2735
|
+
TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry");
|
|
2736
|
+
// We're close to max number of retries. For the last retry,
|
|
2737
|
+
// acquire the lock so we're sure to succeed
|
|
2738
|
+
mutex_.Lock();
|
|
2739
|
+
}
|
|
2734
2740
|
for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
|
|
2735
2741
|
++cf_iter) {
|
|
2736
2742
|
auto node = iter_deref_func(cf_iter);
|
|
2737
|
-
if (!
|
|
2743
|
+
if (!acquire_mutex) {
|
|
2738
2744
|
if (extra_sv_ref) {
|
|
2739
2745
|
node->super_version = node->cfd->GetReferencedSuperVersion(this);
|
|
2740
2746
|
} else {
|
|
@@ -2758,7 +2764,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
|
|
|
2758
2764
|
}
|
|
2759
2765
|
}
|
|
2760
2766
|
TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::BeforeCheckingSnapshot");
|
|
2761
|
-
if (read_options.snapshot != nullptr ||
|
|
2767
|
+
if (read_options.snapshot != nullptr || acquire_mutex) {
|
|
2762
2768
|
// If user passed a snapshot, then we don't care if a memtable is
|
|
2763
2769
|
// sealed or compaction happens because the snapshot would ensure
|
|
2764
2770
|
// that older key versions are kept around. If this is the last
|
|
@@ -2769,7 +2775,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
|
|
|
2769
2775
|
// memtables, which will include immutable memtables as well, but that
|
|
2770
2776
|
// might be tricky to maintain in case we decide, in future, to do
|
|
2771
2777
|
// memtable compaction.
|
|
2772
|
-
if (!
|
|
2778
|
+
if (!acquire_mutex) {
|
|
2773
2779
|
SequenceNumber seq =
|
|
2774
2780
|
node->super_version->mem->GetEarliestSequenceNumber();
|
|
2775
2781
|
if (seq > *snapshot) {
|
|
@@ -2779,19 +2785,20 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
|
|
|
2779
2785
|
}
|
|
2780
2786
|
}
|
|
2781
2787
|
if (!retry) {
|
|
2782
|
-
if (
|
|
2788
|
+
if (acquire_mutex) {
|
|
2783
2789
|
mutex_.Unlock();
|
|
2784
2790
|
TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::AfterLastTryRefSV");
|
|
2785
2791
|
}
|
|
2786
2792
|
break;
|
|
2787
2793
|
}
|
|
2794
|
+
assert(!acquire_mutex);
|
|
2788
2795
|
}
|
|
2789
2796
|
}
|
|
2790
2797
|
|
|
2791
2798
|
TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum1");
|
|
2792
2799
|
TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum2");
|
|
2793
2800
|
PERF_TIMER_STOP(get_snapshot_time);
|
|
2794
|
-
*sv_from_thread_local = !
|
|
2801
|
+
*sv_from_thread_local = !acquire_mutex;
|
|
2795
2802
|
if (!s.ok()) {
|
|
2796
2803
|
sv_cleanup_func();
|
|
2797
2804
|
}
|
|
@@ -3497,7 +3504,7 @@ void DBImpl::MultiGetEntityWithCallback(
|
|
|
3497
3504
|
}
|
|
3498
3505
|
|
|
3499
3506
|
Status DBImpl::WrapUpCreateColumnFamilies(
|
|
3500
|
-
const
|
|
3507
|
+
const WriteOptions& write_options,
|
|
3501
3508
|
const std::vector<const ColumnFamilyOptions*>& cf_options) {
|
|
3502
3509
|
options_mutex_.AssertHeld();
|
|
3503
3510
|
|
|
@@ -3514,8 +3521,7 @@ Status DBImpl::WrapUpCreateColumnFamilies(
|
|
|
3514
3521
|
// Attempt both follow-up actions even if one fails
|
|
3515
3522
|
Status s = WriteOptionsFile(write_options, false /*db_mutex_already_held*/);
|
|
3516
3523
|
if (register_worker) {
|
|
3517
|
-
s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(
|
|
3518
|
-
/* is_new_db */ false));
|
|
3524
|
+
s.UpdateIfOk(RegisterRecordSeqnoTimeWorker());
|
|
3519
3525
|
}
|
|
3520
3526
|
return s;
|
|
3521
3527
|
}
|
|
@@ -3530,8 +3536,7 @@ Status DBImpl::CreateColumnFamily(const ReadOptions& read_options,
|
|
|
3530
3536
|
Status s = CreateColumnFamilyImpl(read_options, write_options, cf_options,
|
|
3531
3537
|
column_family, handle);
|
|
3532
3538
|
if (s.ok()) {
|
|
3533
|
-
s.UpdateIfOk(
|
|
3534
|
-
WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options}));
|
|
3539
|
+
s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options}));
|
|
3535
3540
|
}
|
|
3536
3541
|
return s;
|
|
3537
3542
|
}
|
|
@@ -3558,8 +3563,7 @@ Status DBImpl::CreateColumnFamilies(
|
|
|
3558
3563
|
success_once = true;
|
|
3559
3564
|
}
|
|
3560
3565
|
if (success_once) {
|
|
3561
|
-
s.UpdateIfOk(
|
|
3562
|
-
WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options}));
|
|
3566
|
+
s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options}));
|
|
3563
3567
|
}
|
|
3564
3568
|
return s;
|
|
3565
3569
|
}
|
|
@@ -3589,8 +3593,7 @@ Status DBImpl::CreateColumnFamilies(
|
|
|
3589
3593
|
cf_opts.push_back(&column_families[i].options);
|
|
3590
3594
|
}
|
|
3591
3595
|
if (success_once) {
|
|
3592
|
-
s.UpdateIfOk(
|
|
3593
|
-
WrapUpCreateColumnFamilies(read_options, write_options, cf_opts));
|
|
3596
|
+
s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, cf_opts));
|
|
3594
3597
|
}
|
|
3595
3598
|
return s;
|
|
3596
3599
|
}
|
|
@@ -3631,7 +3634,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
|
|
|
3631
3634
|
edit.AddColumnFamily(column_family_name);
|
|
3632
3635
|
uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
|
|
3633
3636
|
edit.SetColumnFamily(new_id);
|
|
3634
|
-
edit.SetLogNumber(
|
|
3637
|
+
edit.SetLogNumber(cur_wal_number_);
|
|
3635
3638
|
edit.SetComparatorName(cf_options.comparator->Name());
|
|
3636
3639
|
edit.SetPersistUserDefinedTimestamps(
|
|
3637
3640
|
cf_options.persist_user_defined_timestamps);
|
|
@@ -3643,9 +3646,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
|
|
|
3643
3646
|
write_thread_.EnterUnbatched(&w, &mutex_);
|
|
3644
3647
|
// LogAndApply will both write the creation in MANIFEST and create
|
|
3645
3648
|
// ColumnFamilyData object
|
|
3646
|
-
s = versions_->LogAndApply(nullptr,
|
|
3647
|
-
|
|
3648
|
-
|
|
3649
|
+
s = versions_->LogAndApply(nullptr, read_options, write_options, &edit,
|
|
3650
|
+
&mutex_, directories_.GetDbDir(), false,
|
|
3651
|
+
&cf_options);
|
|
3649
3652
|
write_thread_.ExitUnbatched(&w);
|
|
3650
3653
|
}
|
|
3651
3654
|
if (s.ok()) {
|
|
@@ -3659,8 +3662,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
|
|
|
3659
3662
|
auto* cfd =
|
|
3660
3663
|
versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
|
|
3661
3664
|
assert(cfd != nullptr);
|
|
3662
|
-
|
|
3663
|
-
*cfd->GetLatestMutableCFOptions());
|
|
3665
|
+
InstallSuperVersionForConfigChange(cfd, &sv_context);
|
|
3664
3666
|
|
|
3665
3667
|
if (!cfd->mem()->IsSnapshotSupported()) {
|
|
3666
3668
|
is_snapshot_supported_ = false;
|
|
@@ -3744,7 +3746,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
|
|
|
3744
3746
|
Status s;
|
|
3745
3747
|
// Save re-aquiring lock for RegisterRecordSeqnoTimeWorker when not
|
|
3746
3748
|
// applicable
|
|
3747
|
-
|
|
3749
|
+
MinAndMaxPreserveSeconds preserve_info;
|
|
3748
3750
|
{
|
|
3749
3751
|
InstrumentedMutexLock l(&mutex_);
|
|
3750
3752
|
if (cfd->IsDropped()) {
|
|
@@ -3754,17 +3756,15 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
|
|
|
3754
3756
|
// we drop column family from a single write thread
|
|
3755
3757
|
WriteThread::Writer w;
|
|
3756
3758
|
write_thread_.EnterUnbatched(&w, &mutex_);
|
|
3757
|
-
s = versions_->LogAndApply(cfd,
|
|
3758
|
-
|
|
3759
|
-
directories_.GetDbDir());
|
|
3759
|
+
s = versions_->LogAndApply(cfd, read_options, write_options, &edit,
|
|
3760
|
+
&mutex_, directories_.GetDbDir());
|
|
3760
3761
|
write_thread_.ExitUnbatched(&w);
|
|
3761
3762
|
}
|
|
3762
3763
|
if (s.ok()) {
|
|
3763
|
-
auto& moptions =
|
|
3764
|
+
auto& moptions = cfd->GetLatestMutableCFOptions();
|
|
3764
3765
|
max_total_in_memory_state_ -=
|
|
3765
3766
|
moptions.write_buffer_size * moptions.max_write_buffer_number;
|
|
3766
|
-
|
|
3767
|
-
moptions.preclude_last_level_data_seconds > 0;
|
|
3767
|
+
preserve_info.Combine(moptions);
|
|
3768
3768
|
}
|
|
3769
3769
|
|
|
3770
3770
|
if (!cf_support_snapshot) {
|
|
@@ -3782,9 +3782,8 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
|
|
|
3782
3782
|
bg_cv_.SignalAll();
|
|
3783
3783
|
}
|
|
3784
3784
|
|
|
3785
|
-
if (
|
|
3786
|
-
s = RegisterRecordSeqnoTimeWorker(
|
|
3787
|
-
/* is_new_db */ false);
|
|
3785
|
+
if (preserve_info.IsEnabled()) {
|
|
3786
|
+
s = RegisterRecordSeqnoTimeWorker();
|
|
3788
3787
|
}
|
|
3789
3788
|
|
|
3790
3789
|
if (s.ok()) {
|
|
@@ -3834,6 +3833,16 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options,
|
|
|
3834
3833
|
return s.ok() || s.IsIncomplete();
|
|
3835
3834
|
}
|
|
3836
3835
|
|
|
3836
|
+
std::unique_ptr<MultiScan> DBImpl::NewMultiScan(
|
|
3837
|
+
const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
|
|
3838
|
+
const std::vector<ScanOptions>& scan_opts) {
|
|
3839
|
+
std::unique_ptr<Iterator> iter(NewIterator(_read_options, column_family));
|
|
3840
|
+
iter->Prepare(scan_opts);
|
|
3841
|
+
std::unique_ptr<MultiScan> ms_iter =
|
|
3842
|
+
std::make_unique<MultiScan>(scan_opts, std::move(iter));
|
|
3843
|
+
return ms_iter;
|
|
3844
|
+
}
|
|
3845
|
+
|
|
3837
3846
|
Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
|
|
3838
3847
|
ColumnFamilyHandle* column_family) {
|
|
3839
3848
|
if (_read_options.io_activity != Env::IOActivity::kUnknown &&
|
|
@@ -3890,11 +3899,14 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
|
|
|
3890
3899
|
|
|
3891
3900
|
auto iter = new ForwardIterator(this, read_options, cfd, sv,
|
|
3892
3901
|
/* allow_unprepared_value */ true);
|
|
3893
|
-
|
|
3894
|
-
|
|
3895
|
-
|
|
3896
|
-
|
|
3897
|
-
|
|
3902
|
+
// TODO(cbi): Add support for `memtable_op_scan_flush_trigger` for tailing
|
|
3903
|
+
// iterator. This requires refreshing DBIter's pointer to active_mem when
|
|
3904
|
+
// tailing iterator refreshes to new memtable internally.
|
|
3905
|
+
result = DBIter::NewIter(env_, read_options, cfd->ioptions(),
|
|
3906
|
+
sv->mutable_cf_options, cfd->user_comparator(),
|
|
3907
|
+
iter, sv->current, kMaxSequenceNumber,
|
|
3908
|
+
/*read_callback=*/nullptr, /*active_mem=*/nullptr,
|
|
3909
|
+
cfh, /*expose_blob_index=*/false);
|
|
3898
3910
|
} else {
|
|
3899
3911
|
// Note: no need to consider the special case of
|
|
3900
3912
|
// last_seq_same_as_publish_seq_==false since NewIterator is overridden in
|
|
@@ -3972,18 +3984,9 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(
|
|
|
3972
3984
|
// Laying out the iterators in the order of being accessed makes it more
|
|
3973
3985
|
// likely that any iterator pointer is close to the iterator it points to so
|
|
3974
3986
|
// that they are likely to be in the same cache line and/or page.
|
|
3975
|
-
|
|
3976
|
-
env_, read_options,
|
|
3977
|
-
|
|
3978
|
-
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
|
3979
|
-
sv->version_number, read_callback, cfh, expose_blob_index, allow_refresh);
|
|
3980
|
-
|
|
3981
|
-
InternalIterator* internal_iter = NewInternalIterator(
|
|
3982
|
-
db_iter->GetReadOptions(), cfh->cfd(), sv, db_iter->GetArena(), snapshot,
|
|
3983
|
-
/* allow_unprepared_value */ true, db_iter);
|
|
3984
|
-
db_iter->SetIterUnderDBIter(internal_iter);
|
|
3985
|
-
|
|
3986
|
-
return db_iter;
|
|
3987
|
+
return NewArenaWrappedDbIterator(
|
|
3988
|
+
env_, read_options, cfh, sv, snapshot, read_callback, this,
|
|
3989
|
+
expose_blob_index, allow_refresh, /*allow_mark_memtable_for_flush=*/true);
|
|
3987
3990
|
}
|
|
3988
3991
|
|
|
3989
3992
|
std::unique_ptr<Iterator> DBImpl::NewCoalescingIterator(
|
|
@@ -4107,14 +4110,12 @@ Status DBImpl::NewIterators(
|
|
|
4107
4110
|
auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
|
|
4108
4111
|
cf_sv_pair.super_version,
|
|
4109
4112
|
/* allow_unprepared_value */ true);
|
|
4110
|
-
iterators->push_back(
|
|
4111
|
-
|
|
4112
|
-
|
|
4113
|
-
|
|
4114
|
-
|
|
4115
|
-
|
|
4116
|
-
.max_sequential_skip_in_iterations,
|
|
4117
|
-
nullptr /*read_callback*/, cf_sv_pair.cfh));
|
|
4113
|
+
iterators->push_back(DBIter::NewIter(
|
|
4114
|
+
env_, read_options, cf_sv_pair.cfd->ioptions(),
|
|
4115
|
+
cf_sv_pair.super_version->mutable_cf_options,
|
|
4116
|
+
cf_sv_pair.cfd->user_comparator(), iter,
|
|
4117
|
+
cf_sv_pair.super_version->current, kMaxSequenceNumber,
|
|
4118
|
+
nullptr /*read_callback*/, /*active_mem=*/nullptr, cf_sv_pair.cfh));
|
|
4118
4119
|
}
|
|
4119
4120
|
} else {
|
|
4120
4121
|
for (const auto& cf_sv_pair : cf_sv_pairs) {
|
|
@@ -4346,7 +4347,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
|
|
|
4346
4347
|
CfdList cf_scheduled;
|
|
4347
4348
|
if (oldest_snapshot > bottommost_files_mark_threshold_) {
|
|
4348
4349
|
for (auto* cfd : *versions_->GetColumnFamilySet()) {
|
|
4349
|
-
if (!cfd->ioptions()
|
|
4350
|
+
if (!cfd->ioptions().allow_ingest_behind) {
|
|
4350
4351
|
cfd->current()->storage_info()->UpdateOldestSnapshot(
|
|
4351
4352
|
oldest_snapshot, /*allow_ingest_behind=*/false);
|
|
4352
4353
|
if (!cfd->current()
|
|
@@ -4367,7 +4368,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
|
|
|
4367
4368
|
SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
|
|
4368
4369
|
for (auto* cfd : *versions_->GetColumnFamilySet()) {
|
|
4369
4370
|
if (CfdListContains(cf_scheduled, cfd) ||
|
|
4370
|
-
cfd->ioptions()
|
|
4371
|
+
cfd->ioptions().allow_ingest_behind) {
|
|
4371
4372
|
continue;
|
|
4372
4373
|
}
|
|
4373
4374
|
new_bottommost_files_mark_threshold = std::min(
|
|
@@ -4446,7 +4447,7 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
|
|
|
4446
4447
|
// Add timestamp if needed
|
|
4447
4448
|
for (size_t i = 0; i < n; i++) {
|
|
4448
4449
|
auto [start, limit] = MaybeAddTimestampsToRange(
|
|
4449
|
-
|
|
4450
|
+
range[i].start, range[i].limit, ts_sz, &keys.emplace_back(),
|
|
4450
4451
|
&keys.emplace_back(), /*exclusive_end=*/false);
|
|
4451
4452
|
assert(start.has_value());
|
|
4452
4453
|
assert(limit.has_value());
|
|
@@ -4463,6 +4464,29 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
|
|
|
4463
4464
|
return s;
|
|
4464
4465
|
}
|
|
4465
4466
|
|
|
4467
|
+
Status DBImpl::GetPropertiesOfTablesByLevel(
|
|
4468
|
+
ColumnFamilyHandle* column_family,
|
|
4469
|
+
std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level) {
|
|
4470
|
+
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
4471
|
+
auto cfd = cfh->cfd();
|
|
4472
|
+
|
|
4473
|
+
// Increment the ref count
|
|
4474
|
+
mutex_.Lock();
|
|
4475
|
+
auto version = cfd->current();
|
|
4476
|
+
version->Ref();
|
|
4477
|
+
mutex_.Unlock();
|
|
4478
|
+
|
|
4479
|
+
const ReadOptions read_options;
|
|
4480
|
+
auto s = version->GetPropertiesOfTablesByLevel(read_options, props_by_level);
|
|
4481
|
+
|
|
4482
|
+
// Decrement the ref count
|
|
4483
|
+
mutex_.Lock();
|
|
4484
|
+
version->Unref();
|
|
4485
|
+
mutex_.Unlock();
|
|
4486
|
+
|
|
4487
|
+
return s;
|
|
4488
|
+
}
|
|
4489
|
+
|
|
4466
4490
|
const std::string& DBImpl::GetName() const { return dbname_; }
|
|
4467
4491
|
|
|
4468
4492
|
Env* DBImpl::GetEnv() const { return env_; }
|
|
@@ -4763,7 +4787,7 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
|
|
|
4763
4787
|
// Add timestamp if needed
|
|
4764
4788
|
std::string start_with_ts, limit_with_ts;
|
|
4765
4789
|
auto [start, limit] = MaybeAddTimestampsToRange(
|
|
4766
|
-
|
|
4790
|
+
range.start, range.limit, ts_sz, &start_with_ts, &limit_with_ts);
|
|
4767
4791
|
assert(start.has_value());
|
|
4768
4792
|
assert(limit.has_value());
|
|
4769
4793
|
// Convert user_key into a corresponding internal key.
|
|
@@ -4801,9 +4825,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
|
|
|
4801
4825
|
for (int i = 0; i < n; i++) {
|
|
4802
4826
|
// Add timestamp if needed
|
|
4803
4827
|
std::string start_with_ts, limit_with_ts;
|
|
4804
|
-
auto [start, limit] =
|
|
4805
|
-
|
|
4806
|
-
&start_with_ts, &limit_with_ts);
|
|
4828
|
+
auto [start, limit] = MaybeAddTimestampsToRange(
|
|
4829
|
+
range[i].start, range[i].limit, ts_sz, &start_with_ts, &limit_with_ts);
|
|
4807
4830
|
assert(start.has_value());
|
|
4808
4831
|
assert(limit.has_value());
|
|
4809
4832
|
// Convert user_key into a corresponding internal key.
|
|
@@ -4878,112 +4901,8 @@ Status DBImpl::GetUpdatesSince(
|
|
|
4878
4901
|
return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
|
|
4879
4902
|
}
|
|
4880
4903
|
|
|
4881
|
-
Status DBImpl::DeleteFile(std::string name) {
|
|
4882
|
-
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
4883
|
-
const ReadOptions read_options;
|
|
4884
|
-
const WriteOptions write_options;
|
|
4885
|
-
|
|
4886
|
-
uint64_t number;
|
|
4887
|
-
FileType type;
|
|
4888
|
-
WalFileType log_type;
|
|
4889
|
-
if (!ParseFileName(name, &number, &type, &log_type) ||
|
|
4890
|
-
(type != kTableFile && type != kWalFile)) {
|
|
4891
|
-
ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
|
|
4892
|
-
name.c_str());
|
|
4893
|
-
return Status::InvalidArgument("Invalid file name");
|
|
4894
|
-
}
|
|
4895
|
-
|
|
4896
|
-
if (type == kWalFile) {
|
|
4897
|
-
// Only allow deleting archived log files
|
|
4898
|
-
if (log_type != kArchivedLogFile) {
|
|
4899
|
-
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
|
|
4900
|
-
"DeleteFile %s failed - not archived log.\n",
|
|
4901
|
-
name.c_str());
|
|
4902
|
-
return Status::NotSupported("Delete only supported for archived logs");
|
|
4903
|
-
}
|
|
4904
|
-
Status status = wal_manager_.DeleteFile(name, number);
|
|
4905
|
-
if (!status.ok()) {
|
|
4906
|
-
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
|
|
4907
|
-
"DeleteFile %s failed -- %s.\n", name.c_str(),
|
|
4908
|
-
status.ToString().c_str());
|
|
4909
|
-
}
|
|
4910
|
-
return status;
|
|
4911
|
-
}
|
|
4912
|
-
|
|
4913
|
-
Status status;
|
|
4914
|
-
int level;
|
|
4915
|
-
FileMetaData* metadata;
|
|
4916
|
-
ColumnFamilyData* cfd;
|
|
4917
|
-
VersionEdit edit;
|
|
4918
|
-
JobContext job_context(next_job_id_.fetch_add(1), true);
|
|
4919
|
-
{
|
|
4920
|
-
InstrumentedMutexLock l(&mutex_);
|
|
4921
|
-
status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
|
|
4922
|
-
if (!status.ok()) {
|
|
4923
|
-
ROCKS_LOG_WARN(immutable_db_options_.info_log,
|
|
4924
|
-
"DeleteFile %s failed. File not found\n", name.c_str());
|
|
4925
|
-
job_context.Clean();
|
|
4926
|
-
return Status::InvalidArgument("File not found");
|
|
4927
|
-
}
|
|
4928
|
-
assert(level < cfd->NumberLevels());
|
|
4929
|
-
|
|
4930
|
-
// If the file is being compacted no need to delete.
|
|
4931
|
-
if (metadata->being_compacted) {
|
|
4932
|
-
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
4933
|
-
"DeleteFile %s Skipped. File about to be compacted\n",
|
|
4934
|
-
name.c_str());
|
|
4935
|
-
job_context.Clean();
|
|
4936
|
-
return Status::OK();
|
|
4937
|
-
}
|
|
4938
|
-
|
|
4939
|
-
// Only the files in the last level can be deleted externally.
|
|
4940
|
-
// This is to make sure that any deletion tombstones are not
|
|
4941
|
-
// lost. Check that the level passed is the last level.
|
|
4942
|
-
auto* vstoreage = cfd->current()->storage_info();
|
|
4943
|
-
for (int i = level + 1; i < cfd->NumberLevels(); i++) {
|
|
4944
|
-
if (vstoreage->NumLevelFiles(i) != 0) {
|
|
4945
|
-
ROCKS_LOG_WARN(immutable_db_options_.info_log,
|
|
4946
|
-
"DeleteFile %s FAILED. File not in last level\n",
|
|
4947
|
-
name.c_str());
|
|
4948
|
-
job_context.Clean();
|
|
4949
|
-
return Status::InvalidArgument("File not in last level");
|
|
4950
|
-
}
|
|
4951
|
-
}
|
|
4952
|
-
// if level == 0, it has to be the oldest file
|
|
4953
|
-
if (level == 0 &&
|
|
4954
|
-
vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) {
|
|
4955
|
-
ROCKS_LOG_WARN(immutable_db_options_.info_log,
|
|
4956
|
-
"DeleteFile %s failed ---"
|
|
4957
|
-
" target file in level 0 must be the oldest.",
|
|
4958
|
-
name.c_str());
|
|
4959
|
-
job_context.Clean();
|
|
4960
|
-
return Status::InvalidArgument("File in level 0, but not oldest");
|
|
4961
|
-
}
|
|
4962
|
-
edit.SetColumnFamily(cfd->GetID());
|
|
4963
|
-
edit.DeleteFile(level, number);
|
|
4964
|
-
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
|
|
4965
|
-
read_options, write_options, &edit, &mutex_,
|
|
4966
|
-
directories_.GetDbDir());
|
|
4967
|
-
if (status.ok()) {
|
|
4968
|
-
InstallSuperVersionAndScheduleWork(
|
|
4969
|
-
cfd, job_context.superversion_contexts.data(),
|
|
4970
|
-
*cfd->GetLatestMutableCFOptions());
|
|
4971
|
-
}
|
|
4972
|
-
FindObsoleteFiles(&job_context, false);
|
|
4973
|
-
} // lock released here
|
|
4974
|
-
|
|
4975
|
-
LogFlush(immutable_db_options_.info_log);
|
|
4976
|
-
// remove files outside the db-lock
|
|
4977
|
-
if (job_context.HaveSomethingToDelete()) {
|
|
4978
|
-
// Call PurgeObsoleteFiles() without holding mutex.
|
|
4979
|
-
PurgeObsoleteFiles(job_context);
|
|
4980
|
-
}
|
|
4981
|
-
job_context.Clean();
|
|
4982
|
-
return status;
|
|
4983
|
-
}
|
|
4984
|
-
|
|
4985
4904
|
Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
|
|
4986
|
-
const
|
|
4905
|
+
const RangeOpt* ranges, size_t n,
|
|
4987
4906
|
bool include_end) {
|
|
4988
4907
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
4989
4908
|
const ReadOptions read_options;
|
|
@@ -4995,7 +4914,7 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
|
|
|
4995
4914
|
const Comparator* ucmp = cfd->user_comparator();
|
|
4996
4915
|
assert(ucmp);
|
|
4997
4916
|
const size_t ts_sz = ucmp->timestamp_size();
|
|
4998
|
-
autovector<
|
|
4917
|
+
autovector<UserKeyRangeOpt> ukey_ranges;
|
|
4999
4918
|
std::vector<std::string> keys;
|
|
5000
4919
|
std::vector<Slice> key_slices;
|
|
5001
4920
|
ukey_ranges.reserve(n);
|
|
@@ -5005,8 +4924,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
|
|
|
5005
4924
|
auto [start, limit] = MaybeAddTimestampsToRange(
|
|
5006
4925
|
ranges[i].start, ranges[i].limit, ts_sz, &keys.emplace_back(),
|
|
5007
4926
|
&keys.emplace_back(), !include_end);
|
|
5008
|
-
assert(
|
|
5009
|
-
assert(
|
|
4927
|
+
assert(ranges[i].start.has_value() == start.has_value());
|
|
4928
|
+
assert(ranges[i].limit.has_value() == limit.has_value());
|
|
5010
4929
|
ukey_ranges.emplace_back(start, limit);
|
|
5011
4930
|
}
|
|
5012
4931
|
|
|
@@ -5066,21 +4985,19 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
|
|
|
5066
4985
|
}
|
|
5067
4986
|
}
|
|
5068
4987
|
if (!deleted_files.empty()) {
|
|
5069
|
-
vstorage->ComputeCompactionScore(
|
|
5070
|
-
|
|
4988
|
+
vstorage->ComputeCompactionScore(cfd->ioptions(),
|
|
4989
|
+
cfd->GetLatestMutableCFOptions());
|
|
5071
4990
|
}
|
|
5072
4991
|
if (edit.GetDeletedFiles().empty()) {
|
|
5073
4992
|
job_context.Clean();
|
|
5074
4993
|
return status;
|
|
5075
4994
|
}
|
|
5076
4995
|
input_version->Ref();
|
|
5077
|
-
status = versions_->LogAndApply(cfd,
|
|
5078
|
-
|
|
5079
|
-
directories_.GetDbDir());
|
|
4996
|
+
status = versions_->LogAndApply(cfd, read_options, write_options, &edit,
|
|
4997
|
+
&mutex_, directories_.GetDbDir());
|
|
5080
4998
|
if (status.ok()) {
|
|
5081
4999
|
InstallSuperVersionAndScheduleWork(
|
|
5082
|
-
cfd, job_context.superversion_contexts.data()
|
|
5083
|
-
*cfd->GetLatestMutableCFOptions());
|
|
5000
|
+
cfd, job_context.superversion_contexts.data());
|
|
5084
5001
|
}
|
|
5085
5002
|
for (auto* deleted_file : deleted_files) {
|
|
5086
5003
|
deleted_file->being_compacted = false;
|
|
@@ -5114,7 +5031,6 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
|
|
|
5114
5031
|
assert(column_family);
|
|
5115
5032
|
auto* cfd =
|
|
5116
5033
|
static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
|
|
5117
|
-
auto* sv = GetAndRefSuperVersion(cfd);
|
|
5118
5034
|
{
|
|
5119
5035
|
// Without mutex, Version::GetColumnFamilyMetaData will have data race
|
|
5120
5036
|
// with Compaction::MarkFilesBeingCompacted. One solution is to use mutex,
|
|
@@ -5126,9 +5042,8 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
|
|
|
5126
5042
|
// DB::GetColumnFamilyMetaData is not called frequently, the regression
|
|
5127
5043
|
// should not be big. We still need to keep an eye on it.
|
|
5128
5044
|
InstrumentedMutexLock l(&mutex_);
|
|
5129
|
-
|
|
5045
|
+
cfd->current()->GetColumnFamilyMetaData(cf_meta);
|
|
5130
5046
|
}
|
|
5131
|
-
ReturnAndCleanupSuperVersion(cfd, sv);
|
|
5132
5047
|
}
|
|
5133
5048
|
|
|
5134
5049
|
void DBImpl::GetAllColumnFamilyMetaData(
|
|
@@ -5557,12 +5472,7 @@ Status DBImpl::WriteOptionsFile(const WriteOptions& write_options,
|
|
|
5557
5472
|
if (!s.ok()) {
|
|
5558
5473
|
ROCKS_LOG_WARN(immutable_db_options_.info_log,
|
|
5559
5474
|
"Unnable to persist options -- %s", s.ToString().c_str());
|
|
5560
|
-
|
|
5561
|
-
s = Status::IOError("Unable to persist options.", s.ToString().c_str());
|
|
5562
|
-
} else {
|
|
5563
|
-
// Ignore error
|
|
5564
|
-
s = Status::OK();
|
|
5565
|
-
}
|
|
5475
|
+
s = Status::IOError("Unable to persist options.", s.ToString().c_str());
|
|
5566
5476
|
}
|
|
5567
5477
|
|
|
5568
5478
|
// Restore lock if appropriate
|
|
@@ -5679,7 +5589,7 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name,
|
|
|
5679
5589
|
void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
|
|
5680
5590
|
if (immutable_db_options_.enable_thread_tracking) {
|
|
5681
5591
|
ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
|
|
5682
|
-
cfd->ioptions()
|
|
5592
|
+
cfd->ioptions().env);
|
|
5683
5593
|
}
|
|
5684
5594
|
}
|
|
5685
5595
|
|
|
@@ -5897,6 +5807,7 @@ Status DBImpl::IngestExternalFile(
|
|
|
5897
5807
|
|
|
5898
5808
|
Status DBImpl::IngestExternalFiles(
|
|
5899
5809
|
const std::vector<IngestExternalFileArg>& args) {
|
|
5810
|
+
PERF_TIMER_GUARD(file_ingestion_nanos);
|
|
5900
5811
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
5901
5812
|
const WriteOptions write_options;
|
|
5902
5813
|
|
|
@@ -5943,6 +5854,27 @@ Status DBImpl::IngestExternalFiles(
|
|
|
5943
5854
|
"timestamps enabled doesn't support ingest behind.");
|
|
5944
5855
|
}
|
|
5945
5856
|
}
|
|
5857
|
+
if (arg.atomic_replace_range.has_value()) {
|
|
5858
|
+
if (ingest_opts.ingest_behind) {
|
|
5859
|
+
return Status::InvalidArgument(
|
|
5860
|
+
"Can't combine atomic_replace_range with ingest_behind.");
|
|
5861
|
+
}
|
|
5862
|
+
if (ingest_opts.snapshot_consistency) {
|
|
5863
|
+
// TODO: support generating and ingesting a big tombstone file, which
|
|
5864
|
+
// might depend on non-nullptr start and limit
|
|
5865
|
+
return Status::NotSupported(
|
|
5866
|
+
"atomic_replace_range not yet supported with "
|
|
5867
|
+
"snapshot_consistency.");
|
|
5868
|
+
} else {
|
|
5869
|
+
if (arg.atomic_replace_range->start.has_value() ^
|
|
5870
|
+
arg.atomic_replace_range->limit.has_value()) {
|
|
5871
|
+
return Status::NotSupported(
|
|
5872
|
+
"Only one of atomic_replace_range.{start,limit}.has_value() is "
|
|
5873
|
+
"not supported.");
|
|
5874
|
+
}
|
|
5875
|
+
}
|
|
5876
|
+
}
|
|
5877
|
+
|
|
5946
5878
|
if (ingest_opts.allow_db_generated_files) {
|
|
5947
5879
|
if (ingest_opts.write_global_seqno) {
|
|
5948
5880
|
return Status::NotSupported(
|
|
@@ -5991,8 +5923,8 @@ Status DBImpl::IngestExternalFiles(
|
|
|
5991
5923
|
this);
|
|
5992
5924
|
Status es = ingestion_jobs[i].Prepare(
|
|
5993
5925
|
args[i].external_files, args[i].files_checksums,
|
|
5994
|
-
args[i].files_checksum_func_names, args[i].
|
|
5995
|
-
start_file_number, super_version);
|
|
5926
|
+
args[i].files_checksum_func_names, args[i].atomic_replace_range,
|
|
5927
|
+
args[i].file_temperature, start_file_number, super_version);
|
|
5996
5928
|
// capture first error only
|
|
5997
5929
|
if (!es.ok() && status.ok()) {
|
|
5998
5930
|
status = es;
|
|
@@ -6007,8 +5939,8 @@ Status DBImpl::IngestExternalFiles(
|
|
|
6007
5939
|
this);
|
|
6008
5940
|
Status es = ingestion_jobs[0].Prepare(
|
|
6009
5941
|
args[0].external_files, args[0].files_checksums,
|
|
6010
|
-
args[0].files_checksum_func_names, args[0].
|
|
6011
|
-
next_file_number, super_version);
|
|
5942
|
+
args[0].files_checksum_func_names, args[0].atomic_replace_range,
|
|
5943
|
+
args[0].file_temperature, next_file_number, super_version);
|
|
6012
5944
|
if (!es.ok()) {
|
|
6013
5945
|
status = es;
|
|
6014
5946
|
}
|
|
@@ -6041,6 +5973,7 @@ Status DBImpl::IngestExternalFiles(
|
|
|
6041
5973
|
if (two_write_queues_) {
|
|
6042
5974
|
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
|
6043
5975
|
}
|
|
5976
|
+
PERF_TIMER_GUARD(file_ingestion_blocking_live_writes_nanos);
|
|
6044
5977
|
|
|
6045
5978
|
// When unordered_write is enabled, the keys are writing to memtable in an
|
|
6046
5979
|
// unordered way. If the ingestion job checks memtable key range before the
|
|
@@ -6051,6 +5984,7 @@ Status DBImpl::IngestExternalFiles(
|
|
|
6051
5984
|
|
|
6052
5985
|
num_running_ingest_file_ += static_cast<int>(num_cfs);
|
|
6053
5986
|
TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
|
|
5987
|
+
TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter:2");
|
|
6054
5988
|
|
|
6055
5989
|
bool at_least_one_cf_need_flush = false;
|
|
6056
5990
|
std::vector<bool> need_flush(num_cfs, false);
|
|
@@ -6121,14 +6055,12 @@ Status DBImpl::IngestExternalFiles(
|
|
|
6121
6055
|
ReadOptions read_options;
|
|
6122
6056
|
read_options.fill_cache = args[0].options.fill_cache;
|
|
6123
6057
|
autovector<ColumnFamilyData*> cfds_to_commit;
|
|
6124
|
-
autovector<const MutableCFOptions*> mutable_cf_options_list;
|
|
6125
6058
|
autovector<autovector<VersionEdit*>> edit_lists;
|
|
6126
6059
|
uint32_t num_entries = 0;
|
|
6127
6060
|
for (size_t i = 0; i != num_cfs; ++i) {
|
|
6128
6061
|
auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
|
|
6129
6062
|
assert(!cfd->IsDropped());
|
|
6130
6063
|
cfds_to_commit.push_back(cfd);
|
|
6131
|
-
mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
|
|
6132
6064
|
autovector<VersionEdit*> edit_list;
|
|
6133
6065
|
edit_list.push_back(ingestion_jobs[i].edit());
|
|
6134
6066
|
edit_lists.push_back(edit_list);
|
|
@@ -6143,10 +6075,10 @@ Status DBImpl::IngestExternalFiles(
|
|
|
6143
6075
|
}
|
|
6144
6076
|
assert(0 == num_entries);
|
|
6145
6077
|
}
|
|
6146
|
-
status =
|
|
6147
|
-
cfds_to_commit,
|
|
6078
|
+
status =
|
|
6079
|
+
versions_->LogAndApply(cfds_to_commit, read_options, write_options,
|
|
6148
6080
|
|
|
6149
|
-
|
|
6081
|
+
edit_lists, &mutex_, directories_.GetDbDir());
|
|
6150
6082
|
// It is safe to update VersionSet last seqno here after LogAndApply since
|
|
6151
6083
|
// LogAndApply persists last sequence number from VersionEdits,
|
|
6152
6084
|
// which are from file's largest seqno and not from VersionSet.
|
|
@@ -6178,8 +6110,7 @@ Status DBImpl::IngestExternalFiles(
|
|
|
6178
6110
|
for (size_t i = 0; i != num_cfs; ++i) {
|
|
6179
6111
|
auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
|
|
6180
6112
|
assert(!cfd->IsDropped());
|
|
6181
|
-
InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i]
|
|
6182
|
-
*cfd->GetLatestMutableCFOptions());
|
|
6113
|
+
InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i]);
|
|
6183
6114
|
#ifndef NDEBUG
|
|
6184
6115
|
if (0 == i && num_cfs > 1) {
|
|
6185
6116
|
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
|
|
@@ -6203,6 +6134,7 @@ Status DBImpl::IngestExternalFiles(
|
|
|
6203
6134
|
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
|
6204
6135
|
}
|
|
6205
6136
|
write_thread_.ExitUnbatched(&w);
|
|
6137
|
+
PERF_TIMER_STOP(file_ingestion_blocking_live_writes_nanos);
|
|
6206
6138
|
|
|
6207
6139
|
if (status.ok()) {
|
|
6208
6140
|
for (auto& job : ingestion_jobs) {
|
|
@@ -6297,12 +6229,11 @@ Status DBImpl::CreateColumnFamilyWithImport(
|
|
|
6297
6229
|
// and this will overwrite the external file. To protect the external
|
|
6298
6230
|
// file, we have to make sure the file number will never being reused.
|
|
6299
6231
|
next_file_number = versions_->FetchAddFileNumber(total_file_num);
|
|
6300
|
-
auto cf_options = cfd->GetLatestMutableCFOptions();
|
|
6301
6232
|
status =
|
|
6302
|
-
versions_->LogAndApply(cfd,
|
|
6303
|
-
&
|
|
6233
|
+
versions_->LogAndApply(cfd, read_options, write_options, &dummy_edit,
|
|
6234
|
+
&mutex_, directories_.GetDbDir());
|
|
6304
6235
|
if (status.ok()) {
|
|
6305
|
-
|
|
6236
|
+
InstallSuperVersionForConfigChange(cfd, &dummy_sv_ctx);
|
|
6306
6237
|
}
|
|
6307
6238
|
}
|
|
6308
6239
|
}
|
|
@@ -6335,12 +6266,11 @@ Status DBImpl::CreateColumnFamilyWithImport(
|
|
|
6335
6266
|
|
|
6336
6267
|
// Install job edit [Mutex will be unlocked here]
|
|
6337
6268
|
if (status.ok()) {
|
|
6338
|
-
|
|
6339
|
-
|
|
6340
|
-
|
|
6341
|
-
&mutex_, directories_.GetDbDir());
|
|
6269
|
+
status = versions_->LogAndApply(cfd, read_options, write_options,
|
|
6270
|
+
import_job.edit(), &mutex_,
|
|
6271
|
+
directories_.GetDbDir());
|
|
6342
6272
|
if (status.ok()) {
|
|
6343
|
-
|
|
6273
|
+
InstallSuperVersionForConfigChange(cfd, &sv_context);
|
|
6344
6274
|
}
|
|
6345
6275
|
}
|
|
6346
6276
|
|
|
@@ -6401,9 +6331,9 @@ Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family,
|
|
|
6401
6331
|
|
|
6402
6332
|
if (status.ok()) {
|
|
6403
6333
|
// DeleteFilesInRanges non-overlap files except L0
|
|
6404
|
-
std::vector<
|
|
6405
|
-
ranges.emplace_back(
|
|
6406
|
-
ranges.emplace_back(
|
|
6334
|
+
std::vector<RangeOpt> ranges;
|
|
6335
|
+
ranges.emplace_back(OptSlice{}, begin_key);
|
|
6336
|
+
ranges.emplace_back(end_key, OptSlice{});
|
|
6407
6337
|
status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size());
|
|
6408
6338
|
}
|
|
6409
6339
|
|
|
@@ -6541,7 +6471,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
|
|
|
6541
6471
|
const auto& fd = fd_with_krange.fd;
|
|
6542
6472
|
const FileMetaData* fmeta = fd_with_krange.file_metadata;
|
|
6543
6473
|
assert(fmeta);
|
|
6544
|
-
std::string fname = TableFileName(cfd->ioptions()
|
|
6474
|
+
std::string fname = TableFileName(cfd->ioptions().cf_paths,
|
|
6545
6475
|
fd.GetNumber(), fd.GetPathId());
|
|
6546
6476
|
if (use_file_checksum) {
|
|
6547
6477
|
s = VerifyFullFileChecksum(fmeta->file_checksum,
|
|
@@ -6565,7 +6495,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
|
|
|
6565
6495
|
const uint64_t blob_file_number = meta->GetBlobFileNumber();
|
|
6566
6496
|
|
|
6567
6497
|
const std::string blob_file_name = BlobFileName(
|
|
6568
|
-
cfd->ioptions()
|
|
6498
|
+
cfd->ioptions().cf_paths.front().path, blob_file_number);
|
|
6569
6499
|
s = VerifyFullFileChecksum(meta->GetChecksumValue(),
|
|
6570
6500
|
meta->GetChecksumMethod(), blob_file_name,
|
|
6571
6501
|
read_options);
|
|
@@ -6758,16 +6688,15 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion(
|
|
|
6758
6688
|
pending_output_elem.reset(new std::list<uint64_t>::iterator(
|
|
6759
6689
|
CaptureCurrentFileNumberInPendingOutputs()));
|
|
6760
6690
|
*next_file_number = versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
|
|
6761
|
-
auto cf_options = cfd->GetLatestMutableCFOptions();
|
|
6762
6691
|
VersionEdit dummy_edit;
|
|
6763
6692
|
// If crash happen after a hard link established, Recover function may
|
|
6764
6693
|
// reuse the file number that has already assigned to the internal file,
|
|
6765
6694
|
// and this will overwrite the external file. To protect the external
|
|
6766
6695
|
// file, we have to make sure the file number will never being reused.
|
|
6767
|
-
s = versions_->LogAndApply(cfd,
|
|
6768
|
-
&
|
|
6696
|
+
s = versions_->LogAndApply(cfd, read_options, write_options, &dummy_edit,
|
|
6697
|
+
&mutex_, directories_.GetDbDir());
|
|
6769
6698
|
if (s.ok()) {
|
|
6770
|
-
InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx
|
|
6699
|
+
InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx);
|
|
6771
6700
|
}
|
|
6772
6701
|
dummy_sv_ctx.Clean();
|
|
6773
6702
|
return s;
|
|
@@ -6801,60 +6730,199 @@ Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
|
|
|
6801
6730
|
}
|
|
6802
6731
|
}
|
|
6803
6732
|
|
|
6804
|
-
|
|
6733
|
+
std::pair<SequenceNumber, uint64_t> DBImpl::GetSeqnoToTimeSample() const {
|
|
6805
6734
|
// TECHNICALITY: Sample last sequence number *before* time, as prescribed
|
|
6806
6735
|
// for SeqnoToTimeMapping. We don't know how long it has been since the last
|
|
6807
6736
|
// sequence number was written, so we at least have a one-sided bound by
|
|
6808
6737
|
// sampling in this order.
|
|
6738
|
+
// ALSO, to avoid out-of-order mappings, we need to get the seqno and times
|
|
6739
|
+
// while holding the DB mutex. (This is really to make testing happy because
|
|
6740
|
+
// it's fine to throw out extra close-but-not-quite-consistent mappings in
|
|
6741
|
+
// production.)
|
|
6742
|
+
mutex_.AssertHeld();
|
|
6809
6743
|
SequenceNumber seqno = GetLatestSequenceNumber();
|
|
6744
|
+
// HACK/TODO: seqno might be zero but we can't record a mapping for that.
|
|
6745
|
+
// Start with 1, which should be close enough.
|
|
6746
|
+
seqno = std::max(seqno, SequenceNumber{1});
|
|
6810
6747
|
int64_t unix_time_signed = 0;
|
|
6811
6748
|
immutable_db_options_.clock->GetCurrentTime(&unix_time_signed)
|
|
6812
6749
|
.PermitUncheckedError(); // Ignore error
|
|
6813
|
-
|
|
6750
|
+
return {seqno, static_cast<uint64_t>(unix_time_signed)};
|
|
6751
|
+
}
|
|
6814
6752
|
|
|
6815
|
-
|
|
6816
|
-
|
|
6817
|
-
|
|
6818
|
-
|
|
6819
|
-
|
|
6820
|
-
|
|
6821
|
-
|
|
6822
|
-
|
|
6823
|
-
|
|
6824
|
-
|
|
6825
|
-
|
|
6826
|
-
|
|
6827
|
-
|
|
6828
|
-
|
|
6829
|
-
|
|
6830
|
-
|
|
6831
|
-
|
|
6832
|
-
|
|
6833
|
-
|
|
6834
|
-
|
|
6835
|
-
ROCKS_LOG_INFO(
|
|
6836
|
-
immutable_db_options_.info_log,
|
|
6837
|
-
"Pre-populated sequence number to time entries: [1,%" PRIu64
|
|
6838
|
-
"] -> [%" PRIu64 ",%" PRIu64 "]",
|
|
6839
|
-
seqno, unix_time - populate_historical_seconds, unix_time);
|
|
6840
|
-
} else {
|
|
6841
|
-
ROCKS_LOG_WARN(
|
|
6842
|
-
immutable_db_options_.info_log,
|
|
6843
|
-
"Failed to pre-populate sequence number to time entries: [1,%" PRIu64
|
|
6844
|
-
"] -> [%" PRIu64 ",%" PRIu64 "]",
|
|
6845
|
-
seqno, unix_time - populate_historical_seconds, unix_time);
|
|
6846
|
-
}
|
|
6753
|
+
void DBImpl::EnsureSeqnoToTimeMapping(
|
|
6754
|
+
const MinAndMaxPreserveSeconds& preserve_info) {
|
|
6755
|
+
mutex_.AssertHeld();
|
|
6756
|
+
assert(preserve_info.IsEnabled());
|
|
6757
|
+
|
|
6758
|
+
// Atomically with CF creation or mutable option change (see
|
|
6759
|
+
// InstallSuperVersionForConfigChange()), we need to be sure any data written
|
|
6760
|
+
// after setting preserve/preclude options must have a reasonable time
|
|
6761
|
+
// estimate (so that we can accurately place the data), which means at least
|
|
6762
|
+
// one entry in seqno_to_time_mapping_. It's not critical that `preserve_info`
|
|
6763
|
+
// take into account all CFs, as that's mostly relevant to how we add
|
|
6764
|
+
// recurring entries and purge old ones.
|
|
6765
|
+
|
|
6766
|
+
auto [seqno, unix_time_now] = GetSeqnoToTimeSample();
|
|
6767
|
+
// Ensure at least one sample that is sufficiently recent
|
|
6768
|
+
uint64_t unix_time_last_sample = 0;
|
|
6769
|
+
if (seqno_to_time_mapping_.Empty()) {
|
|
6770
|
+
// The exact best settings will be found and applied in
|
|
6771
|
+
// RegisterRecordSeqnoTimeWorker()
|
|
6772
|
+
seqno_to_time_mapping_.SetCapacity(kMaxSeqnoToTimeEntries);
|
|
6847
6773
|
} else {
|
|
6848
|
-
|
|
6849
|
-
|
|
6774
|
+
unix_time_last_sample =
|
|
6775
|
+
seqno_to_time_mapping_.GetProximalTimeBeforeSeqno(kMaxSequenceNumber);
|
|
6776
|
+
}
|
|
6777
|
+
uint64_t cadence = preserve_info.GetRecodingCadence();
|
|
6778
|
+
// Extend cadence so as to avoid stepping on toes of recorder job, which
|
|
6779
|
+
// could lag a bit.
|
|
6780
|
+
cadence += 3 + cadence / 100;
|
|
6781
|
+
if (unix_time_now >= cadence &&
|
|
6782
|
+
unix_time_last_sample <= unix_time_now - cadence) {
|
|
6783
|
+
assert(seqno > 0); // See GetSeqnoToTimeSample()
|
|
6850
6784
|
// Always successful assuming seqno never go backwards
|
|
6851
|
-
seqno_to_time_mapping_.Append(seqno,
|
|
6852
|
-
|
|
6785
|
+
seqno_to_time_mapping_.Append(seqno, unix_time_now);
|
|
6786
|
+
}
|
|
6787
|
+
}
|
|
6788
|
+
|
|
6789
|
+
void DBImpl::PrepopulateSeqnoToTimeMapping(
|
|
6790
|
+
const MinAndMaxPreserveSeconds& preserve_info) {
|
|
6791
|
+
// Only for opening a new DB, with preserve/preclude options set
|
|
6792
|
+
if (!preserve_info.IsEnabled()) {
|
|
6793
|
+
assert(false);
|
|
6794
|
+
return;
|
|
6795
|
+
}
|
|
6796
|
+
if (GetLatestSequenceNumber() != 0) {
|
|
6797
|
+
assert(false);
|
|
6798
|
+
return;
|
|
6853
6799
|
}
|
|
6854
6800
|
|
|
6855
|
-
//
|
|
6856
|
-
|
|
6857
|
-
|
|
6801
|
+
// Here we fulfill the following promise:
|
|
6802
|
+
//
|
|
6803
|
+
// Any DB/CF created with preserve/preclude options set from the beginning
|
|
6804
|
+
// will get pre-allocated seqnos with pre-populated time mappings back to
|
|
6805
|
+
// the times we are interested in. (This will enable future import of data
|
|
6806
|
+
// while preserving rough write time. We can only do this reliably from
|
|
6807
|
+
// DB::Open, as otherwise there could be a race between CreateColumnFamily
|
|
6808
|
+
// and the first Write to the DB, and seqno-to-time mappings need to be
|
|
6809
|
+
// monotonic.
|
|
6810
|
+
//
|
|
6811
|
+
// FIXME: We don't currently guarantee that if the first column family with
|
|
6812
|
+
// that setting is added or configured after initial DB::Open but before
|
|
6813
|
+
// the first user Write. Fixing this causes complications with the crash
|
|
6814
|
+
// test because if DB starts without preserve/preclude option, does some
|
|
6815
|
+
// user writes but all those writes are lost in crash, then re-opens with
|
|
6816
|
+
// preserve/preclude option, it sees seqno==1 which looks like one of the
|
|
6817
|
+
// user writes was recovered, when actually it was not.
|
|
6818
|
+
|
|
6819
|
+
// Pre-allocate seqnos and pre-populate historical mapping
|
|
6820
|
+
// We can simply modify these, before writes are allowed
|
|
6821
|
+
constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST;
|
|
6822
|
+
versions_->SetLastAllocatedSequence(kMax);
|
|
6823
|
+
versions_->SetLastPublishedSequence(kMax);
|
|
6824
|
+
versions_->SetLastSequence(kMax);
|
|
6825
|
+
|
|
6826
|
+
// And record in manifest, to avoid going backwards in seqno on re-open
|
|
6827
|
+
// (potentially with different options). Concurrency is simple because we
|
|
6828
|
+
// are in DB::Open
|
|
6829
|
+
const WriteOptions write_options(Env::IOActivity::kDBOpen);
|
|
6830
|
+
const ReadOptions read_options(Env::IOActivity::kDBOpen);
|
|
6831
|
+
VersionEdit edit;
|
|
6832
|
+
edit.SetLastSequence(kMax);
|
|
6833
|
+
Status s = versions_->LogAndApplyToDefaultColumnFamily(
|
|
6834
|
+
read_options, write_options, &edit, &mutex_, directories_.GetDbDir());
|
|
6835
|
+
if (!s.ok() && versions_->io_status().IsIOError()) {
|
|
6836
|
+
error_handler_.SetBGError(versions_->io_status(),
|
|
6837
|
+
BackgroundErrorReason::kManifestWrite);
|
|
6838
|
+
}
|
|
6839
|
+
|
|
6840
|
+
auto [seqno, unix_time_now] = GetSeqnoToTimeSample();
|
|
6841
|
+
uint64_t populate_historical_seconds = preserve_info.max_preserve_seconds;
|
|
6842
|
+
if (seqno > 1 && unix_time_now > populate_historical_seconds) {
|
|
6843
|
+
// seqno=0 is reserved
|
|
6844
|
+
SequenceNumber from_seqno = 1;
|
|
6845
|
+
seqno_to_time_mapping_.PrePopulate(
|
|
6846
|
+
from_seqno, seqno, unix_time_now - populate_historical_seconds,
|
|
6847
|
+
unix_time_now);
|
|
6848
|
+
} else {
|
|
6849
|
+
// One of these will fail
|
|
6850
|
+
assert(seqno > 1);
|
|
6851
|
+
assert(unix_time_now > populate_historical_seconds);
|
|
6852
|
+
}
|
|
6853
|
+
}
|
|
6854
|
+
|
|
6855
|
+
void DBImpl::InstallSuperVersionForConfigChange(
|
|
6856
|
+
ColumnFamilyData* cfd, SuperVersionContext* sv_context) {
|
|
6857
|
+
MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()};
|
|
6858
|
+
std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping;
|
|
6859
|
+
if (preserve_info.IsEnabled()) {
|
|
6860
|
+
// TODO: detect & optimize if mapping hasn't changed from previous
|
|
6861
|
+
// SuperVersion
|
|
6862
|
+
EnsureSeqnoToTimeMapping(preserve_info);
|
|
6863
|
+
new_seqno_to_time_mapping = std::make_shared<SeqnoToTimeMapping>();
|
|
6864
|
+
new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
|
|
6865
|
+
}
|
|
6866
|
+
InstallSuperVersionAndScheduleWork(cfd, sv_context,
|
|
6867
|
+
std::move(new_seqno_to_time_mapping));
|
|
6868
|
+
}
|
|
6869
|
+
|
|
6870
|
+
void DBImpl::RecordSeqnoToTimeMapping() {
|
|
6871
|
+
SuperVersionContext sv_context;
|
|
6872
|
+
{
|
|
6873
|
+
InstrumentedMutexLock l(&mutex_);
|
|
6874
|
+
// Record next sample
|
|
6875
|
+
seqno_to_time_mapping_.Append(GetSeqnoToTimeSample());
|
|
6876
|
+
// Create an immutable snapshot for sharing across CFs
|
|
6877
|
+
std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
|
|
6878
|
+
std::make_shared<SeqnoToTimeMapping>();
|
|
6879
|
+
new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
|
|
6880
|
+
|
|
6881
|
+
// Update in SV of all applicable CFs
|
|
6882
|
+
for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
|
|
6883
|
+
if (cfd->IsDropped()) {
|
|
6884
|
+
continue;
|
|
6885
|
+
}
|
|
6886
|
+
MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()};
|
|
6887
|
+
if (preserve_info.IsEnabled()) {
|
|
6888
|
+
sv_context.NewSuperVersion();
|
|
6889
|
+
cfd->InstallSuperVersion(&sv_context, &mutex_,
|
|
6890
|
+
new_seqno_to_time_mapping);
|
|
6891
|
+
}
|
|
6892
|
+
}
|
|
6893
|
+
bg_cv_.SignalAll();
|
|
6894
|
+
}
|
|
6895
|
+
|
|
6896
|
+
// clean up & report outside db mutex
|
|
6897
|
+
sv_context.Clean();
|
|
6898
|
+
}
|
|
6899
|
+
|
|
6900
|
+
void DBImpl::TriggerPeriodicCompaction() {
|
|
6901
|
+
TEST_SYNC_POINT("DBImpl::TriggerPeriodicCompaction:StartRunning");
|
|
6902
|
+
{
|
|
6903
|
+
InstrumentedMutexLock l(&mutex_);
|
|
6904
|
+
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
6905
|
+
"Running the periodic task to trigger compactions.");
|
|
6906
|
+
|
|
6907
|
+
for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
|
|
6908
|
+
if (cfd->IsDropped()) {
|
|
6909
|
+
continue;
|
|
6910
|
+
}
|
|
6911
|
+
if (cfd->GetLatestCFOptions().periodic_compaction_seconds &&
|
|
6912
|
+
!cfd->queued_for_compaction()) {
|
|
6913
|
+
cfd->current()->storage_info()->ComputeCompactionScore(
|
|
6914
|
+
cfd->ioptions(), cfd->GetLatestMutableCFOptions());
|
|
6915
|
+
EnqueuePendingCompaction(cfd);
|
|
6916
|
+
if (cfd->queued_for_compaction()) {
|
|
6917
|
+
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
6918
|
+
"Periodic task to trigger compaction queued Column "
|
|
6919
|
+
"family [%s] for compaction.",
|
|
6920
|
+
cfd->GetName().c_str());
|
|
6921
|
+
}
|
|
6922
|
+
}
|
|
6923
|
+
}
|
|
6924
|
+
MaybeScheduleFlushOrCompaction();
|
|
6925
|
+
bg_cv_.SignalAll();
|
|
6858
6926
|
}
|
|
6859
6927
|
}
|
|
6860
6928
|
|
|
@@ -6914,22 +6982,4 @@ void DBImpl::TrackOrUntrackFiles(
|
|
|
6914
6982
|
}
|
|
6915
6983
|
}
|
|
6916
6984
|
|
|
6917
|
-
void DBImpl::InstallSeqnoToTimeMappingInSV(
|
|
6918
|
-
std::vector<SuperVersionContext>* sv_contexts) {
|
|
6919
|
-
mutex_.AssertHeld();
|
|
6920
|
-
std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
|
|
6921
|
-
std::make_shared<SeqnoToTimeMapping>();
|
|
6922
|
-
new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
|
|
6923
|
-
for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
|
|
6924
|
-
if (cfd->IsDropped()) {
|
|
6925
|
-
continue;
|
|
6926
|
-
}
|
|
6927
|
-
sv_contexts->emplace_back(/*create_superversion=*/true);
|
|
6928
|
-
sv_contexts->back().new_seqno_to_time_mapping = new_seqno_to_time_mapping;
|
|
6929
|
-
cfd->InstallSuperVersion(&sv_contexts->back(),
|
|
6930
|
-
*(cfd->GetLatestMutableCFOptions()));
|
|
6931
|
-
}
|
|
6932
|
-
bg_cv_.SignalAll();
|
|
6933
|
-
}
|
|
6934
|
-
|
|
6935
6985
|
} // namespace ROCKSDB_NAMESPACE
|