@nxtedition/rocksdb 13.5.7 → 13.5.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +248 -70
- package/binding.gyp +2 -2
- package/deps/rocksdb/rocksdb/BUCK +12 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -0
- package/deps/rocksdb/rocksdb/Makefile +28 -23
- package/deps/rocksdb/rocksdb/cache/cache.cc +0 -1
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +1 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +43 -39
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +0 -1
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +2 -3
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
- package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +1 -3
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +11 -1
- package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +13 -5
- package/deps/rocksdb/rocksdb/crash_test.mk +61 -15
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +136 -45
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +34 -16
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +10 -7
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +1 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +12 -9
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +2 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/builder.cc +22 -8
- package/deps/rocksdb/rocksdb/db/builder.h +5 -4
- package/deps/rocksdb/rocksdb/db/c.cc +556 -15
- package/deps/rocksdb/rocksdb/db/c_test.c +133 -12
- package/deps/rocksdb/rocksdb/db/column_family.cc +114 -50
- package/deps/rocksdb/rocksdb/db/column_family.h +53 -36
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +95 -70
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +71 -51
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -86
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +26 -68
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +0 -122
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +453 -258
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +117 -92
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +38 -38
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +24 -17
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +34 -45
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -31
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +10 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +82 -34
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +267 -179
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +4 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +273 -89
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +300 -14
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +4 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +28 -23
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +69 -51
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +522 -245
- package/deps/rocksdb/rocksdb/db/convenience.cc +15 -4
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +1 -3
- package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +196 -17
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +74 -62
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +48 -0
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +682 -250
- package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +11 -16
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +57 -0
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +540 -490
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +347 -188
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +584 -217
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +13 -9
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +5 -7
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +40 -36
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +751 -372
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +35 -32
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +24 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +125 -63
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +2 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +311 -196
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +15 -5
- package/deps/rocksdb/rocksdb/db/db_iter.cc +42 -29
- package/deps/rocksdb/rocksdb/db/db_iter.h +96 -31
- package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/db_iter_test.cc +168 -228
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +454 -0
- package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +8 -8
- package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +90 -0
- package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +60 -2
- package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +7 -3
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +85 -27
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +3 -1
- package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +114 -2
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +51 -3
- package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_test.cc +325 -18
- package/deps/rocksdb/rocksdb/db/db_test2.cc +644 -20
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
- package/deps/rocksdb/rocksdb/db/db_test_util.h +9 -0
- package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +64 -45
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +203 -14
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +259 -30
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +75 -1
- package/deps/rocksdb/rocksdb/db/dbformat.h +70 -6
- package/deps/rocksdb/rocksdb/db/deletefile_test.cc +0 -190
- package/deps/rocksdb/rocksdb/db/error_handler.cc +22 -7
- package/deps/rocksdb/rocksdb/db/error_handler.h +16 -1
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +41 -26
- package/deps/rocksdb/rocksdb/db/experimental.cc +4 -3
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +464 -78
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +166 -69
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +54 -25
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +1 -3
- package/deps/rocksdb/rocksdb/db/flush_job.cc +98 -81
- package/deps/rocksdb/rocksdb/db/flush_job.h +4 -9
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +80 -84
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +1 -1
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +2 -2
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +12 -19
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +0 -2
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +41 -15
- package/deps/rocksdb/rocksdb/db/internal_stats.h +63 -52
- package/deps/rocksdb/rocksdb/db/job_context.h +59 -24
- package/deps/rocksdb/rocksdb/db/listener_test.cc +69 -10
- package/deps/rocksdb/rocksdb/db/log_format.h +11 -2
- package/deps/rocksdb/rocksdb/db/log_reader.cc +147 -34
- package/deps/rocksdb/rocksdb/db/log_reader.h +40 -11
- package/deps/rocksdb/rocksdb/db/log_test.cc +16 -3
- package/deps/rocksdb/rocksdb/db/log_writer.cc +102 -55
- package/deps/rocksdb/rocksdb/db/log_writer.h +21 -2
- package/deps/rocksdb/rocksdb/db/malloc_stats.h +0 -2
- package/deps/rocksdb/rocksdb/db/memtable.cc +16 -47
- package/deps/rocksdb/rocksdb/db/memtable.h +76 -12
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +23 -20
- package/deps/rocksdb/rocksdb/db/memtable_list.h +9 -11
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +18 -37
- package/deps/rocksdb/rocksdb/db/merge_context.h +2 -1
- package/deps/rocksdb/rocksdb/db/merge_test.cc +8 -0
- package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +3 -5
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +15 -7
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +6 -3
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +22 -4
- package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +41 -1
- package/deps/rocksdb/rocksdb/db/prefix_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/repair.cc +29 -34
- package/deps/rocksdb/rocksdb/db/repair_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +14 -15
- package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +1 -3
- package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +47 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +1 -3
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +2 -1
- package/deps/rocksdb/rocksdb/db/version_builder.cc +2 -2
- package/deps/rocksdb/rocksdb/db/version_edit.cc +8 -37
- package/deps/rocksdb/rocksdb/db/version_edit.h +32 -1
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +26 -18
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -5
- package/deps/rocksdb/rocksdb/db/version_set.cc +282 -197
- package/deps/rocksdb/rocksdb/db/version_set.h +54 -57
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +28 -35
- package/deps/rocksdb/rocksdb/db/version_util.h +2 -3
- package/deps/rocksdb/rocksdb/db/wal_manager.cc +3 -2
- package/deps/rocksdb/rocksdb/db/wal_manager.h +0 -1
- package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/wide/wide_columns.cc +1 -0
- package/deps/rocksdb/rocksdb/db/write_batch.cc +22 -8
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +5 -4
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +7 -6
- package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/write_thread.h +3 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +13 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +9 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +39 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +65 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +45 -22
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +7 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +22 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +28 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -38
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +80 -32
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +51 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +23 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +305 -15
- package/deps/rocksdb/rocksdb/env/env.cc +32 -2
- package/deps/rocksdb/rocksdb/env/env_encryption.cc +0 -2
- package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +2 -4
- package/deps/rocksdb/rocksdb/env/env_posix.cc +4 -2
- package/deps/rocksdb/rocksdb/env/env_test.cc +0 -1
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +20 -11
- package/deps/rocksdb/rocksdb/env/fs_readonly.h +0 -2
- package/deps/rocksdb/rocksdb/env/fs_remap.cc +0 -2
- package/deps/rocksdb/rocksdb/env/fs_remap.h +0 -2
- package/deps/rocksdb/rocksdb/env/io_posix.cc +6 -4
- package/deps/rocksdb/rocksdb/env/io_posix.h +3 -2
- package/deps/rocksdb/rocksdb/env/mock_env.cc +0 -1
- package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +2 -2
- package/deps/rocksdb/rocksdb/file/delete_scheduler.h +0 -2
- package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +0 -2
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +30 -21
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +16 -0
- package/deps/rocksdb/rocksdb/file/file_util.cc +32 -14
- package/deps/rocksdb/rocksdb/file/file_util.h +22 -5
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +229 -76
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +21 -12
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +10 -7
- package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +12 -8
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +1 -2
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +0 -2
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +598 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_iterator.h +36 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +70 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +232 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +149 -15
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +17 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +132 -34
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +158 -79
- package/deps/rocksdb/rocksdb/include/rocksdb/db_bench_tool.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +4 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +1 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +275 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +50 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +10 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +5 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +13 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +237 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +230 -39
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +15 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_level.h +31 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +41 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +5 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +19 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +124 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +26 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +55 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +3 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_mirror.h +0 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +1 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index.h +96 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/secondary_index_faiss.h +117 -0
- package/deps/rocksdb/rocksdb/{utilities/secondary_index/faiss_ivf_index.h → include/rocksdb/utilities/secondary_index_simple.h} +11 -14
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +26 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +16 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +0 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +63 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +0 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +28 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +3 -3
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +0 -2
- package/deps/rocksdb/rocksdb/logging/event_logger_test.cc +1 -2
- package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +1 -1
- package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +0 -1
- package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +0 -1
- package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +3 -1
- package/deps/rocksdb/rocksdb/memtable/skiplist.h +2 -2
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +2 -4
- package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +69 -8
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.cc +32 -9
- package/deps/rocksdb/rocksdb/memtable/wbwi_memtable.h +58 -45
- package/deps/rocksdb/rocksdb/monitoring/histogram.h +1 -1
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -3
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +5 -0
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +1 -1
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +3 -2
- package/deps/rocksdb/rocksdb/options/cf_options.cc +44 -13
- package/deps/rocksdb/rocksdb/options/cf_options.h +21 -7
- package/deps/rocksdb/rocksdb/options/configurable.cc +5 -5
- package/deps/rocksdb/rocksdb/options/configurable_test.h +1 -2
- package/deps/rocksdb/rocksdb/options/customizable.cc +0 -1
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -11
- package/deps/rocksdb/rocksdb/options/db_options.cc +18 -15
- package/deps/rocksdb/rocksdb/options/db_options.h +2 -2
- package/deps/rocksdb/rocksdb/options/options.cc +296 -305
- package/deps/rocksdb/rocksdb/options/options_helper.cc +188 -62
- package/deps/rocksdb/rocksdb/options/options_helper.h +3 -3
- package/deps/rocksdb/rocksdb/options/options_parser.cc +2 -4
- package/deps/rocksdb/rocksdb/options/options_parser.h +0 -1
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +17 -4
- package/deps/rocksdb/rocksdb/options/options_test.cc +101 -76
- package/deps/rocksdb/rocksdb/port/lang.h +2 -1
- package/deps/rocksdb/rocksdb/port/port_posix.cc +2 -1
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +5 -4
- package/deps/rocksdb/rocksdb/port/win/env_win.cc +3 -2
- package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +99 -1
- package/deps/rocksdb/rocksdb/port/win/xpress_win.h +6 -0
- package/deps/rocksdb/rocksdb/src.mk +17 -11
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +0 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1094 -929
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +6 -19
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +76 -22
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +221 -131
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +12 -9
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +23 -24
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -38
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +7 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +5 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +10 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +6 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +35 -43
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +0 -4
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +0 -1
- package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +4 -5
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +4 -4
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +37 -35
- package/deps/rocksdb/rocksdb/table/block_fetcher.h +11 -7
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +4 -3
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +31 -5
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +2 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +0 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +0 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +0 -1
- package/deps/rocksdb/rocksdb/table/external_table.cc +483 -0
- package/deps/rocksdb/rocksdb/table/format.cc +62 -44
- package/deps/rocksdb/rocksdb/table/format.h +35 -12
- package/deps/rocksdb/rocksdb/table/internal_iterator.h +3 -13
- package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +6 -0
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +150 -141
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +5 -0
- package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +8 -0
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +0 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +0 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +0 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +6 -6
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
- package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +86 -7
- package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +88 -2
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +0 -1
- package/deps/rocksdb/rocksdb/table/table_builder.h +10 -1
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +3 -2
- package/deps/rocksdb/rocksdb/table/table_test.cc +899 -22
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +3 -4
- package/deps/rocksdb/rocksdb/test_util/testutil.h +132 -1
- package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +0 -1
- package/deps/rocksdb/rocksdb/test_util/transaction_test_util.h +0 -2
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +163 -77
- package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +0 -2
- package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +120 -52
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +1 -0
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +0 -2
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +0 -2
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +2 -1
- package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +94 -0
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +0 -1
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +1 -1
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +2 -1
- package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +3 -5
- package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/util/async_file_reader.h +15 -8
- package/deps/rocksdb/rocksdb/util/auto_skip_compressor.cc +131 -0
- package/deps/rocksdb/rocksdb/util/auto_skip_compressor.h +90 -0
- package/deps/rocksdb/rocksdb/util/autovector.h +1 -1
- package/deps/rocksdb/rocksdb/util/autovector_test.cc +2 -2
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +0 -2
- package/deps/rocksdb/rocksdb/util/compression.cc +936 -4
- package/deps/rocksdb/rocksdb/util/compression.h +348 -232
- package/deps/rocksdb/rocksdb/util/compression_test.cc +229 -0
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +10 -10
- package/deps/rocksdb/rocksdb/util/crc32c_ppc.c +1 -0
- package/deps/rocksdb/rocksdb/util/data_structure.cc +2 -0
- package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +1 -3
- package/deps/rocksdb/rocksdb/util/ppc-opcode.h +5 -5
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +108 -0
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +67 -0
- package/deps/rocksdb/rocksdb/util/slice_test.cc +83 -0
- package/deps/rocksdb/rocksdb/util/string_util.cc +0 -2
- package/deps/rocksdb/rocksdb/util/string_util.h +10 -0
- package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -1
- package/deps/rocksdb/rocksdb/util/udt_util.cc +18 -5
- package/deps/rocksdb/rocksdb/util/udt_util.h +10 -7
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +650 -154
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +438 -144
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +16 -17
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +2 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +7 -8
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +4 -3
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +0 -48
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/debug.cc +7 -14
- package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/env_mirror_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/env_timed.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/env_timed_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +5 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +10 -9
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +1 -0
- package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/lrulist.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index.cc +183 -32
- package/deps/rocksdb/rocksdb/utilities/secondary_index/faiss_ivf_index_test.cc +258 -12
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_helper.h +33 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_iterator.cc +99 -0
- package/deps/rocksdb/rocksdb/utilities/secondary_index/secondary_index_mixin.h +280 -120
- package/deps/rocksdb/rocksdb/utilities/secondary_index/simple_secondary_index.cc +79 -0
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +52 -16
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h +10 -6
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +55 -0
- package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +37 -12
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +2 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +1 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +36 -10
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +4 -5
- package/deps/rocksdb/rocksdb/utilities/transactions/snapshot_checker.cc +1 -4
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +1118 -37
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +4 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +0 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +0 -3
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +125 -127
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +45 -23
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +54 -22
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +477 -58
- package/deps/rocksdb/rocksdb.gyp +9 -4
- package/index.js +50 -9
- package/package.json +8 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
|
@@ -67,7 +67,7 @@ Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
|
|
|
67
67
|
return s;
|
|
68
68
|
}
|
|
69
69
|
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
70
|
-
if (!cfh->cfd()->ioptions()
|
|
70
|
+
if (!cfh->cfd()->ioptions().merge_operator) {
|
|
71
71
|
return Status::NotSupported("Provide a merge_operator when opening DB");
|
|
72
72
|
} else {
|
|
73
73
|
return DB::Merge(o, column_family, key, val);
|
|
@@ -157,7 +157,7 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
|
|
|
157
157
|
if (s.ok()) {
|
|
158
158
|
s = WriteImpl(write_options, my_batch, /*callback=*/nullptr,
|
|
159
159
|
/*user_write_cb=*/nullptr,
|
|
160
|
-
/*
|
|
160
|
+
/*wal_used=*/nullptr);
|
|
161
161
|
}
|
|
162
162
|
return s;
|
|
163
163
|
}
|
|
@@ -190,11 +190,38 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
|
|
|
190
190
|
return s;
|
|
191
191
|
}
|
|
192
192
|
|
|
193
|
-
Status DBImpl::
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
193
|
+
Status DBImpl::IngestWriteBatchWithIndex(
|
|
194
|
+
const WriteOptions& write_options,
|
|
195
|
+
std::shared_ptr<WriteBatchWithIndex> wbwi) {
|
|
196
|
+
if (!wbwi) {
|
|
197
|
+
return Status::InvalidArgument("Batch is nullptr!");
|
|
198
|
+
}
|
|
199
|
+
if (!write_options.disableWAL) {
|
|
200
|
+
return Status::NotSupported(
|
|
201
|
+
"IngestWriteBatchWithIndex does not support disableWAL=true");
|
|
202
|
+
}
|
|
203
|
+
Status s;
|
|
204
|
+
if (write_options.protection_bytes_per_key > 0) {
|
|
205
|
+
s = WriteBatchInternal::UpdateProtectionInfo(
|
|
206
|
+
wbwi->GetWriteBatch(), write_options.protection_bytes_per_key);
|
|
207
|
+
}
|
|
208
|
+
if (s.ok()) {
|
|
209
|
+
WriteBatch dummy_empty_batch;
|
|
210
|
+
s = WriteImpl(
|
|
211
|
+
write_options, /*updates=*/&dummy_empty_batch, /*callback=*/nullptr,
|
|
212
|
+
/*user_write_cb=*/nullptr, /*log_used=*/nullptr, /*log_ref=*/0,
|
|
213
|
+
/*disable_memtable=*/false, /*seq_used=*/nullptr,
|
|
214
|
+
/*batch_cnt=*/0, /*pre_release_callback=*/nullptr,
|
|
215
|
+
/*post_memtable_callback=*/nullptr, /*wbwi=*/wbwi);
|
|
216
|
+
}
|
|
217
|
+
return s;
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
Status DBImpl::IngestWBWIAsMemtable(
|
|
221
|
+
std::shared_ptr<WriteBatchWithIndex> wbwi,
|
|
222
|
+
const WBWIMemTable::SeqnoRange& assigned_seqno, uint64_t min_prep_log,
|
|
223
|
+
SequenceNumber last_seqno_after_ingest, bool memtable_updated,
|
|
224
|
+
bool ignore_missing_cf) {
|
|
198
225
|
// Keys in new memtable have seqno > last_seqno_after_ingest >= keys in wbwi.
|
|
199
226
|
assert(assigned_seqno.upper_bound <= last_seqno_after_ingest);
|
|
200
227
|
// Keys in the current memtable have seqno <= LastSequence() < keys in wbwi.
|
|
@@ -205,7 +232,7 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
|
|
|
205
232
|
ColumnFamilySet* cf_set = versions_->GetColumnFamilySet();
|
|
206
233
|
|
|
207
234
|
// Create WBWIMemTables
|
|
208
|
-
for (const auto [cf_id, stat] : wbwi->GetCFStats()) {
|
|
235
|
+
for (const auto& [cf_id, stat] : wbwi->GetCFStats()) {
|
|
209
236
|
ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf_id);
|
|
210
237
|
if (!cfd) {
|
|
211
238
|
if (ignore_missing_cf) {
|
|
@@ -232,18 +259,36 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
|
|
|
232
259
|
return s;
|
|
233
260
|
}
|
|
234
261
|
WBWIMemTable* wbwi_memtable =
|
|
235
|
-
new WBWIMemTable(wbwi, cfd->user_comparator(), cf_id, cfd->ioptions(),
|
|
236
|
-
cfd->GetLatestMutableCFOptions(), stat);
|
|
262
|
+
new WBWIMemTable(wbwi, cfd->user_comparator(), cf_id, &cfd->ioptions(),
|
|
263
|
+
&cfd->GetLatestMutableCFOptions(), stat);
|
|
237
264
|
wbwi_memtable->Ref();
|
|
238
265
|
wbwi_memtable->AssignSequenceNumbers(assigned_seqno);
|
|
239
266
|
// This is needed to keep the WAL that contains Prepare alive until
|
|
240
267
|
// committed data in this memtable is persisted.
|
|
241
|
-
wbwi_memtable->SetMinPrepLog(
|
|
268
|
+
wbwi_memtable->SetMinPrepLog(min_prep_log);
|
|
242
269
|
memtables.push_back(wbwi_memtable);
|
|
243
270
|
cfd->Ref();
|
|
244
271
|
cfds.push_back(cfd);
|
|
245
272
|
}
|
|
246
273
|
|
|
274
|
+
autovector<ColumnFamilyData*> cfds_for_atomic_flush;
|
|
275
|
+
if (immutable_db_options_.atomic_flush) {
|
|
276
|
+
SelectColumnFamiliesForAtomicFlush(&cfds_for_atomic_flush);
|
|
277
|
+
for (auto cfd : cfds_for_atomic_flush) {
|
|
278
|
+
bool found = false;
|
|
279
|
+
for (auto existing_cfd : cfds) {
|
|
280
|
+
if (existing_cfd == cfd) {
|
|
281
|
+
found = true;
|
|
282
|
+
break;
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
if (!found) {
|
|
286
|
+
cfd->Ref();
|
|
287
|
+
cfds.push_back(cfd);
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
|
|
247
292
|
// Stop writes to the DB by entering both write threads
|
|
248
293
|
WriteThread::Writer nonmem_w;
|
|
249
294
|
if (two_write_queues_) {
|
|
@@ -253,15 +298,16 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
|
|
|
253
298
|
|
|
254
299
|
// Switch memtable and add WBWIMemTables
|
|
255
300
|
Status s;
|
|
256
|
-
for (size_t i = 0; i <
|
|
257
|
-
assert(!immutable_db_options_.atomic_flush);
|
|
258
|
-
// NOTE: to support atomic flush, need to call
|
|
259
|
-
// SelectColumnFamiliesForAtomicFlush()
|
|
301
|
+
for (size_t i = 0; i < cfds.size(); ++i) {
|
|
260
302
|
WriteContext write_context;
|
|
261
303
|
// TODO: not switch on empty memtable, may need to update metadata
|
|
262
304
|
// like NextLogNumber(), earliest_seqno and memtable id.
|
|
263
|
-
|
|
264
|
-
|
|
305
|
+
if (i < memtables.size()) {
|
|
306
|
+
s = SwitchMemtable(cfds[i], &write_context, memtables[i],
|
|
307
|
+
last_seqno_after_ingest);
|
|
308
|
+
} else {
|
|
309
|
+
s = SwitchMemtable(cfds[i], &write_context);
|
|
310
|
+
}
|
|
265
311
|
if (!s.ok()) {
|
|
266
312
|
// SwitchMemtable() can only fail if a new WAL is to be created, this
|
|
267
313
|
// should only happen for the first call to SwitchMemtable(). log will
|
|
@@ -301,9 +347,18 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
|
|
|
301
347
|
continue;
|
|
302
348
|
}
|
|
303
349
|
cfd->imm()->FlushRequested();
|
|
350
|
+
if (!immutable_db_options_.atomic_flush) {
|
|
351
|
+
FlushRequest flush_req;
|
|
352
|
+
// TODO: a new flush reason for ingesting memtable
|
|
353
|
+
GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
|
|
354
|
+
&flush_req);
|
|
355
|
+
EnqueuePendingFlush(flush_req);
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
if (immutable_db_options_.atomic_flush) {
|
|
359
|
+
AssignAtomicFlushSeq(cfds);
|
|
304
360
|
FlushRequest flush_req;
|
|
305
|
-
|
|
306
|
-
GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
|
|
361
|
+
GenerateFlushRequest(cfds, FlushReason::kExternalFileIngestion,
|
|
307
362
|
&flush_req);
|
|
308
363
|
EnqueuePendingFlush(flush_req);
|
|
309
364
|
}
|
|
@@ -314,13 +369,12 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
|
|
|
314
369
|
|
|
315
370
|
Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
316
371
|
WriteBatch* my_batch, WriteCallback* callback,
|
|
317
|
-
UserWriteCallback* user_write_cb, uint64_t*
|
|
372
|
+
UserWriteCallback* user_write_cb, uint64_t* wal_used,
|
|
318
373
|
uint64_t log_ref, bool disable_memtable,
|
|
319
374
|
uint64_t* seq_used, size_t batch_cnt,
|
|
320
375
|
PreReleaseCallback* pre_release_callback,
|
|
321
376
|
PostMemTableCallback* post_memtable_callback,
|
|
322
|
-
std::shared_ptr<WriteBatchWithIndex> wbwi
|
|
323
|
-
uint64_t prep_log) {
|
|
377
|
+
std::shared_ptr<WriteBatchWithIndex> wbwi) {
|
|
324
378
|
assert(!seq_per_batch_ || batch_cnt != 0);
|
|
325
379
|
assert(my_batch == nullptr || my_batch->Count() == 0 ||
|
|
326
380
|
write_options.protection_bytes_per_key == 0 ||
|
|
@@ -409,9 +463,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
409
463
|
return Status::NotSupported(
|
|
410
464
|
"DeleteRange is not compatible with row cache.");
|
|
411
465
|
}
|
|
466
|
+
// Whether the WBWI is from transaction commit or a direct write
|
|
467
|
+
// (IngestWriteBatchWithIndex())
|
|
468
|
+
bool ingest_wbwi_for_commit = false;
|
|
412
469
|
if (wbwi) {
|
|
413
|
-
|
|
414
|
-
|
|
470
|
+
if (my_batch->HasCommit()) {
|
|
471
|
+
ingest_wbwi_for_commit = true;
|
|
472
|
+
assert(log_ref);
|
|
473
|
+
} else {
|
|
474
|
+
// Only supports disableWAL for directly ingesting WBWI for now.
|
|
475
|
+
assert(write_options.disableWAL);
|
|
476
|
+
}
|
|
415
477
|
assert(!callback);
|
|
416
478
|
if (immutable_db_options_.unordered_write) {
|
|
417
479
|
return Status::NotSupported(
|
|
@@ -421,9 +483,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
421
483
|
return Status::NotSupported(
|
|
422
484
|
"Ingesting WriteBatch does not support pipelined_write");
|
|
423
485
|
}
|
|
424
|
-
if (
|
|
486
|
+
if (!wbwi->GetOverwriteKey()) {
|
|
425
487
|
return Status::NotSupported(
|
|
426
|
-
"
|
|
488
|
+
"WriteBatchWithIndex ingestion requires overwrite_key=true");
|
|
427
489
|
}
|
|
428
490
|
}
|
|
429
491
|
// Otherwise IsLatestPersistentState optimization does not make sense
|
|
@@ -444,7 +506,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
444
506
|
// they don't consume sequence.
|
|
445
507
|
return WriteImplWALOnly(
|
|
446
508
|
&nonmem_write_thread_, write_options, my_batch, callback, user_write_cb,
|
|
447
|
-
|
|
509
|
+
wal_used, log_ref, seq_used, batch_cnt, pre_release_callback,
|
|
448
510
|
assign_order, kDontPublishLastSeq, disable_memtable);
|
|
449
511
|
}
|
|
450
512
|
|
|
@@ -458,7 +520,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
458
520
|
// sequence in increasing order, iii) call pre_release_callback serially
|
|
459
521
|
Status status = WriteImplWALOnly(
|
|
460
522
|
&write_thread_, write_options, my_batch, callback, user_write_cb,
|
|
461
|
-
|
|
523
|
+
wal_used, log_ref, &seq, sub_batch_cnt, pre_release_callback,
|
|
462
524
|
kDoAssignOrder, kDoPublishLastSeq, disable_memtable);
|
|
463
525
|
TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
|
|
464
526
|
if (!status.ok()) {
|
|
@@ -477,7 +539,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
477
539
|
|
|
478
540
|
if (immutable_db_options_.enable_pipelined_write) {
|
|
479
541
|
return PipelinedWriteImpl(write_options, my_batch, callback, user_write_cb,
|
|
480
|
-
|
|
542
|
+
wal_used, log_ref, disable_memtable, seq_used);
|
|
481
543
|
}
|
|
482
544
|
|
|
483
545
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
|
@@ -524,16 +586,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
524
586
|
assert(tmp_s.ok());
|
|
525
587
|
}
|
|
526
588
|
}
|
|
527
|
-
|
|
528
|
-
|
|
589
|
+
if (w.status.ok()) { // Don't publish a partial batch write
|
|
590
|
+
versions_->SetLastSequence(last_sequence);
|
|
591
|
+
} else {
|
|
592
|
+
HandleMemTableInsertFailure(w.status);
|
|
593
|
+
}
|
|
529
594
|
write_thread_.ExitAsBatchGroupFollower(&w);
|
|
530
595
|
}
|
|
531
596
|
assert(w.state == WriteThread::STATE_COMPLETED);
|
|
532
597
|
// STATE_COMPLETED conditional below handles exit
|
|
533
598
|
}
|
|
534
599
|
if (w.state == WriteThread::STATE_COMPLETED) {
|
|
535
|
-
if (
|
|
536
|
-
*
|
|
600
|
+
if (wal_used != nullptr) {
|
|
601
|
+
*wal_used = w.wal_used;
|
|
537
602
|
}
|
|
538
603
|
if (seq_used != nullptr) {
|
|
539
604
|
*seq_used = w.sequence;
|
|
@@ -549,7 +614,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
549
614
|
// when it finds suitable, and finish them in the same write batch.
|
|
550
615
|
// This is how a write job could be done by the other writer.
|
|
551
616
|
WriteContext write_context;
|
|
552
|
-
|
|
617
|
+
// FIXME: also check disableWAL like others?
|
|
618
|
+
WalContext wal_context(write_options.sync);
|
|
553
619
|
WriteThread::WriteGroup write_group;
|
|
554
620
|
bool in_parallel_group = false;
|
|
555
621
|
uint64_t last_sequence = kMaxSequenceNumber;
|
|
@@ -563,7 +629,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
563
629
|
// PreprocessWrite does its own perf timing.
|
|
564
630
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
565
631
|
|
|
566
|
-
status = PreprocessWrite(write_options, &
|
|
632
|
+
status = PreprocessWrite(write_options, &wal_context, &write_context);
|
|
567
633
|
if (!two_write_queues_) {
|
|
568
634
|
// Assign it after ::PreprocessWrite since the sequence might advance
|
|
569
635
|
// inside it by WriteRecoverableState
|
|
@@ -587,6 +653,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
587
653
|
|
|
588
654
|
IOStatus io_s;
|
|
589
655
|
Status pre_release_cb_status;
|
|
656
|
+
size_t seq_inc = 0;
|
|
590
657
|
if (status.ok()) {
|
|
591
658
|
// Rules for when we can update the memtable concurrently
|
|
592
659
|
// 1. supported by memtable
|
|
@@ -630,7 +697,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
630
697
|
continue;
|
|
631
698
|
}
|
|
632
699
|
// TODO: maybe handle the tracing status?
|
|
633
|
-
|
|
700
|
+
if (wbwi && !ingest_wbwi_for_commit) {
|
|
701
|
+
// for transaction write, tracer only needs the commit marker which
|
|
702
|
+
// is in writer->batch
|
|
703
|
+
tracer_->Write(wbwi->GetWriteBatch()).PermitUncheckedError();
|
|
704
|
+
} else {
|
|
705
|
+
tracer_->Write(writer->batch).PermitUncheckedError();
|
|
706
|
+
}
|
|
634
707
|
}
|
|
635
708
|
}
|
|
636
709
|
}
|
|
@@ -640,7 +713,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
640
713
|
// disable_memtable in between; although we do not write this batch to
|
|
641
714
|
// memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
|
|
642
715
|
// the seq per valid written key to mem.
|
|
643
|
-
|
|
716
|
+
seq_inc = seq_per_batch_ ? valid_batches : total_count;
|
|
644
717
|
if (wbwi) {
|
|
645
718
|
// Reserve sequence numbers for the ingested memtable. We need to reserve
|
|
646
719
|
// at least this amount for recovery. During recovery,
|
|
@@ -688,22 +761,21 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
688
761
|
|
|
689
762
|
if (!two_write_queues_) {
|
|
690
763
|
if (status.ok() && !write_options.disableWAL) {
|
|
691
|
-
assert(
|
|
692
|
-
|
|
693
|
-
*(log_context.log_file_number_size);
|
|
764
|
+
assert(wal_context.wal_file_number_size);
|
|
765
|
+
wal_context.prev_size = wal_context.writer->file()->GetFileSize();
|
|
694
766
|
PERF_TIMER_GUARD(write_wal_time);
|
|
695
|
-
io_s =
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
767
|
+
io_s = WriteGroupToWAL(write_group, wal_context.writer, wal_used,
|
|
768
|
+
wal_context.need_wal_sync,
|
|
769
|
+
wal_context.need_wal_dir_sync, last_sequence + 1,
|
|
770
|
+
*wal_context.wal_file_number_size);
|
|
699
771
|
}
|
|
700
772
|
} else {
|
|
701
773
|
if (status.ok() && !write_options.disableWAL) {
|
|
702
774
|
PERF_TIMER_GUARD(write_wal_time);
|
|
703
775
|
// LastAllocatedSequence is increased inside WriteToWAL under
|
|
704
776
|
// wal_write_mutex_ to ensure ordered events in WAL
|
|
705
|
-
io_s =
|
|
706
|
-
|
|
777
|
+
io_s = ConcurrentWriteGroupToWAL(write_group, wal_used, &last_sequence,
|
|
778
|
+
seq_inc);
|
|
707
779
|
} else {
|
|
708
780
|
// Otherwise we inc seq number for memtable writes
|
|
709
781
|
last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
|
|
@@ -713,17 +785,18 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
713
785
|
assert(last_sequence != kMaxSequenceNumber);
|
|
714
786
|
const SequenceNumber current_sequence = last_sequence + 1;
|
|
715
787
|
last_sequence += seq_inc;
|
|
788
|
+
// Seqno assigned to this write are [current_sequence, last_sequence]
|
|
716
789
|
|
|
717
|
-
if (
|
|
790
|
+
if (wal_context.need_wal_sync) {
|
|
718
791
|
VersionEdit synced_wals;
|
|
719
|
-
|
|
792
|
+
wal_write_mutex_.Lock();
|
|
720
793
|
if (status.ok()) {
|
|
721
|
-
MarkLogsSynced(
|
|
794
|
+
MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync,
|
|
722
795
|
&synced_wals);
|
|
723
796
|
} else {
|
|
724
|
-
MarkLogsNotSynced(
|
|
797
|
+
MarkLogsNotSynced(cur_wal_number_);
|
|
725
798
|
}
|
|
726
|
-
|
|
799
|
+
wal_write_mutex_.Unlock();
|
|
727
800
|
if (status.ok() && synced_wals.IsWalAddition()) {
|
|
728
801
|
InstrumentedMutexLock l(&mutex_);
|
|
729
802
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
@@ -758,7 +831,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
758
831
|
writer->sequence = next_sequence;
|
|
759
832
|
if (writer->pre_release_callback) {
|
|
760
833
|
Status ws = writer->pre_release_callback->Callback(
|
|
761
|
-
writer->sequence, disable_memtable, writer->
|
|
834
|
+
writer->sequence, disable_memtable, writer->wal_used, index++,
|
|
762
835
|
pre_release_callback_cnt);
|
|
763
836
|
if (!ws.ok()) {
|
|
764
837
|
status = pre_release_cb_status = ws;
|
|
@@ -783,8 +856,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
783
856
|
write_group, current_sequence, column_family_memtables_.get(),
|
|
784
857
|
&flush_scheduler_, &trim_history_scheduler_,
|
|
785
858
|
write_options.ignore_missing_column_families,
|
|
786
|
-
0 /*recovery_log_number*/, this,
|
|
787
|
-
batch_per_txn_);
|
|
859
|
+
0 /*recovery_log_number*/, this, seq_per_batch_, batch_per_txn_);
|
|
788
860
|
} else {
|
|
789
861
|
write_group.last_sequence = last_sequence;
|
|
790
862
|
write_thread_.LaunchParallelMemTableWriters(&write_group);
|
|
@@ -832,24 +904,31 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
832
904
|
// handle exit, false means somebody else did
|
|
833
905
|
should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
|
|
834
906
|
}
|
|
835
|
-
if (wbwi) {
|
|
836
|
-
|
|
907
|
+
if (wbwi && status.ok() && w.status.ok()) {
|
|
908
|
+
uint32_t wbwi_count = wbwi->GetWriteBatch()->Count();
|
|
909
|
+
// skip empty batch case
|
|
910
|
+
if (wbwi_count) {
|
|
837
911
|
// w.batch contains (potentially empty) commit time batch updates,
|
|
838
912
|
// only ingest wbwi if w.batch is applied to memtable successfully
|
|
839
|
-
assert(wbwi->GetWriteBatch()->Count() > 0);
|
|
840
|
-
|
|
841
913
|
uint32_t memtable_update_count = w.batch->Count();
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
914
|
+
// Seqno assigned to this write are [last_seq + 1 - seq_inc, last_seq].
|
|
915
|
+
// seq_inc includes w.batch (memtable updates) and wbwi
|
|
916
|
+
// w.batch gets first `memtable_update_count` sequence numbers.
|
|
917
|
+
// wbwi gets the rest `wbwi_count` sequence numbers.
|
|
918
|
+
assert(seq_inc == memtable_update_count + wbwi_count);
|
|
919
|
+
assert(wbwi_count > 0);
|
|
920
|
+
assert(last_sequence != kMaxSequenceNumber);
|
|
921
|
+
SequenceNumber lb = last_sequence + 1 - wbwi_count;
|
|
922
|
+
SequenceNumber ub = last_sequence;
|
|
846
923
|
if (two_write_queues_) {
|
|
847
924
|
assert(ub <= versions_->LastAllocatedSequence());
|
|
848
925
|
}
|
|
849
|
-
status =
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
926
|
+
status =
|
|
927
|
+
IngestWBWIAsMemtable(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
|
|
928
|
+
/*min_prep_log=*/log_ref, last_sequence,
|
|
929
|
+
/*memtable_updated=*/memtable_update_count > 0,
|
|
930
|
+
write_options.ignore_missing_column_families);
|
|
931
|
+
RecordTick(stats_, NUMBER_WBWI_INGEST);
|
|
853
932
|
}
|
|
854
933
|
}
|
|
855
934
|
|
|
@@ -867,9 +946,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
867
946
|
}
|
|
868
947
|
// Note: if we are to resume after non-OK statuses we need to revisit how
|
|
869
948
|
// we react to non-OK statuses here.
|
|
870
|
-
|
|
949
|
+
if (w.status.ok()) { // Don't publish a partial batch write
|
|
950
|
+
versions_->SetLastSequence(last_sequence);
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
if (!w.status.ok()) {
|
|
954
|
+
if (wal_context.prev_size < SIZE_MAX) {
|
|
955
|
+
InstrumentedMutexLock l(&wal_write_mutex_);
|
|
956
|
+
if (logs_.back().number == wal_context.wal_file_number_size->number) {
|
|
957
|
+
logs_.back().SetAttemptTruncateSize(wal_context.prev_size);
|
|
958
|
+
}
|
|
959
|
+
}
|
|
960
|
+
HandleMemTableInsertFailure(w.status);
|
|
871
961
|
}
|
|
872
|
-
MemTableInsertStatusCheck(w.status);
|
|
873
962
|
write_thread_.ExitAsBatchGroupLeader(write_group, status);
|
|
874
963
|
}
|
|
875
964
|
|
|
@@ -882,7 +971,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
882
971
|
Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
883
972
|
WriteBatch* my_batch, WriteCallback* callback,
|
|
884
973
|
UserWriteCallback* user_write_cb,
|
|
885
|
-
uint64_t*
|
|
974
|
+
uint64_t* wal_used, uint64_t log_ref,
|
|
886
975
|
bool disable_memtable, uint64_t* seq_used) {
|
|
887
976
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
|
888
977
|
StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
|
|
@@ -899,10 +988,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
899
988
|
if (w.callback && !w.callback->AllowWriteBatching()) {
|
|
900
989
|
write_thread_.WaitForMemTableWriters();
|
|
901
990
|
}
|
|
902
|
-
|
|
991
|
+
WalContext wal_context(!write_options.disableWAL && write_options.sync);
|
|
903
992
|
// PreprocessWrite does its own perf timing.
|
|
904
993
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
905
|
-
w.status = PreprocessWrite(write_options, &
|
|
994
|
+
w.status = PreprocessWrite(write_options, &wal_context, &write_context);
|
|
906
995
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
907
996
|
|
|
908
997
|
// This can set non-OK status if callback fail.
|
|
@@ -971,13 +1060,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
971
1060
|
wal_write_group.size - 1);
|
|
972
1061
|
RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
|
|
973
1062
|
}
|
|
974
|
-
assert(
|
|
975
|
-
|
|
976
|
-
*(
|
|
977
|
-
io_s =
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
1063
|
+
assert(wal_context.wal_file_number_size);
|
|
1064
|
+
WalFileNumberSize& wal_file_number_size =
|
|
1065
|
+
*(wal_context.wal_file_number_size);
|
|
1066
|
+
io_s = WriteGroupToWAL(wal_write_group, wal_context.writer, wal_used,
|
|
1067
|
+
wal_context.need_wal_sync,
|
|
1068
|
+
wal_context.need_wal_dir_sync, current_sequence,
|
|
1069
|
+
wal_file_number_size);
|
|
981
1070
|
w.status = io_s;
|
|
982
1071
|
}
|
|
983
1072
|
|
|
@@ -989,13 +1078,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
989
1078
|
}
|
|
990
1079
|
|
|
991
1080
|
VersionEdit synced_wals;
|
|
992
|
-
if (
|
|
993
|
-
InstrumentedMutexLock l(&
|
|
1081
|
+
if (wal_context.need_wal_sync) {
|
|
1082
|
+
InstrumentedMutexLock l(&wal_write_mutex_);
|
|
994
1083
|
if (w.status.ok()) {
|
|
995
|
-
MarkLogsSynced(
|
|
1084
|
+
MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync,
|
|
996
1085
|
&synced_wals);
|
|
997
1086
|
} else {
|
|
998
|
-
MarkLogsNotSynced(
|
|
1087
|
+
MarkLogsNotSynced(cur_wal_number_);
|
|
999
1088
|
}
|
|
1000
1089
|
}
|
|
1001
1090
|
if (w.status.ok() && synced_wals.IsWalAddition()) {
|
|
@@ -1025,8 +1114,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
1025
1114
|
memtable_write_group, w.sequence, column_family_memtables_.get(),
|
|
1026
1115
|
&flush_scheduler_, &trim_history_scheduler_,
|
|
1027
1116
|
write_options.ignore_missing_column_families, 0 /*log_number*/, this,
|
|
1028
|
-
|
|
1029
|
-
|
|
1117
|
+
seq_per_batch_, batch_per_txn_);
|
|
1118
|
+
if (memtable_write_group.status
|
|
1119
|
+
.ok()) { // Don't publish a partial batch write
|
|
1120
|
+
versions_->SetLastSequence(memtable_write_group.last_sequence);
|
|
1121
|
+
} else {
|
|
1122
|
+
HandleMemTableInsertFailure(memtable_write_group.status);
|
|
1123
|
+
}
|
|
1030
1124
|
write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
|
|
1031
1125
|
}
|
|
1032
1126
|
} else {
|
|
@@ -1055,8 +1149,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
1055
1149
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
1056
1150
|
|
|
1057
1151
|
if (write_thread_.CompleteParallelMemTableWriter(&w)) {
|
|
1058
|
-
|
|
1059
|
-
|
|
1152
|
+
if (w.status.ok()) { // Don't publish a partial batch write
|
|
1153
|
+
versions_->SetLastSequence(w.write_group->last_sequence);
|
|
1154
|
+
} else {
|
|
1155
|
+
HandleMemTableInsertFailure(w.status);
|
|
1156
|
+
}
|
|
1060
1157
|
write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
|
|
1061
1158
|
}
|
|
1062
1159
|
}
|
|
@@ -1128,7 +1225,7 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
|
|
|
1128
1225
|
Status DBImpl::WriteImplWALOnly(
|
|
1129
1226
|
WriteThread* write_thread, const WriteOptions& write_options,
|
|
1130
1227
|
WriteBatch* my_batch, WriteCallback* callback,
|
|
1131
|
-
UserWriteCallback* user_write_cb, uint64_t*
|
|
1228
|
+
UserWriteCallback* user_write_cb, uint64_t* wal_used,
|
|
1132
1229
|
const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
|
|
1133
1230
|
PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
|
|
1134
1231
|
const PublishLastSeq publish_last_seq, const bool disable_memtable) {
|
|
@@ -1141,8 +1238,8 @@ Status DBImpl::WriteImplWALOnly(
|
|
|
1141
1238
|
write_thread->JoinBatchGroup(&w);
|
|
1142
1239
|
assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
|
|
1143
1240
|
if (w.state == WriteThread::STATE_COMPLETED) {
|
|
1144
|
-
if (
|
|
1145
|
-
*
|
|
1241
|
+
if (wal_used != nullptr) {
|
|
1242
|
+
*wal_used = w.wal_used;
|
|
1146
1243
|
}
|
|
1147
1244
|
if (seq_used != nullptr) {
|
|
1148
1245
|
*seq_used = w.sequence;
|
|
@@ -1158,10 +1255,10 @@ Status DBImpl::WriteImplWALOnly(
|
|
|
1158
1255
|
|
|
1159
1256
|
// TODO(myabandeh): Make preliminary checks thread-safe so we could do them
|
|
1160
1257
|
// without paying the cost of obtaining the mutex.
|
|
1161
|
-
|
|
1258
|
+
WalContext wal_context;
|
|
1162
1259
|
WriteContext write_context;
|
|
1163
1260
|
Status status =
|
|
1164
|
-
PreprocessWrite(write_options, &
|
|
1261
|
+
PreprocessWrite(write_options, &wal_context, &write_context);
|
|
1165
1262
|
WriteStatusCheckOnLocked(status);
|
|
1166
1263
|
|
|
1167
1264
|
if (!status.ok()) {
|
|
@@ -1258,8 +1355,8 @@ Status DBImpl::WriteImplWALOnly(
|
|
|
1258
1355
|
}
|
|
1259
1356
|
Status status;
|
|
1260
1357
|
if (!write_options.disableWAL) {
|
|
1261
|
-
IOStatus io_s =
|
|
1262
|
-
|
|
1358
|
+
IOStatus io_s = ConcurrentWriteGroupToWAL(write_group, wal_used,
|
|
1359
|
+
&last_sequence, seq_inc);
|
|
1263
1360
|
status = io_s;
|
|
1264
1361
|
// last_sequence may not be set if there is an error
|
|
1265
1362
|
// This error checking and return is moved up to avoid using uninitialized
|
|
@@ -1311,7 +1408,7 @@ Status DBImpl::WriteImplWALOnly(
|
|
|
1311
1408
|
if (!writer->CallbackFailed() && writer->pre_release_callback) {
|
|
1312
1409
|
assert(writer->sequence != kMaxSequenceNumber);
|
|
1313
1410
|
Status ws = writer->pre_release_callback->Callback(
|
|
1314
|
-
writer->sequence, disable_memtable, writer->
|
|
1411
|
+
writer->sequence, disable_memtable, writer->wal_used, index++,
|
|
1315
1412
|
pre_release_callback_cnt);
|
|
1316
1413
|
if (!ws.ok()) {
|
|
1317
1414
|
status = ws;
|
|
@@ -1380,24 +1477,22 @@ void DBImpl::WALIOStatusCheck(const IOStatus& io_status) {
|
|
|
1380
1477
|
}
|
|
1381
1478
|
}
|
|
1382
1479
|
|
|
1383
|
-
void DBImpl::
|
|
1384
|
-
|
|
1385
|
-
//
|
|
1386
|
-
//
|
|
1387
|
-
//
|
|
1388
|
-
// ignore_missing_column_families.
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
mutex_.Unlock();
|
|
1394
|
-
}
|
|
1480
|
+
void DBImpl::HandleMemTableInsertFailure(const Status& status) {
|
|
1481
|
+
assert(!status.ok());
|
|
1482
|
+
// A non-OK status on memtable insert indicates that the state implied by the
|
|
1483
|
+
// WAL has diverged from the in-memory state. This could be because of a
|
|
1484
|
+
// corrupt write_batch (very bad), or because the client specified an invalid
|
|
1485
|
+
// column family and didn't specify ignore_missing_column_families.
|
|
1486
|
+
mutex_.Lock();
|
|
1487
|
+
assert(!error_handler_.IsBGWorkStopped());
|
|
1488
|
+
error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
|
|
1489
|
+
mutex_.Unlock();
|
|
1395
1490
|
}
|
|
1396
1491
|
|
|
1397
1492
|
Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
1398
|
-
|
|
1493
|
+
WalContext* wal_context,
|
|
1399
1494
|
WriteContext* write_context) {
|
|
1400
|
-
assert(write_context != nullptr &&
|
|
1495
|
+
assert(write_context != nullptr && wal_context != nullptr);
|
|
1401
1496
|
Status status;
|
|
1402
1497
|
|
|
1403
1498
|
if (error_handler_.IsDBStopped()) {
|
|
@@ -1407,7 +1502,8 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
1407
1502
|
|
|
1408
1503
|
PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
|
|
1409
1504
|
|
|
1410
|
-
if (UNLIKELY(status.ok() &&
|
|
1505
|
+
if (UNLIKELY(status.ok() &&
|
|
1506
|
+
wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize())) {
|
|
1411
1507
|
assert(versions_);
|
|
1412
1508
|
InstrumentedMutexLock l(&mutex_);
|
|
1413
1509
|
const ColumnFamilySet* const column_families =
|
|
@@ -1476,17 +1572,17 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
1476
1572
|
WriteBufferManagerStallWrites();
|
|
1477
1573
|
}
|
|
1478
1574
|
}
|
|
1479
|
-
InstrumentedMutexLock l(&
|
|
1480
|
-
if (status.ok() &&
|
|
1575
|
+
InstrumentedMutexLock l(&wal_write_mutex_);
|
|
1576
|
+
if (status.ok() && wal_context->need_wal_sync) {
|
|
1481
1577
|
// Wait until the parallel syncs are finished. Any sync process has to sync
|
|
1482
1578
|
// the front log too so it is enough to check the status of front()
|
|
1483
|
-
// We do a while loop since
|
|
1579
|
+
// We do a while loop since wal_sync_cv_ is signalled when any sync is
|
|
1484
1580
|
// finished
|
|
1485
1581
|
// Note: there does not seem to be a reason to wait for parallel sync at
|
|
1486
1582
|
// this early step but it is not important since parallel sync (SyncWAL) and
|
|
1487
|
-
//
|
|
1583
|
+
// need_wal_sync are usually not used together.
|
|
1488
1584
|
while (logs_.front().IsSyncing()) {
|
|
1489
|
-
|
|
1585
|
+
wal_sync_cv_.Wait();
|
|
1490
1586
|
}
|
|
1491
1587
|
for (auto& log : logs_) {
|
|
1492
1588
|
// This is just to prevent the logs to be synced by a parallel SyncWAL
|
|
@@ -1497,12 +1593,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
1497
1593
|
log.PrepareForSync();
|
|
1498
1594
|
}
|
|
1499
1595
|
} else {
|
|
1500
|
-
|
|
1596
|
+
wal_context->need_wal_sync = false;
|
|
1501
1597
|
}
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1598
|
+
wal_context->writer = logs_.back().writer;
|
|
1599
|
+
wal_context->need_wal_dir_sync =
|
|
1600
|
+
wal_context->need_wal_dir_sync && !wal_dir_synced_;
|
|
1601
|
+
wal_context->wal_file_number_size = std::addressof(alive_wal_files_.back());
|
|
1506
1602
|
|
|
1507
1603
|
return status;
|
|
1508
1604
|
}
|
|
@@ -1553,12 +1649,13 @@ Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
|
|
|
1553
1649
|
}
|
|
1554
1650
|
|
|
1555
1651
|
// When two_write_queues_ is disabled, this function is called from the only
|
|
1556
|
-
// write thread. Otherwise this must be called holding
|
|
1652
|
+
// write thread. Otherwise this must be called holding wal_write_mutex_.
|
|
1557
1653
|
IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
|
|
1558
1654
|
const WriteOptions& write_options,
|
|
1559
|
-
log::Writer* log_writer, uint64_t*
|
|
1655
|
+
log::Writer* log_writer, uint64_t* wal_used,
|
|
1560
1656
|
uint64_t* log_size,
|
|
1561
|
-
|
|
1657
|
+
WalFileNumberSize& wal_file_number_size,
|
|
1658
|
+
SequenceNumber sequence) {
|
|
1562
1659
|
assert(log_size != nullptr);
|
|
1563
1660
|
|
|
1564
1661
|
Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
|
|
@@ -1569,7 +1666,7 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
|
|
|
1569
1666
|
}
|
|
1570
1667
|
*log_size = log_entry.size();
|
|
1571
1668
|
// When two_write_queues_ WriteToWAL has to be protected from concurrent calls
|
|
1572
|
-
// from the two queues anyway and
|
|
1669
|
+
// from the two queues anyway and wal_write_mutex_ is already held. Otherwise
|
|
1573
1670
|
// if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
|
|
1574
1671
|
// from possible concurrent calls via the FlushWAL by the application.
|
|
1575
1672
|
const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
|
|
@@ -1577,33 +1674,34 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
|
|
|
1577
1674
|
// manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
|
|
1578
1675
|
// when we do not need any locking.
|
|
1579
1676
|
if (UNLIKELY(needs_locking)) {
|
|
1580
|
-
|
|
1677
|
+
wal_write_mutex_.Lock();
|
|
1581
1678
|
}
|
|
1582
1679
|
IOStatus io_s = log_writer->MaybeAddUserDefinedTimestampSizeRecord(
|
|
1583
1680
|
write_options, versions_->GetColumnFamiliesTimestampSizeForRecord());
|
|
1584
1681
|
if (!io_s.ok()) {
|
|
1585
1682
|
return io_s;
|
|
1586
1683
|
}
|
|
1587
|
-
io_s = log_writer->AddRecord(write_options, log_entry);
|
|
1684
|
+
io_s = log_writer->AddRecord(write_options, log_entry, sequence);
|
|
1588
1685
|
|
|
1589
1686
|
if (UNLIKELY(needs_locking)) {
|
|
1590
|
-
|
|
1687
|
+
wal_write_mutex_.Unlock();
|
|
1591
1688
|
}
|
|
1592
|
-
if (
|
|
1593
|
-
*
|
|
1689
|
+
if (wal_used != nullptr) {
|
|
1690
|
+
*wal_used = cur_wal_number_;
|
|
1691
|
+
assert(*wal_used == wal_file_number_size.number);
|
|
1594
1692
|
}
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1693
|
+
wals_total_size_.FetchAddRelaxed(log_entry.size());
|
|
1694
|
+
wal_file_number_size.AddSize(*log_size);
|
|
1695
|
+
wal_empty_ = false;
|
|
1598
1696
|
|
|
1599
1697
|
return io_s;
|
|
1600
1698
|
}
|
|
1601
1699
|
|
|
1602
|
-
IOStatus DBImpl::
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1700
|
+
IOStatus DBImpl::WriteGroupToWAL(const WriteThread::WriteGroup& write_group,
|
|
1701
|
+
log::Writer* log_writer, uint64_t* wal_used,
|
|
1702
|
+
bool need_wal_sync, bool need_wal_dir_sync,
|
|
1703
|
+
SequenceNumber sequence,
|
|
1704
|
+
WalFileNumberSize& wal_file_number_size) {
|
|
1607
1705
|
IOStatus io_s;
|
|
1608
1706
|
assert(!two_write_queues_);
|
|
1609
1707
|
assert(!write_group.leader->disable_wal);
|
|
@@ -1618,10 +1716,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
|
|
|
1618
1716
|
}
|
|
1619
1717
|
|
|
1620
1718
|
if (merged_batch == write_group.leader->batch) {
|
|
1621
|
-
write_group.leader->
|
|
1719
|
+
write_group.leader->wal_used = cur_wal_number_;
|
|
1622
1720
|
} else if (write_with_wal > 1) {
|
|
1623
1721
|
for (auto writer : write_group) {
|
|
1624
|
-
writer->
|
|
1722
|
+
writer->wal_used = cur_wal_number_;
|
|
1625
1723
|
}
|
|
1626
1724
|
}
|
|
1627
1725
|
|
|
@@ -1633,14 +1731,14 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
|
|
|
1633
1731
|
WriteOptions write_options;
|
|
1634
1732
|
write_options.rate_limiter_priority =
|
|
1635
1733
|
write_group.leader->rate_limiter_priority;
|
|
1636
|
-
io_s = WriteToWAL(*merged_batch, write_options, log_writer,
|
|
1637
|
-
&log_size,
|
|
1734
|
+
io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used,
|
|
1735
|
+
&log_size, wal_file_number_size, sequence);
|
|
1638
1736
|
if (to_be_cached_state) {
|
|
1639
1737
|
cached_recoverable_state_ = *to_be_cached_state;
|
|
1640
1738
|
cached_recoverable_state_empty_ = false;
|
|
1641
1739
|
}
|
|
1642
1740
|
|
|
1643
|
-
if (io_s.ok() &&
|
|
1741
|
+
if (io_s.ok() && need_wal_sync) {
|
|
1644
1742
|
StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
|
|
1645
1743
|
// It's safe to access logs_ with unlocked mutex_ here because:
|
|
1646
1744
|
// - we've set getting_synced=true for all logs,
|
|
@@ -1650,15 +1748,15 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
|
|
|
1650
1748
|
// - as long as other threads don't modify it, it's safe to read
|
|
1651
1749
|
// from std::deque from multiple threads concurrently.
|
|
1652
1750
|
//
|
|
1653
|
-
// Sync operation should work with locked
|
|
1751
|
+
// Sync operation should work with locked wal_write_mutex_, because:
|
|
1654
1752
|
// when DBOptions.manual_wal_flush_ is set,
|
|
1655
1753
|
// FlushWAL function will be invoked by another thread.
|
|
1656
|
-
// if without locked
|
|
1754
|
+
// if without locked wal_write_mutex_, the log file may get data
|
|
1657
1755
|
// corruption
|
|
1658
1756
|
|
|
1659
1757
|
const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
|
|
1660
1758
|
if (UNLIKELY(needs_locking)) {
|
|
1661
|
-
|
|
1759
|
+
wal_write_mutex_.Lock();
|
|
1662
1760
|
}
|
|
1663
1761
|
|
|
1664
1762
|
if (io_s.ok()) {
|
|
@@ -1681,10 +1779,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
|
|
|
1681
1779
|
}
|
|
1682
1780
|
|
|
1683
1781
|
if (UNLIKELY(needs_locking)) {
|
|
1684
|
-
|
|
1782
|
+
wal_write_mutex_.Unlock();
|
|
1685
1783
|
}
|
|
1686
1784
|
|
|
1687
|
-
if (io_s.ok() &&
|
|
1785
|
+
if (io_s.ok() && need_wal_dir_sync) {
|
|
1688
1786
|
// We only sync WAL directory the first time WAL syncing is
|
|
1689
1787
|
// requested, so that in case users never turn on WAL sync,
|
|
1690
1788
|
// we can avoid the disk I/O in the write code path.
|
|
@@ -1699,7 +1797,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
|
|
|
1699
1797
|
}
|
|
1700
1798
|
if (io_s.ok()) {
|
|
1701
1799
|
auto stats = default_cf_internal_stats_;
|
|
1702
|
-
if (
|
|
1800
|
+
if (need_wal_sync) {
|
|
1703
1801
|
stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
|
|
1704
1802
|
RecordTick(stats_, WAL_FILE_SYNCED);
|
|
1705
1803
|
}
|
|
@@ -1716,8 +1814,8 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
|
|
|
1716
1814
|
return io_s;
|
|
1717
1815
|
}
|
|
1718
1816
|
|
|
1719
|
-
IOStatus DBImpl::
|
|
1720
|
-
const WriteThread::WriteGroup& write_group, uint64_t*
|
|
1817
|
+
IOStatus DBImpl::ConcurrentWriteGroupToWAL(
|
|
1818
|
+
const WriteThread::WriteGroup& write_group, uint64_t* wal_used,
|
|
1721
1819
|
SequenceNumber* last_sequence, size_t seq_inc) {
|
|
1722
1820
|
IOStatus io_s;
|
|
1723
1821
|
|
|
@@ -1734,14 +1832,14 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
|
|
|
1734
1832
|
return io_s;
|
|
1735
1833
|
}
|
|
1736
1834
|
|
|
1737
|
-
// We need to lock
|
|
1835
|
+
// We need to lock wal_write_mutex_ since logs_ and alive_wal_files might be
|
|
1738
1836
|
// pushed back concurrently
|
|
1739
|
-
|
|
1837
|
+
wal_write_mutex_.Lock();
|
|
1740
1838
|
if (merged_batch == write_group.leader->batch) {
|
|
1741
|
-
write_group.leader->
|
|
1839
|
+
write_group.leader->wal_used = cur_wal_number_;
|
|
1742
1840
|
} else if (write_with_wal > 1) {
|
|
1743
1841
|
for (auto writer : write_group) {
|
|
1744
|
-
writer->
|
|
1842
|
+
writer->wal_used = cur_wal_number_;
|
|
1745
1843
|
}
|
|
1746
1844
|
}
|
|
1747
1845
|
*last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
|
|
@@ -1749,9 +1847,9 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
|
|
|
1749
1847
|
WriteBatchInternal::SetSequence(merged_batch, sequence);
|
|
1750
1848
|
|
|
1751
1849
|
log::Writer* log_writer = logs_.back().writer;
|
|
1752
|
-
|
|
1850
|
+
WalFileNumberSize& wal_file_number_size = alive_wal_files_.back();
|
|
1753
1851
|
|
|
1754
|
-
assert(log_writer->get_log_number() ==
|
|
1852
|
+
assert(log_writer->get_log_number() == wal_file_number_size.number);
|
|
1755
1853
|
|
|
1756
1854
|
uint64_t log_size;
|
|
1757
1855
|
|
|
@@ -1759,13 +1857,13 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
|
|
|
1759
1857
|
WriteOptions write_options;
|
|
1760
1858
|
write_options.rate_limiter_priority =
|
|
1761
1859
|
write_group.leader->rate_limiter_priority;
|
|
1762
|
-
io_s = WriteToWAL(*merged_batch, write_options, log_writer,
|
|
1763
|
-
&log_size,
|
|
1860
|
+
io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used,
|
|
1861
|
+
&log_size, wal_file_number_size, sequence);
|
|
1764
1862
|
if (to_be_cached_state) {
|
|
1765
1863
|
cached_recoverable_state_ = *to_be_cached_state;
|
|
1766
1864
|
cached_recoverable_state_empty_ = false;
|
|
1767
1865
|
}
|
|
1768
|
-
|
|
1866
|
+
wal_write_mutex_.Unlock();
|
|
1769
1867
|
|
|
1770
1868
|
if (io_s.ok()) {
|
|
1771
1869
|
const bool concurrent = true;
|
|
@@ -1793,7 +1891,7 @@ Status DBImpl::WriteRecoverableState() {
|
|
|
1793
1891
|
bool dont_care_bool;
|
|
1794
1892
|
SequenceNumber next_seq;
|
|
1795
1893
|
if (two_write_queues_) {
|
|
1796
|
-
|
|
1894
|
+
wal_write_mutex_.Lock();
|
|
1797
1895
|
}
|
|
1798
1896
|
SequenceNumber seq;
|
|
1799
1897
|
if (two_write_queues_) {
|
|
@@ -1808,13 +1906,17 @@ Status DBImpl::WriteRecoverableState() {
|
|
|
1808
1906
|
0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
|
|
1809
1907
|
&next_seq, &dont_care_bool, seq_per_batch_);
|
|
1810
1908
|
auto last_seq = next_seq - 1;
|
|
1811
|
-
if (
|
|
1812
|
-
|
|
1813
|
-
|
|
1909
|
+
if (status.ok()) { // Don't publish a partial batch write
|
|
1910
|
+
if (two_write_queues_) {
|
|
1911
|
+
versions_->FetchAddLastAllocatedSequence(last_seq - seq);
|
|
1912
|
+
versions_->SetLastPublishedSequence(last_seq);
|
|
1913
|
+
}
|
|
1914
|
+
versions_->SetLastSequence(last_seq);
|
|
1915
|
+
} else {
|
|
1916
|
+
HandleMemTableInsertFailure(status);
|
|
1814
1917
|
}
|
|
1815
|
-
versions_->SetLastSequence(last_seq);
|
|
1816
1918
|
if (two_write_queues_) {
|
|
1817
|
-
|
|
1919
|
+
wal_write_mutex_.Unlock();
|
|
1818
1920
|
}
|
|
1819
1921
|
if (status.ok() && recoverable_state_pre_release_callback_) {
|
|
1820
1922
|
const bool DISABLE_MEMTABLE = true;
|
|
@@ -1886,7 +1988,10 @@ void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
|
|
|
1886
1988
|
assert(immutable_db_options_.atomic_flush);
|
|
1887
1989
|
auto seq = versions_->LastSequence();
|
|
1888
1990
|
for (auto cfd : cfds) {
|
|
1889
|
-
cfd
|
|
1991
|
+
// cfd can be nullptr, see ScheduleFlushes()
|
|
1992
|
+
if (cfd) {
|
|
1993
|
+
cfd->imm()->AssignAtomicFlushSeq(seq);
|
|
1994
|
+
}
|
|
1890
1995
|
}
|
|
1891
1996
|
}
|
|
1892
1997
|
|
|
@@ -1895,11 +2000,11 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
|
|
|
1895
2000
|
assert(write_context != nullptr);
|
|
1896
2001
|
Status status;
|
|
1897
2002
|
|
|
1898
|
-
if (
|
|
2003
|
+
if (alive_wal_files_.begin()->getting_flushed) {
|
|
1899
2004
|
return status;
|
|
1900
2005
|
}
|
|
1901
2006
|
|
|
1902
|
-
auto oldest_alive_log =
|
|
2007
|
+
auto oldest_alive_log = alive_wal_files_.begin()->number;
|
|
1903
2008
|
bool flush_wont_release_oldest_log = false;
|
|
1904
2009
|
if (allow_2pc()) {
|
|
1905
2010
|
auto oldest_log_with_uncommitted_prep =
|
|
@@ -1929,14 +2034,14 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
|
|
|
1929
2034
|
// transactions then we cannot flush this log until those transactions are
|
|
1930
2035
|
// committed.
|
|
1931
2036
|
unable_to_release_oldest_log_ = false;
|
|
1932
|
-
|
|
2037
|
+
alive_wal_files_.begin()->getting_flushed = true;
|
|
1933
2038
|
}
|
|
1934
2039
|
|
|
1935
2040
|
ROCKS_LOG_INFO(
|
|
1936
2041
|
immutable_db_options_.info_log,
|
|
1937
2042
|
"Flushing all column families with data in WAL number %" PRIu64
|
|
1938
2043
|
". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
|
|
1939
|
-
oldest_alive_log,
|
|
2044
|
+
oldest_alive_log, wals_total_size_.LoadRelaxed(), GetMaxTotalWalSize());
|
|
1940
2045
|
// no need to refcount because drop is happening in write thread, so can't
|
|
1941
2046
|
// happen while we're in the write thread
|
|
1942
2047
|
autovector<ColumnFamilyData*> cfds;
|
|
@@ -2406,22 +2511,24 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2406
2511
|
// Do this without holding the dbmutex lock.
|
|
2407
2512
|
assert(versions_->prev_log_number() == 0);
|
|
2408
2513
|
if (two_write_queues_) {
|
|
2409
|
-
|
|
2514
|
+
wal_write_mutex_.Lock();
|
|
2410
2515
|
}
|
|
2411
|
-
bool creating_new_log = !
|
|
2516
|
+
bool creating_new_log = !wal_empty_;
|
|
2412
2517
|
if (two_write_queues_) {
|
|
2413
|
-
|
|
2518
|
+
wal_write_mutex_.Unlock();
|
|
2414
2519
|
}
|
|
2415
2520
|
uint64_t recycle_log_number = 0;
|
|
2416
2521
|
// If file deletion is disabled, don't recycle logs since it'll result in
|
|
2417
2522
|
// the file getting renamed
|
|
2418
2523
|
if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
|
|
2419
|
-
!
|
|
2420
|
-
recycle_log_number =
|
|
2524
|
+
!wal_recycle_files_.empty() && IsFileDeletionsEnabled()) {
|
|
2525
|
+
recycle_log_number = wal_recycle_files_.front();
|
|
2421
2526
|
}
|
|
2422
2527
|
uint64_t new_log_number =
|
|
2423
|
-
creating_new_log ? versions_->NewFileNumber() :
|
|
2424
|
-
|
|
2528
|
+
creating_new_log ? versions_->NewFileNumber() : cur_wal_number_;
|
|
2529
|
+
// For use outside of holding DB mutex
|
|
2530
|
+
const MutableCFOptions mutable_cf_options_copy =
|
|
2531
|
+
cfd->GetLatestMutableCFOptions();
|
|
2425
2532
|
|
|
2426
2533
|
// Set memtable_info for memtable sealed callback
|
|
2427
2534
|
// TODO: memtable_info for `new_imm`
|
|
@@ -2431,7 +2538,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2431
2538
|
memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
|
|
2432
2539
|
memtable_info.num_entries = cfd->mem()->NumEntries();
|
|
2433
2540
|
memtable_info.num_deletes = cfd->mem()->NumDeletion();
|
|
2434
|
-
if (!cfd->ioptions()
|
|
2541
|
+
if (!cfd->ioptions().persist_user_defined_timestamps &&
|
|
2435
2542
|
cfd->user_comparator()->timestamp_size() > 0) {
|
|
2436
2543
|
const Slice& newest_udt = cfd->mem()->GetNewestUDT();
|
|
2437
2544
|
memtable_info.newest_udt.assign(newest_udt.data(), newest_udt.size());
|
|
@@ -2440,13 +2547,22 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2440
2547
|
// flush happens before logging, but that should be ok.
|
|
2441
2548
|
int num_imm_unflushed = cfd->imm()->NumNotFlushed();
|
|
2442
2549
|
const auto preallocate_block_size =
|
|
2443
|
-
GetWalPreallocateBlockSize(
|
|
2550
|
+
GetWalPreallocateBlockSize(mutable_cf_options_copy.write_buffer_size);
|
|
2444
2551
|
mutex_.Unlock();
|
|
2445
2552
|
if (creating_new_log) {
|
|
2553
|
+
PredecessorWALInfo info;
|
|
2554
|
+
wal_write_mutex_.Lock();
|
|
2555
|
+
if (!logs_.empty()) {
|
|
2556
|
+
log::Writer* cur_log_writer = logs_.back().writer;
|
|
2557
|
+
info = PredecessorWALInfo(cur_log_writer->get_log_number(),
|
|
2558
|
+
cur_log_writer->file()->GetFileSize(),
|
|
2559
|
+
cur_log_writer->GetLastSeqnoRecorded());
|
|
2560
|
+
}
|
|
2561
|
+
wal_write_mutex_.Unlock();
|
|
2446
2562
|
// TODO: Write buffer size passed in should be max of all CF's instead
|
|
2447
2563
|
// of mutable_cf_options.write_buffer_size.
|
|
2448
2564
|
io_s = CreateWAL(write_options, new_log_number, recycle_log_number,
|
|
2449
|
-
preallocate_block_size, &new_log);
|
|
2565
|
+
preallocate_block_size, info, &new_log);
|
|
2450
2566
|
if (s.ok()) {
|
|
2451
2567
|
s = io_s;
|
|
2452
2568
|
}
|
|
@@ -2464,8 +2580,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2464
2580
|
} else {
|
|
2465
2581
|
seq = versions_->LastSequence();
|
|
2466
2582
|
}
|
|
2467
|
-
new_mem =
|
|
2468
|
-
|
|
2583
|
+
new_mem = cfd->ConstructNewMemtable(mutable_cf_options_copy,
|
|
2584
|
+
/*earliest_seq=*/seq);
|
|
2469
2585
|
context->superversion_context.NewSuperVersion();
|
|
2470
2586
|
|
|
2471
2587
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
@@ -2483,11 +2599,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2483
2599
|
// concurrent full purges don't delete the file while we're recycling it.
|
|
2484
2600
|
// To achieve that we hold the old log number in the recyclable list until
|
|
2485
2601
|
// after it has been renamed.
|
|
2486
|
-
assert(
|
|
2487
|
-
|
|
2602
|
+
assert(wal_recycle_files_.front() == recycle_log_number);
|
|
2603
|
+
wal_recycle_files_.pop_front();
|
|
2488
2604
|
}
|
|
2489
2605
|
if (s.ok() && creating_new_log) {
|
|
2490
|
-
InstrumentedMutexLock l(&
|
|
2606
|
+
InstrumentedMutexLock l(&wal_write_mutex_);
|
|
2491
2607
|
assert(new_log != nullptr);
|
|
2492
2608
|
if (!logs_.empty()) {
|
|
2493
2609
|
// Always flush the buffer of the last log before switching to a new one
|
|
@@ -2509,11 +2625,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2509
2625
|
}
|
|
2510
2626
|
}
|
|
2511
2627
|
if (s.ok()) {
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
|
|
2515
|
-
logs_.emplace_back(
|
|
2516
|
-
|
|
2628
|
+
cur_wal_number_ = new_log_number;
|
|
2629
|
+
wal_empty_ = true;
|
|
2630
|
+
wal_dir_synced_ = false;
|
|
2631
|
+
logs_.emplace_back(cur_wal_number_, new_log);
|
|
2632
|
+
alive_wal_files_.emplace_back(cur_wal_number_);
|
|
2517
2633
|
}
|
|
2518
2634
|
}
|
|
2519
2635
|
|
|
@@ -2544,7 +2660,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2544
2660
|
// obsolete. So we should track the WAL obsoletion event before actually
|
|
2545
2661
|
// updating the empty CF's log number.
|
|
2546
2662
|
uint64_t min_wal_number_to_keep =
|
|
2547
|
-
versions_->PreComputeMinLogNumberWithUnflushedData(
|
|
2663
|
+
versions_->PreComputeMinLogNumberWithUnflushedData(cur_wal_number_);
|
|
2548
2664
|
if (min_wal_number_to_keep >
|
|
2549
2665
|
versions_->GetWalSet().GetMinWalNumberToKeep()) {
|
|
2550
2666
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
@@ -2579,7 +2695,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2579
2695
|
|
|
2580
2696
|
for (auto cf : empty_cfs) {
|
|
2581
2697
|
if (cf->IsEmpty()) {
|
|
2582
|
-
cf->SetLogNumber(
|
|
2698
|
+
cf->SetLogNumber(cur_wal_number_);
|
|
2583
2699
|
// MEMPURGE: No need to change this, because new adds
|
|
2584
2700
|
// should still receive new sequence numbers.
|
|
2585
2701
|
cf->mem()->SetCreationSeq(versions_->LastSequence());
|
|
@@ -2596,14 +2712,14 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2596
2712
|
// advance the log number. no need to persist this in the manifest
|
|
2597
2713
|
if (cf->IsEmpty()) {
|
|
2598
2714
|
if (creating_new_log) {
|
|
2599
|
-
cf->SetLogNumber(
|
|
2715
|
+
cf->SetLogNumber(cur_wal_number_);
|
|
2600
2716
|
}
|
|
2601
2717
|
cf->mem()->SetCreationSeq(versions_->LastSequence());
|
|
2602
2718
|
}
|
|
2603
2719
|
}
|
|
2604
2720
|
}
|
|
2605
2721
|
|
|
2606
|
-
cfd->mem()->SetNextLogNumber(
|
|
2722
|
+
cfd->mem()->SetNextLogNumber(cur_wal_number_);
|
|
2607
2723
|
assert(new_mem != nullptr);
|
|
2608
2724
|
cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
|
|
2609
2725
|
if (new_imm) {
|
|
@@ -2615,13 +2731,12 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
|
|
|
2615
2731
|
// we always try to flush all immutable memtable. For atomic flush, these
|
|
2616
2732
|
// two memtables will be marked eligible for flush in the same call to
|
|
2617
2733
|
// AssignAtomicFlushSeq().
|
|
2618
|
-
new_imm->SetNextLogNumber(
|
|
2734
|
+
new_imm->SetNextLogNumber(cur_wal_number_);
|
|
2619
2735
|
cfd->imm()->Add(new_imm, &context->memtables_to_free_);
|
|
2620
2736
|
}
|
|
2621
2737
|
new_mem->Ref();
|
|
2622
2738
|
cfd->SetMemtable(new_mem);
|
|
2623
|
-
InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context
|
|
2624
|
-
mutable_cf_options);
|
|
2739
|
+
InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context);
|
|
2625
2740
|
|
|
2626
2741
|
// Notify client that memtable is sealed, now that we have successfully
|
|
2627
2742
|
// installed a new memtable
|