@nxtedition/rocksdb 8.2.7 → 9.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/deps/rocksdb/rocksdb/CMakeLists.txt +7 -1
- package/deps/rocksdb/rocksdb/Makefile +22 -19
- package/deps/rocksdb/rocksdb/TARGETS +8 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +157 -61
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +43 -92
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +632 -455
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +244 -149
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +41 -13
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +11 -1
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +216 -17
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +7 -5
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +279 -199
- package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +2 -1
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +159 -8
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +28 -2
- package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/sharded_cache.h +8 -0
- package/deps/rocksdb/rocksdb/crash_test.mk +14 -0
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +1 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +2 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +1 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +18 -21
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +1 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +2 -3
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/builder.cc +32 -7
- package/deps/rocksdb/rocksdb/db/c.cc +169 -6
- package/deps/rocksdb/rocksdb/db/c_test.c +104 -6
- package/deps/rocksdb/rocksdb/db/column_family.cc +98 -47
- package/deps/rocksdb/rocksdb/db/column_family.h +25 -2
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +213 -2
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +93 -23
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +33 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +7 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +107 -43
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +4 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +25 -17
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -11
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +29 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +24 -31
- package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +3 -1
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +19 -19
- package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/convenience.cc +20 -3
- package/deps/rocksdb/rocksdb/db/convenience_impl.h +15 -0
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +17 -0
- package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +1 -0
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +17 -3
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +5 -0
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +15 -15
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +666 -44
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +2 -29
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +274 -1
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +40 -19
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +6 -5
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +250 -116
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +51 -23
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +354 -96
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +6 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +2 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +5 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +50 -21
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +26 -13
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +13 -5
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +61 -21
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +8 -87
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +7 -1
- package/deps/rocksdb/rocksdb/db/db_iter.cc +2 -2
- package/deps/rocksdb/rocksdb/db/db_iter.h +1 -0
- package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +4 -11
- package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +39 -29
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +26 -36
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +106 -0
- package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +12 -3
- package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +1 -0
- package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +279 -166
- package/deps/rocksdb/rocksdb/db/db_test.cc +48 -21
- package/deps/rocksdb/rocksdb/db/db_test2.cc +81 -12
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +14 -6
- package/deps/rocksdb/rocksdb/db/db_test_util.h +40 -0
- package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +13 -1
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +233 -0
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +143 -0
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -2
- package/deps/rocksdb/rocksdb/db/dbformat.cc +36 -0
- package/deps/rocksdb/rocksdb/db/dbformat.h +169 -20
- package/deps/rocksdb/rocksdb/db/dbformat_test.cc +129 -0
- package/deps/rocksdb/rocksdb/db/error_handler.cc +16 -0
- package/deps/rocksdb/rocksdb/db/error_handler.h +6 -3
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +4 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +2 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +17 -8
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +86 -4
- package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/file_indexer.cc +2 -4
- package/deps/rocksdb/rocksdb/db/flush_job.cc +101 -11
- package/deps/rocksdb/rocksdb/db/flush_job.h +24 -1
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +88 -11
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +2 -3
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +159 -91
- package/deps/rocksdb/rocksdb/db/import_column_family_job.h +19 -10
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +143 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -1
- package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
- package/deps/rocksdb/rocksdb/db/listener_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/log_reader.h +3 -2
- package/deps/rocksdb/rocksdb/db/log_test.cc +17 -21
- package/deps/rocksdb/rocksdb/db/log_writer.cc +1 -1
- package/deps/rocksdb/rocksdb/db/log_writer.h +3 -2
- package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +4 -3
- package/deps/rocksdb/rocksdb/db/memtable.cc +52 -13
- package/deps/rocksdb/rocksdb/db/memtable.h +45 -1
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +44 -10
- package/deps/rocksdb/rocksdb/db/memtable_list.h +32 -1
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +90 -4
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +2 -2
- package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +1 -0
- package/deps/rocksdb/rocksdb/db/repair.cc +21 -4
- package/deps/rocksdb/rocksdb/db/repair_test.cc +143 -2
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -4
- package/deps/rocksdb/rocksdb/db/table_cache.cc +44 -35
- package/deps/rocksdb/rocksdb/db/table_cache.h +6 -6
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
- package/deps/rocksdb/rocksdb/db/version_builder.cc +0 -1
- package/deps/rocksdb/rocksdb/db/version_builder_test.cc +236 -204
- package/deps/rocksdb/rocksdb/db/version_edit.cc +66 -4
- package/deps/rocksdb/rocksdb/db/version_edit.h +48 -6
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +80 -8
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +12 -0
- package/deps/rocksdb/rocksdb/db/version_edit_test.cc +86 -17
- package/deps/rocksdb/rocksdb/db/version_set.cc +136 -41
- package/deps/rocksdb/rocksdb/db/version_set.h +28 -7
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +25 -15
- package/deps/rocksdb/rocksdb/db/write_batch.cc +11 -0
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +3 -0
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +16 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +2 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +42 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +32 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +247 -120
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +9 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +13 -6
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +2 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +15 -27
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +264 -69
- package/deps/rocksdb/rocksdb/env/env.cc +1 -2
- package/deps/rocksdb/rocksdb/env/env_encryption.cc +11 -165
- package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +0 -17
- package/deps/rocksdb/rocksdb/env/env_posix.cc +6 -2
- package/deps/rocksdb/rocksdb/env/env_test.cc +86 -2
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
- package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +78 -0
- package/deps/rocksdb/rocksdb/env/unique_id_gen.h +34 -0
- package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -0
- package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +15 -4
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +52 -43
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +34 -18
- package/deps/rocksdb/rocksdb/file/file_util.cc +10 -5
- package/deps/rocksdb/rocksdb/file/file_util.h +13 -1
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +724 -79
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +64 -33
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +3 -16
- package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +23 -12
- package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +153 -88
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +70 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +50 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +16 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +55 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +32 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +9 -109
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +90 -13
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +85 -17
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +13 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +5 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +21 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +7 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +6 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +33 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +14 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +33 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +0 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +3 -0
- package/deps/rocksdb/rocksdb/memory/arena_test.cc +18 -11
- package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +2 -1
- package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +69 -34
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +16 -1
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +10 -0
- package/deps/rocksdb/rocksdb/options/cf_options.cc +19 -0
- package/deps/rocksdb/rocksdb/options/cf_options.h +10 -2
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -1
- package/deps/rocksdb/rocksdb/options/db_options.cc +7 -0
- package/deps/rocksdb/rocksdb/options/db_options.h +1 -0
- package/deps/rocksdb/rocksdb/options/options.cc +15 -1
- package/deps/rocksdb/rocksdb/options/options_helper.cc +6 -0
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -3
- package/deps/rocksdb/rocksdb/options/options_test.cc +8 -0
- package/deps/rocksdb/rocksdb/port/mmap.h +20 -0
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +27 -12
- package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -1
- package/deps/rocksdb/rocksdb/src.mk +3 -0
- package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/block.cc +48 -22
- package/deps/rocksdb/rocksdb/table/block_based/block.h +60 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +115 -42
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +6 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +60 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +62 -44
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +36 -14
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +38 -15
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +219 -51
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +41 -8
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +50 -21
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +11 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +195 -55
- package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +31 -16
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +97 -58
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +6 -0
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +27 -12
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +3 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +114 -70
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +9 -6
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +15 -3
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +6 -3
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +11 -11
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -0
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +1 -0
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +6 -2
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +1 -2
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +2 -3
- package/deps/rocksdb/rocksdb/table/format.cc +175 -33
- package/deps/rocksdb/rocksdb/table/format.h +63 -10
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +10 -2
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +12 -4
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -0
- package/deps/rocksdb/rocksdb/table/mock_table.cc +8 -3
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +10 -5
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +10 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +1 -2
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +3 -3
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +12 -3
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +26 -1
- package/deps/rocksdb/rocksdb/table/table_builder.h +6 -2
- package/deps/rocksdb/rocksdb/table/table_properties.cc +6 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +52 -22
- package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +19 -7
- package/deps/rocksdb/rocksdb/test_util/sync_point.h +3 -1
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +29 -0
- package/deps/rocksdb/rocksdb/test_util/testutil.h +19 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +65 -26
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +8 -5
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +1 -0
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +1 -0
- package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +0 -1
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +4 -0
- package/deps/rocksdb/rocksdb/unreleased_history/README.txt +73 -0
- package/deps/rocksdb/rocksdb/unreleased_history/add.sh +27 -0
- package/deps/rocksdb/rocksdb/unreleased_history/behavior_changes/.gitkeep +0 -0
- package/deps/rocksdb/rocksdb/unreleased_history/bug_fixes/.gitkeep +0 -0
- package/deps/rocksdb/rocksdb/unreleased_history/new_features/.gitkeep +0 -0
- package/deps/rocksdb/rocksdb/unreleased_history/performance_improvements/.gitkeep +0 -0
- package/deps/rocksdb/rocksdb/unreleased_history/public_api_changes/.gitkeep +0 -0
- package/deps/rocksdb/rocksdb/unreleased_history/release.sh +104 -0
- package/deps/rocksdb/rocksdb/util/async_file_reader.cc +5 -0
- package/deps/rocksdb/rocksdb/util/bloom_impl.h +3 -3
- package/deps/rocksdb/rocksdb/util/cast_util.h +14 -0
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -0
- package/deps/rocksdb/rocksdb/util/comparator.cc +29 -7
- package/deps/rocksdb/rocksdb/util/compression.cc +4 -4
- package/deps/rocksdb/rocksdb/util/compression.h +110 -32
- package/deps/rocksdb/rocksdb/util/core_local.h +2 -1
- package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +4 -4
- package/deps/rocksdb/rocksdb/util/filelock_test.cc +3 -0
- package/deps/rocksdb/rocksdb/util/hash.h +7 -3
- package/deps/rocksdb/rocksdb/util/hash_test.cc +44 -0
- package/deps/rocksdb/rocksdb/util/math.h +58 -6
- package/deps/rocksdb/rocksdb/util/math128.h +29 -7
- package/deps/rocksdb/rocksdb/util/mutexlock.h +35 -27
- package/deps/rocksdb/rocksdb/util/single_thread_executor.h +1 -0
- package/deps/rocksdb/rocksdb/util/stop_watch.h +1 -1
- package/deps/rocksdb/rocksdb/util/thread_operation.h +8 -1
- package/deps/rocksdb/rocksdb/util/udt_util.cc +343 -0
- package/deps/rocksdb/rocksdb/util/udt_util.h +173 -1
- package/deps/rocksdb/rocksdb/util/udt_util_test.cc +447 -0
- package/deps/rocksdb/rocksdb/util/write_batch_util.cc +25 -0
- package/deps/rocksdb/rocksdb/util/write_batch_util.h +80 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +69 -25
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +7 -6
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +2 -3
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +6 -11
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -2
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +4 -5
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +2 -1
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +1 -2
- package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +2 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc +23 -8
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +9 -6
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +37 -12
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +231 -33
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +0 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +76 -20
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +18 -9
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +40 -23
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +13 -12
- package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +7 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +41 -11
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +6 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +71 -24
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +19 -4
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +60 -107
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +39 -11
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h +6 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +14 -8
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +10 -5
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +6 -6
- package/deps/rocksdb/rocksdb.gyp +2 -0
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -12,10 +12,13 @@
|
|
|
12
12
|
|
|
13
13
|
#include <atomic>
|
|
14
14
|
#include <functional>
|
|
15
|
+
#include <memory>
|
|
15
16
|
#include <mutex>
|
|
16
17
|
#include <thread>
|
|
17
18
|
|
|
18
19
|
#include "port/port.h"
|
|
20
|
+
#include "util/fastrange.h"
|
|
21
|
+
#include "util/hash.h"
|
|
19
22
|
|
|
20
23
|
namespace ROCKSDB_NAMESPACE {
|
|
21
24
|
|
|
@@ -129,10 +132,25 @@ class SpinMutex {
|
|
|
129
132
|
std::atomic<bool> locked_;
|
|
130
133
|
};
|
|
131
134
|
|
|
132
|
-
//
|
|
135
|
+
// For preventing false sharing, especially for mutexes.
|
|
136
|
+
// NOTE: if a mutex is less than half the size of a cache line, it would
|
|
137
|
+
// make more sense for Striped structure below to pack more than one mutex
|
|
138
|
+
// into each cache line, as this would only reduce contention for the same
|
|
139
|
+
// amount of space and cache sharing. However, a mutex is often 40 bytes out
|
|
140
|
+
// of a 64 byte cache line.
|
|
133
141
|
template <class T>
|
|
134
|
-
struct ALIGN_AS(CACHE_LINE_SIZE)
|
|
135
|
-
T
|
|
142
|
+
struct ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedWrapper {
|
|
143
|
+
T obj_;
|
|
144
|
+
};
|
|
145
|
+
template <class T>
|
|
146
|
+
struct Unwrap {
|
|
147
|
+
using type = T;
|
|
148
|
+
static type &Go(T &t) { return t; }
|
|
149
|
+
};
|
|
150
|
+
template <class T>
|
|
151
|
+
struct Unwrap<CacheAlignedWrapper<T>> {
|
|
152
|
+
using type = T;
|
|
153
|
+
static type &Go(CacheAlignedWrapper<T> &t) { return t.obj_; }
|
|
136
154
|
};
|
|
137
155
|
|
|
138
156
|
//
|
|
@@ -144,38 +162,28 @@ struct ALIGN_AS(CACHE_LINE_SIZE) LockData {
|
|
|
144
162
|
// single lock and allowing independent operations to lock different stripes and
|
|
145
163
|
// proceed concurrently, instead of creating contention for a single lock.
|
|
146
164
|
//
|
|
147
|
-
template <class T, class
|
|
165
|
+
template <class T, class Key = Slice, class Hash = SliceNPHasher64>
|
|
148
166
|
class Striped {
|
|
149
167
|
public:
|
|
150
|
-
Striped(size_t
|
|
151
|
-
:
|
|
152
|
-
locks_ = reinterpret_cast<LockData<T> *>(
|
|
153
|
-
port::cacheline_aligned_alloc(sizeof(LockData<T>) * stripes));
|
|
154
|
-
for (size_t i = 0; i < stripes; i++) {
|
|
155
|
-
new (&locks_[i]) LockData<T>();
|
|
156
|
-
}
|
|
157
|
-
}
|
|
168
|
+
explicit Striped(size_t stripe_count)
|
|
169
|
+
: stripe_count_(stripe_count), data_(new T[stripe_count]) {}
|
|
158
170
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
locks_[i].~LockData<T>();
|
|
164
|
-
}
|
|
165
|
-
port::cacheline_aligned_free(locks_);
|
|
166
|
-
}
|
|
171
|
+
using Unwrapped = typename Unwrap<T>::type;
|
|
172
|
+
Unwrapped &Get(const Key &key, uint64_t seed = 0) {
|
|
173
|
+
size_t index = FastRangeGeneric(hash_(key, seed), stripe_count_);
|
|
174
|
+
return Unwrap<T>::Go(data_[index]);
|
|
167
175
|
}
|
|
168
176
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
return
|
|
177
|
+
size_t ApproximateMemoryUsage() const {
|
|
178
|
+
// NOTE: could use malloc_usable_size() here, but that could count unmapped
|
|
179
|
+
// pages and could mess up unit test OccLockBucketsTest::CacheAligned
|
|
180
|
+
return sizeof(*this) + stripe_count_ * sizeof(T);
|
|
173
181
|
}
|
|
174
182
|
|
|
175
183
|
private:
|
|
176
|
-
size_t
|
|
177
|
-
|
|
178
|
-
|
|
184
|
+
size_t stripe_count_;
|
|
185
|
+
std::unique_ptr<T[]> data_;
|
|
186
|
+
Hash hash_;
|
|
179
187
|
};
|
|
180
188
|
|
|
181
189
|
} // namespace ROCKSDB_NAMESPACE
|
|
@@ -32,7 +32,7 @@ class StopWatch {
|
|
|
32
32
|
elapsed_(elapsed),
|
|
33
33
|
overwrite_(overwrite),
|
|
34
34
|
stats_enabled_(statistics &&
|
|
35
|
-
statistics->get_stats_level()
|
|
35
|
+
statistics->get_stats_level() >
|
|
36
36
|
StatsLevel::kExceptTimers &&
|
|
37
37
|
(hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX ||
|
|
38
38
|
hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX)),
|
|
@@ -39,7 +39,14 @@ static OperationInfo global_operation_table[] = {
|
|
|
39
39
|
{ThreadStatus::OP_UNKNOWN, ""},
|
|
40
40
|
{ThreadStatus::OP_COMPACTION, "Compaction"},
|
|
41
41
|
{ThreadStatus::OP_FLUSH, "Flush"},
|
|
42
|
-
{ThreadStatus::OP_DBOPEN, "DBOpen"}
|
|
42
|
+
{ThreadStatus::OP_DBOPEN, "DBOpen"},
|
|
43
|
+
{ThreadStatus::OP_GET, "Get"},
|
|
44
|
+
{ThreadStatus::OP_MULTIGET, "MultiGet"},
|
|
45
|
+
{ThreadStatus::OP_DBITERATOR, "DBIterator"},
|
|
46
|
+
{ThreadStatus::OP_VERIFY_DB_CHECKSUM, "VerifyDBChecksum"},
|
|
47
|
+
{ThreadStatus::OP_VERIFY_FILE_CHECKSUMS, "VerifyFileChecksums"},
|
|
48
|
+
|
|
49
|
+
};
|
|
43
50
|
|
|
44
51
|
struct OperationStageInfo {
|
|
45
52
|
const ThreadStatus::OperationStage stage;
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
//
|
|
3
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
4
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
5
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
6
|
+
|
|
7
|
+
#include "util/udt_util.h"
|
|
8
|
+
|
|
9
|
+
#include "db/dbformat.h"
|
|
10
|
+
#include "rocksdb/types.h"
|
|
11
|
+
#include "util/write_batch_util.h"
|
|
12
|
+
|
|
13
|
+
namespace ROCKSDB_NAMESPACE {
|
|
14
|
+
namespace {
|
|
15
|
+
enum class RecoveryType {
|
|
16
|
+
kNoop,
|
|
17
|
+
kUnrecoverable,
|
|
18
|
+
kStripTimestamp,
|
|
19
|
+
kPadTimestamp,
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
RecoveryType GetRecoveryType(const size_t running_ts_sz,
|
|
23
|
+
const std::optional<size_t>& recorded_ts_sz) {
|
|
24
|
+
if (running_ts_sz == 0) {
|
|
25
|
+
if (!recorded_ts_sz.has_value()) {
|
|
26
|
+
// A column family id not recorded is equivalent to that column family has
|
|
27
|
+
// zero timestamp size.
|
|
28
|
+
return RecoveryType::kNoop;
|
|
29
|
+
}
|
|
30
|
+
return RecoveryType::kStripTimestamp;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
assert(running_ts_sz != 0);
|
|
34
|
+
|
|
35
|
+
if (!recorded_ts_sz.has_value()) {
|
|
36
|
+
return RecoveryType::kPadTimestamp;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
if (running_ts_sz != *recorded_ts_sz) {
|
|
40
|
+
return RecoveryType::kUnrecoverable;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
return RecoveryType::kNoop;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
bool AllRunningColumnFamiliesConsistent(
|
|
47
|
+
const UnorderedMap<uint32_t, size_t>& running_ts_sz,
|
|
48
|
+
const UnorderedMap<uint32_t, size_t>& record_ts_sz) {
|
|
49
|
+
for (const auto& [cf_id, ts_sz] : running_ts_sz) {
|
|
50
|
+
auto record_it = record_ts_sz.find(cf_id);
|
|
51
|
+
RecoveryType recovery_type =
|
|
52
|
+
GetRecoveryType(ts_sz, record_it != record_ts_sz.end()
|
|
53
|
+
? std::optional<size_t>(record_it->second)
|
|
54
|
+
: std::nullopt);
|
|
55
|
+
if (recovery_type != RecoveryType::kNoop) {
|
|
56
|
+
return false;
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
return true;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// Inspects every column family id referenced by `batch` and compares its
// running timestamp size against the recorded one.
//
// Column families absent from `running_ts_sz` (dropped) are skipped.
// In `kVerifyConsistency` mode any inconsistency yields InvalidArgument.
// Otherwise only an unrecoverable inconsistency yields InvalidArgument, and
// `*ts_need_recovery` is set to true as soon as one column family needs
// reconciliation. `*ts_need_recovery` is never reset to false here.
Status CheckWriteBatchTimestampSizeConsistency(
    const WriteBatch* batch,
    const UnorderedMap<uint32_t, size_t>& running_ts_sz,
    const UnorderedMap<uint32_t, size_t>& record_ts_sz,
    TimestampSizeConsistencyMode check_mode, bool* ts_need_recovery) {
  std::vector<uint32_t> referenced_cf_ids;
  Status s = CollectColumnFamilyIdsFromWriteBatch(*batch, &referenced_cf_ids);
  if (!s.ok()) {
    return s;
  }

  const bool verify_only =
      check_mode == TimestampSizeConsistencyMode::kVerifyConsistency;
  for (const auto& cf_id : referenced_cf_ids) {
    const auto running_entry = running_ts_sz.find(cf_id);
    if (running_entry == running_ts_sz.end()) {
      // Ignore dropped column family referred to in a WriteBatch regardless of
      // its consistency.
      continue;
    }

    std::optional<size_t> recorded_sz;
    const auto recorded_entry = record_ts_sz.find(cf_id);
    if (recorded_entry != record_ts_sz.end()) {
      recorded_sz = recorded_entry->second;
    }

    const RecoveryType action =
        GetRecoveryType(running_entry->second, recorded_sz);
    if (action == RecoveryType::kNoop) {
      continue;
    }
    if (verify_only) {
      return Status::InvalidArgument(
          "WriteBatch contains timestamp size inconsistency.");
    }
    if (action == RecoveryType::kUnrecoverable) {
      return Status::InvalidArgument(
          "WriteBatch contains unrecoverable timestamp size inconsistency.");
    }
    // One column family needing reconciliation is enough to require the whole
    // WriteBatch to be rebuilt.
    *ts_need_recovery = true;
  }
  return Status::OK();
}
|
|
103
|
+
|
|
104
|
+
// Outcome of comparing a column family's new comparator against the name of
// its old comparator, as computed by `CompareComparator`.
enum class ToggleUDT {
  // The new and old comparator names are identical.
  kUnchanged,
  // The new comparator name is the old name followed by ".u64ts".
  kEnableUDT,
  // The old comparator name is the new name followed by ".u64ts".
  kDisableUDT,
  // The names differ in any other way.
  kInvalidChange,
};
|
|
110
|
+
|
|
111
|
+
// Classifies the change from the comparator named `old_comparator_name` to
// `new_comparator` purely by comparing names:
//   - identical names                       -> kUnchanged
//   - new name == old name + ".u64ts"       -> kEnableUDT
//   - old name == new name + ".u64ts"       -> kDisableUDT
//   - anything else                         -> kInvalidChange
// The new comparator's timestamp size is only consulted by debug assertions.
ToggleUDT CompareComparator(const Comparator* new_comparator,
                            const std::string& old_comparator_name) {
  static const char* kUDTSuffix = ".u64ts";
  static const Slice kSuffixSlice = kUDTSuffix;
  static const size_t kSuffixSize = 6;

  const size_t new_ts_sz = new_comparator->timestamp_size();
  // Silences the unused-variable warning when assertions are compiled out.
  (void)new_ts_sz;
  const Slice new_name(new_comparator->Name());
  const Slice old_name(old_comparator_name);

  // True when `longer` is exactly `shorter` with the UDT suffix appended.
  const auto is_suffixed_form_of = [](const Slice& longer,
                                      const Slice& shorter) {
    return longer.size() == shorter.size() + kSuffixSize &&
           longer.starts_with(shorter) && longer.ends_with(kSuffixSlice);
  };

  if (new_name.compare(old_name) == 0) {
    return ToggleUDT::kUnchanged;
  }
  if (is_suffixed_form_of(new_name, old_name)) {
    assert(new_ts_sz == 8);
    return ToggleUDT::kEnableUDT;
  }
  if (is_suffixed_form_of(old_name, new_name)) {
    assert(new_ts_sz == 0);
    return ToggleUDT::kDisableUDT;
  }
  return ToggleUDT::kInvalidChange;
}
|
|
137
|
+
} // namespace
|
|
138
|
+
|
|
139
|
+
// Creates a valid handler that owns a fresh, empty WriteBatch.
//
// `running_ts_sz` and `record_ts_sz` are bound to reference members, so both
// maps must outlive the handler.
TimestampRecoveryHandler::TimestampRecoveryHandler(
    const UnorderedMap<uint32_t, size_t>& running_ts_sz,
    const UnorderedMap<uint32_t, size_t>& record_ts_sz)
    : running_ts_sz_(running_ts_sz),
      record_ts_sz_(record_ts_sz),
      // make_unique instead of a raw `new` expression.
      new_batch_(std::make_unique<WriteBatch>()),
      handler_valid_(true),
      new_batch_diff_from_orig_batch_(false) {}
|
|
147
|
+
|
|
148
|
+
// Re-emits a Put into the new batch with its key reconciled to the running
// timestamp size of column family `cf`. Returns the reconciliation error, if
// any, without touching the new batch.
Status TimestampRecoveryHandler::PutCF(uint32_t cf, const Slice& key,
                                       const Slice& value) {
  // Backing storage for the reconciled key when it has to be rebuilt.
  std::string reconciled_storage;
  Slice reconciled_key;
  Status s = ReconcileTimestampDiscrepancy(cf, key, &reconciled_storage,
                                           &reconciled_key);
  if (s.ok()) {
    s = WriteBatchInternal::Put(new_batch_.get(), cf, reconciled_key, value);
  }
  return s;
}
|
|
159
|
+
|
|
160
|
+
// Re-emits a Delete into the new batch with its key reconciled to the running
// timestamp size of column family `cf`. Returns the reconciliation error, if
// any, without touching the new batch.
Status TimestampRecoveryHandler::DeleteCF(uint32_t cf, const Slice& key) {
  // Backing storage for the reconciled key when it has to be rebuilt.
  std::string reconciled_storage;
  Slice reconciled_key;
  Status s = ReconcileTimestampDiscrepancy(cf, key, &reconciled_storage,
                                           &reconciled_key);
  if (s.ok()) {
    s = WriteBatchInternal::Delete(new_batch_.get(), cf, reconciled_key);
  }
  return s;
}
|
|
170
|
+
|
|
171
|
+
// Re-emits a SingleDelete into the new batch with its key reconciled to the
// running timestamp size of column family `cf`. Returns the reconciliation
// error, if any, without touching the new batch.
Status TimestampRecoveryHandler::SingleDeleteCF(uint32_t cf, const Slice& key) {
  // Backing storage for the reconciled key when it has to be rebuilt.
  std::string reconciled_storage;
  Slice reconciled_key;
  Status s = ReconcileTimestampDiscrepancy(cf, key, &reconciled_storage,
                                           &reconciled_key);
  if (s.ok()) {
    s = WriteBatchInternal::SingleDelete(new_batch_.get(), cf, reconciled_key);
  }
  return s;
}
|
|
181
|
+
|
|
182
|
+
// Re-emits a DeleteRange into the new batch with both range bounds reconciled
// to the running timestamp size of column family `cf`. The begin key is
// reconciled first; the end key is only processed if that succeeds. Any
// reconciliation error is returned without touching the new batch.
Status TimestampRecoveryHandler::DeleteRangeCF(uint32_t cf,
                                               const Slice& begin_key,
                                               const Slice& end_key) {
  // Each bound gets its own backing storage because both reconciled slices
  // must stay alive until the DeleteRange call below.
  std::string begin_storage;
  std::string end_storage;
  Slice reconciled_begin;
  Slice reconciled_end;

  Status s = ReconcileTimestampDiscrepancy(cf, begin_key, &begin_storage,
                                           &reconciled_begin);
  if (s.ok()) {
    s = ReconcileTimestampDiscrepancy(cf, end_key, &end_storage,
                                      &reconciled_end);
  }
  if (s.ok()) {
    s = WriteBatchInternal::DeleteRange(new_batch_.get(), cf, reconciled_begin,
                                        reconciled_end);
  }
  return s;
}
|
|
202
|
+
|
|
203
|
+
// Re-emits a Merge into the new batch with its key reconciled to the running
// timestamp size of column family `cf`. Returns the reconciliation error, if
// any, without touching the new batch.
Status TimestampRecoveryHandler::MergeCF(uint32_t cf, const Slice& key,
                                         const Slice& value) {
  // Backing storage for the reconciled key when it has to be rebuilt.
  std::string reconciled_storage;
  Slice reconciled_key;
  Status s = ReconcileTimestampDiscrepancy(cf, key, &reconciled_storage,
                                           &reconciled_key);
  if (s.ok()) {
    s = WriteBatchInternal::Merge(new_batch_.get(), cf, reconciled_key, value);
  }
  return s;
}
|
|
214
|
+
|
|
215
|
+
// Re-emits a PutBlobIndex into the new batch with its key reconciled to the
// running timestamp size of column family `cf`. Returns the reconciliation
// error, if any, without touching the new batch.
Status TimestampRecoveryHandler::PutBlobIndexCF(uint32_t cf, const Slice& key,
                                                const Slice& value) {
  // Backing storage for the reconciled key when it has to be rebuilt.
  std::string reconciled_storage;
  Slice reconciled_key;
  Status s = ReconcileTimestampDiscrepancy(cf, key, &reconciled_storage,
                                           &reconciled_key);
  if (s.ok()) {
    s = WriteBatchInternal::PutBlobIndex(new_batch_.get(), cf, reconciled_key,
                                         value);
  }
  return s;
}
|
|
226
|
+
|
|
227
|
+
// Produces in `*new_key` the form of `key` that matches the running timestamp
// size of column family `cf`.
//
// `*new_key` either aliases `key` (no change, or a stripped view of it) or
// points into `*new_key_buf` (padded key), so both must stay alive while
// `*new_key` is in use. `new_batch_diff_from_orig_batch_` is set whenever the
// key is stripped or padded.
//
// Returns InvalidArgument when the recorded and running sizes cannot be
// reconciled; otherwise OK.
Status TimestampRecoveryHandler::ReconcileTimestampDiscrepancy(
    uint32_t cf, const Slice& key, std::string* new_key_buf, Slice* new_key) {
  assert(handler_valid_);

  const auto running_entry = running_ts_sz_.find(cf);
  if (running_entry == running_ts_sz_.end()) {
    // The column family referred to by the WriteBatch is no longer running.
    // Copy over the entry as is to the new WriteBatch.
    *new_key = key;
    return Status::OK();
  }
  const size_t running_sz = running_entry->second;

  // Stays empty unless the WAL recorded a size for this column family.
  std::optional<size_t> recorded_sz;
  const auto recorded_entry = record_ts_sz_.find(cf);
  if (recorded_entry != record_ts_sz_.end()) {
    recorded_sz = recorded_entry->second;
  }

  const RecoveryType action = GetRecoveryType(running_sz, recorded_sz);
  if (action == RecoveryType::kUnrecoverable) {
    return Status::InvalidArgument(
        "Unrecoverable timestamp size inconsistency encountered by "
        "TimestampRecoveryHandler.");
  }

  if (action == RecoveryType::kNoop) {
    *new_key = key;
  } else if (action == RecoveryType::kStripTimestamp) {
    assert(recorded_sz.has_value());
    *new_key = StripTimestampFromUserKey(key, *recorded_sz);
    new_batch_diff_from_orig_batch_ = true;
  } else if (action == RecoveryType::kPadTimestamp) {
    AppendKeyWithMinTimestamp(new_key_buf, key, running_sz);
    *new_key = *new_key_buf;
    new_batch_diff_from_orig_batch_ = true;
  } else {
    // Unknown recovery type: programming error. `*new_key` is left untouched.
    assert(false);
  }
  return Status::OK();
}
|
|
268
|
+
|
|
269
|
+
// Checks `batch` for timestamp size inconsistencies between `running_ts_sz`
// and `record_ts_sz` and, when reconciliation is needed, rebuilds it.
//
// Returns OK immediately when every running column family is consistent.
// Otherwise returns the error reported by the per-batch consistency check, or
// the error reported while rebuilding the batch. When a rebuild happens, the
// new batch is stored in `*new_batch` and carries the sequence number of the
// original batch.
//
// `new_batch` must be non-null whenever reconciliation is needed. This is a
// caller error: it trips the assertion in debug builds and yields
// InvalidArgument (instead of a null dereference) when assertions are
// compiled out.
Status HandleWriteBatchTimestampSizeDifference(
    const WriteBatch* batch,
    const UnorderedMap<uint32_t, size_t>& running_ts_sz,
    const UnorderedMap<uint32_t, size_t>& record_ts_sz,
    TimestampSizeConsistencyMode check_mode,
    std::unique_ptr<WriteBatch>* new_batch) {
  // Quick path to bypass checking the WriteBatch.
  if (AllRunningColumnFamiliesConsistent(running_ts_sz, record_ts_sz)) {
    return Status::OK();
  }

  bool need_recovery = false;
  Status status = CheckWriteBatchTimestampSizeConsistency(
      batch, running_ts_sz, record_ts_sz, check_mode, &need_recovery);
  if (!status.ok() || !need_recovery) {
    // Either an intolerable inconsistency, or nothing to rebuild.
    return status;
  }

  assert(new_batch);
  if (new_batch == nullptr) {
    // Checked before iterating so no work is wasted on a batch that could not
    // be handed back to the caller.
    return Status::InvalidArgument(
        "new_batch must be provided to reconcile timestamp size "
        "inconsistency in a WriteBatch.");
  }

  // The rebuilt batch must keep the original sequence number.
  SequenceNumber sequence = WriteBatchInternal::Sequence(batch);
  TimestampRecoveryHandler recovery_handler(running_ts_sz, record_ts_sz);
  status = batch->Iterate(&recovery_handler);
  if (!status.ok()) {
    return status;
  }
  *new_batch = recovery_handler.TransferNewBatch();
  WriteBatchInternal::SetSequence(new_batch->get(), sequence);
  return Status::OK();
}
|
|
298
|
+
|
|
299
|
+
// Validates a column family's new comparator / persist flag against the old
// comparator name / persist flag.
//
// Returns OK for an accepted combination and InvalidArgument otherwise.
// `*mark_sst_files_has_no_udt` is set to true only when user-defined
// timestamps are being enabled with `new_persist_udt` false; it is not
// written on any other path.
Status ValidateUserDefinedTimestampsOptions(
    const Comparator* new_comparator, const std::string& old_comparator_name,
    bool new_persist_udt, bool old_persist_udt,
    bool* mark_sst_files_has_no_udt) {
  const size_t new_ts_sz = new_comparator->timestamp_size();
  const ToggleUDT change =
      CompareComparator(new_comparator, old_comparator_name);

  switch (change) {
    case ToggleUDT::kUnchanged: {
      // Flipping the persist flag only matters when timestamps are in use.
      const bool persist_flag_toggled = old_persist_udt != new_persist_udt;
      if (persist_flag_toggled && new_ts_sz != 0) {
        return Status::InvalidArgument(
            "Cannot toggle the persist_user_defined_timestamps flag for a column "
            "family with user-defined timestamps feature enabled.");
      }
      return Status::OK();
    }
    case ToggleUDT::kEnableUDT:
      if (new_persist_udt) {
        return Status::InvalidArgument(
            "Cannot open a column family and enable user-defined timestamps "
            "feature without setting persist_user_defined_timestamps flag to "
            "false.");
      }
      *mark_sst_files_has_no_udt = true;
      return Status::OK();
    case ToggleUDT::kDisableUDT:
      if (old_persist_udt) {
        return Status::InvalidArgument(
            "Cannot open a column family and disable user-defined timestamps "
            "feature if its existing persist_user_defined_timestamps flag is not "
            "false.");
      }
      return Status::OK();
    case ToggleUDT::kInvalidChange:
      return Status::InvalidArgument(
          new_comparator->Name(),
          "does not match existing comparator " + old_comparator_name);
    default:
      break;
  }
  return Status::InvalidArgument(
      "Unsupported user defined timestamps settings change.");
}
|
|
343
|
+
} // namespace ROCKSDB_NAMESPACE
|
|
@@ -1,15 +1,22 @@
|
|
|
1
|
-
// Copyright (c)
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
//
|
|
2
3
|
// This source code is licensed under both the GPLv2 (found in the
|
|
3
4
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
4
5
|
// (found in the LICENSE.Apache file in the root directory).
|
|
5
6
|
|
|
6
7
|
#pragma once
|
|
8
|
+
#include <memory>
|
|
9
|
+
#include <optional>
|
|
7
10
|
#include <sstream>
|
|
11
|
+
#include <unordered_map>
|
|
8
12
|
#include <vector>
|
|
9
13
|
|
|
14
|
+
#include "db/write_batch_internal.h"
|
|
10
15
|
#include "rocksdb/slice.h"
|
|
11
16
|
#include "rocksdb/status.h"
|
|
17
|
+
#include "rocksdb/write_batch.h"
|
|
12
18
|
#include "util/coding.h"
|
|
19
|
+
#include "util/hash_containers.h"
|
|
13
20
|
|
|
14
21
|
namespace ROCKSDB_NAMESPACE {
|
|
15
22
|
|
|
@@ -74,4 +81,169 @@ class UserDefinedTimestampSizeRecord {
|
|
|
74
81
|
std::vector<std::pair<uint32_t, size_t>> cf_to_ts_sz_;
|
|
75
82
|
};
|
|
76
83
|
|
|
84
|
+
// This handler is used to recover a WriteBatch read from WAL logs during
|
|
85
|
+
// recovery. It does a best-effort recovery if the column families contained in
|
|
86
|
+
// the WriteBatch have inconsistency between the recorded timestamp size and the
|
|
87
|
+
// running timestamp size. And creates a new WriteBatch that are consistent with
|
|
88
|
+
// the running timestamp size with entries from the original WriteBatch.
|
|
89
|
+
//
|
|
90
|
+
// Note that for a WriteBatch with no inconsistency, a new WriteBatch is created
|
|
91
|
+
// nonetheless, and it should be exactly the same as the original WriteBatch.
|
|
92
|
+
//
|
|
93
|
+
// To access the new WriteBatch, invoke `TransferNewBatch` after calling
|
|
94
|
+
// `Iterate`. The handler becomes invalid afterwards.
|
|
95
|
+
//
|
|
96
|
+
// For the user key in each entry, the best effort recovery means:
|
|
97
|
+
// 1) If recorded timestamp size is 0, running timestamp size is > 0, a min
|
|
98
|
+
// timestamp of length running timestamp size is padded to the user key.
|
|
99
|
+
// 2) If recorded timestamp size is > 0, running timestamp size is 0, the last
|
|
100
|
+
// bytes of length recorded timestamp size is stripped from user key.
|
|
101
|
+
// 3) If recorded timestamp size is the same as running timestamp size, no-op.
|
|
102
|
+
// 4) If recorded timestamp size and running timestamp size are both non-zero
|
|
103
|
+
// but not equal, return Status::InvalidArgument.
|
|
104
|
+
class TimestampRecoveryHandler : public WriteBatch::Handler {
|
|
105
|
+
public:
|
|
106
|
+
TimestampRecoveryHandler(const UnorderedMap<uint32_t, size_t>& running_ts_sz,
|
|
107
|
+
const UnorderedMap<uint32_t, size_t>& record_ts_sz);
|
|
108
|
+
|
|
109
|
+
~TimestampRecoveryHandler() override {}
|
|
110
|
+
|
|
111
|
+
// No copy or move.
|
|
112
|
+
TimestampRecoveryHandler(const TimestampRecoveryHandler&) = delete;
|
|
113
|
+
TimestampRecoveryHandler(TimestampRecoveryHandler&&) = delete;
|
|
114
|
+
TimestampRecoveryHandler& operator=(const TimestampRecoveryHandler&) = delete;
|
|
115
|
+
TimestampRecoveryHandler& operator=(TimestampRecoveryHandler&&) = delete;
|
|
116
|
+
|
|
117
|
+
Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override;
|
|
118
|
+
|
|
119
|
+
Status DeleteCF(uint32_t cf, const Slice& key) override;
|
|
120
|
+
|
|
121
|
+
Status SingleDeleteCF(uint32_t cf, const Slice& key) override;
|
|
122
|
+
|
|
123
|
+
Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
|
|
124
|
+
const Slice& end_key) override;
|
|
125
|
+
|
|
126
|
+
Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override;
|
|
127
|
+
|
|
128
|
+
Status PutBlobIndexCF(uint32_t cf, const Slice& key,
|
|
129
|
+
const Slice& value) override;
|
|
130
|
+
|
|
131
|
+
Status MarkBeginPrepare(bool) override { return Status::OK(); }
|
|
132
|
+
|
|
133
|
+
Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
|
|
134
|
+
|
|
135
|
+
Status MarkCommit(const Slice&) override { return Status::OK(); }
|
|
136
|
+
|
|
137
|
+
Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
|
|
138
|
+
return Status::OK();
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
Status MarkRollback(const Slice&) override { return Status::OK(); }
|
|
142
|
+
|
|
143
|
+
Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
|
|
144
|
+
|
|
145
|
+
std::unique_ptr<WriteBatch>&& TransferNewBatch() {
|
|
146
|
+
assert(new_batch_diff_from_orig_batch_);
|
|
147
|
+
handler_valid_ = false;
|
|
148
|
+
return std::move(new_batch_);
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
private:
|
|
152
|
+
Status ReconcileTimestampDiscrepancy(uint32_t cf, const Slice& key,
|
|
153
|
+
std::string* new_key_buf,
|
|
154
|
+
Slice* new_key);
|
|
155
|
+
|
|
156
|
+
// Mapping from column family id to user-defined timestamp size for all
|
|
157
|
+
// running column families including the ones with zero timestamp size.
|
|
158
|
+
const UnorderedMap<uint32_t, size_t>& running_ts_sz_;
|
|
159
|
+
|
|
160
|
+
// Mapping from column family id to user-defined timestamp size as recorded
|
|
161
|
+
// in the WAL. This only contains non-zero user-defined timestamp size.
|
|
162
|
+
const UnorderedMap<uint32_t, size_t>& record_ts_sz_;
|
|
163
|
+
|
|
164
|
+
std::unique_ptr<WriteBatch> new_batch_;
|
|
165
|
+
// Handler is valid upon creation and becomes invalid after its `new_batch_`
|
|
166
|
+
// is transferred.
|
|
167
|
+
bool handler_valid_;
|
|
168
|
+
|
|
169
|
+
// False upon creation, and become true if at least one user key from the
|
|
170
|
+
// original batch is updated when creating the new batch.
|
|
171
|
+
bool new_batch_diff_from_orig_batch_;
|
|
172
|
+
};
|
|
173
|
+
|
|
174
|
+
// Mode for checking and handling timestamp size inconsistency encountered in a
// WriteBatch read from the WAL.
enum class TimestampSizeConsistencyMode {
  // Verify that the recorded user-defined timestamp size is consistent with
  // the running one for all the column families involved in a WriteBatch.
  // Column families that are referred to in the WriteBatch but have been
  // dropped are ignored.
  kVerifyConsistency,
  // Verify that any inconsistency in a WriteBatch can be tolerated by a
  // best-effort reconciliation, and optionally create a new WriteBatch from
  // the original one that is consistent with the running timestamp size.
  // Column families that are referred to in the WriteBatch but have been
  // dropped are ignored. If a new WriteBatch is created, such entries are
  // copied over as is.
  kReconcileInconsistency,
};
|
|
189
|
+
|
|
190
|
+
// Handles the inconsistency between recorded timestamp sizes and running
// timestamp sizes for a WriteBatch. A non-OK status indicates there is an
// intolerable inconsistency under the specified `check_mode`.
//
// If `check_mode` is `kVerifyConsistency`, an intolerable inconsistency means
// any running column family has an inconsistent user-defined timestamp size.
//
// If `check_mode` is `kReconcileInconsistency`, an intolerable inconsistency
// means any running column family has an inconsistent user-defined timestamp
// size that cannot be reconciled with a best-effort recovery. Check
// `TimestampRecoveryHandler` for what a best-effort recovery is capable of. In
// this mode, the output argument `new_batch` should be set (it defaults to
// nullptr): a new WriteBatch is created on the heap and transferred to
// `new_batch` if there is a tolerable inconsistency.
//
// An invariant that WAL logging ensures is that all timestamp size info
// is logged prior to a WriteBatch that needs this info, and zero timestamp
// sizes are skipped. So `record_ts_sz` only contains column families with a
// non-zero timestamp size, and a column family id absent from `record_ts_sz`
// is interpreted as that column family having zero timestamp size. On the
// other hand, `running_ts_sz` should contain the timestamp size for all
// running column families including the ones with zero timestamp size.
Status HandleWriteBatchTimestampSizeDifference(
    const WriteBatch* batch,
    const UnorderedMap<uint32_t, size_t>& running_ts_sz,
    const UnorderedMap<uint32_t, size_t>& record_ts_sz,
    TimestampSizeConsistencyMode check_mode,
    std::unique_ptr<WriteBatch>* new_batch = nullptr);
|
|
218
|
+
|
|
219
|
+
// This util function is used when opening an existing column family and
// processing its VersionEdit. It does a sanity check of the column family's
// old user comparator and persist_user_defined_timestamps flag, as recorded
// in the VersionEdit, against its new settings from the column family's
// ImmutableCFOptions.
//
// Valid settings changes include:
// 1) no user comparator change and no effective
// persist_user_defined_timestamps flag change.
// 2) switching the user comparator to enable the user-defined timestamps
// feature, provided the immediately effective
// persist_user_defined_timestamps flag is false.
// 3) switching the user comparator to disable the user-defined timestamps
// feature, provided the before-change persist_user_defined_timestamps flag is
// already false.
//
// Switching the user comparator to disable/enable UDT is only sanity checked
// by a user comparator name comparison. A full check would include enforcing
// that the new user comparator ranks user keys exactly the same as the old
// user comparator and only adds / removes the user-defined timestamp
// comparison. We don't have ways to strictly enforce this, so currently only
// the RocksDB builtin comparator wrapper `ComparatorWithU64TsImpl` is
// supported to enable / disable user-defined timestamps. It formats
// user-defined timestamps as uint64_t.
//
// When the settings indicate a legitimate change to enable the user-defined
// timestamps feature on a column family, `mark_sst_files_has_no_udt` is set to
// true to indicate that all existing SST files should be marked as having no
// user-defined timestamps when re-writing the manifest. It is not written on
// any other path, so callers should initialize it before the call.
Status ValidateUserDefinedTimestampsOptions(
    const Comparator* new_comparator, const std::string& old_comparator_name,
    bool new_persist_udt, bool old_persist_udt,
    bool* mark_sst_files_has_no_udt);
|
|
77
249
|
} // namespace ROCKSDB_NAMESPACE
|