@nxtedition/rocksdb 5.2.21 → 5.2.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +510 -967
- package/binding.gyp +78 -72
- package/chained-batch.js +1 -2
- package/deps/rocksdb/build_version.cc +70 -4
- package/deps/rocksdb/rocksdb/CMakeLists.txt +281 -149
- package/deps/rocksdb/rocksdb/Makefile +459 -469
- package/deps/rocksdb/rocksdb/TARGETS +5244 -1500
- package/deps/rocksdb/rocksdb/cache/cache.cc +12 -3
- package/deps/rocksdb/rocksdb/cache/cache_bench.cc +7 -368
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +924 -0
- package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +128 -0
- package/deps/rocksdb/rocksdb/cache/cache_entry_roles.h +103 -0
- package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +183 -0
- package/deps/rocksdb/rocksdb/cache/cache_helpers.h +11 -0
- package/deps/rocksdb/rocksdb/cache/cache_key.cc +344 -0
- package/deps/rocksdb/rocksdb/cache/cache_key.h +132 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +183 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +288 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +468 -0
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +85 -8
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +121 -51
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +171 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +86 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +607 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +381 -154
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +176 -33
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1659 -3
- package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +94 -23
- package/deps/rocksdb/rocksdb/cache/sharded_cache.h +49 -28
- package/deps/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake +7 -0
- package/deps/rocksdb/rocksdb/cmake/modules/FindJeMalloc.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/FindNUMA.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/FindTBB.cmake +33 -0
- package/deps/rocksdb/rocksdb/cmake/modules/Findgflags.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/Findlz4.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/Finduring.cmake +26 -0
- package/deps/rocksdb/rocksdb/cmake/modules/Findzstd.cmake +29 -0
- package/deps/rocksdb/rocksdb/cmake/modules/ReadVersion.cmake +10 -0
- package/deps/rocksdb/rocksdb/crash_test.mk +93 -0
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +54 -31
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +10 -6
- package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +146 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc +326 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_fetcher.cc +34 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_fetcher.h +37 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_addition.cc +4 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc +8 -4
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +99 -40
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +20 -8
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +95 -83
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +13 -10
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +7 -4
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +37 -37
- package/deps/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h +101 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.cc +8 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_meta.h +6 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +209 -44
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +37 -11
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +382 -179
- package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc +100 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter.h +102 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc +196 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_index.h +3 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_log_format.h +2 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +7 -5
- package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h +10 -3
- package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.cc +12 -8
- package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.h +5 -5
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +772 -9
- package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +730 -0
- package/deps/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc +82 -0
- package/deps/rocksdb/rocksdb/db/blob/db_blob_index_test.cc +155 -17
- package/deps/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc +21 -0
- package/deps/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h +38 -0
- package/deps/rocksdb/rocksdb/db/builder.cc +137 -89
- package/deps/rocksdb/rocksdb/db/builder.h +16 -37
- package/deps/rocksdb/rocksdb/db/c.cc +413 -208
- package/deps/rocksdb/rocksdb/db/c_test.c +227 -138
- package/deps/rocksdb/rocksdb/db/column_family.cc +118 -103
- package/deps/rocksdb/rocksdb/db/column_family.h +86 -44
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +38 -24
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +81 -0
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +275 -0
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc +258 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +81 -28
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +43 -12
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h +12 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +406 -215
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +147 -50
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +167 -61
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +1321 -156
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +197 -28
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +246 -43
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +65 -26
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +7 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +122 -9
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +18 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +536 -44
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +311 -30
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +849 -0
- package/deps/rocksdb/rocksdb/db/compaction/file_pri.h +92 -0
- package/deps/rocksdb/rocksdb/db/compaction/sst_partitioner.cc +46 -0
- package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/convenience.cc +6 -3
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +383 -28
- package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +7 -2
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +154 -45
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +1095 -33
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +1249 -203
- package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +135 -9
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +1348 -166
- package/deps/rocksdb/rocksdb/db/db_dynamic_level_test.cc +3 -5
- package/deps/rocksdb/rocksdb/db/db_encryption_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +312 -45
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +1734 -48
- package/deps/rocksdb/rocksdb/db/{compacted_db_impl.cc → db_impl/compacted_db_impl.cc} +24 -7
- package/deps/rocksdb/rocksdb/db/{compacted_db_impl.h → db_impl/compacted_db_impl.h} +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +644 -333
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +365 -92
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +578 -210
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +38 -16
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +17 -10
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +75 -74
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +450 -183
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +42 -9
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +232 -15
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +42 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +297 -100
- package/deps/rocksdb/rocksdb/db/db_info_dumper.cc +16 -15
- package/deps/rocksdb/rocksdb/db/db_inplace_update_test.cc +31 -1
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +6 -5
- package/deps/rocksdb/rocksdb/db/db_iter.cc +218 -153
- package/deps/rocksdb/rocksdb/db/db_iter.h +14 -12
- package/deps/rocksdb/rocksdb/db/db_iter_stress_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_iter_test.cc +84 -160
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +47 -6
- package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +204 -0
- package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +21 -13
- package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +17 -10
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +38 -24
- package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +184 -19
- package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +183 -3
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +409 -9
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +92 -23
- package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +446 -0
- package/deps/rocksdb/rocksdb/db/{db_impl/db_secondary_test.cc → db_secondary_test.cc} +363 -35
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +520 -15
- package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +50 -1
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +139 -4
- package/deps/rocksdb/rocksdb/db/db_tailing_iter_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_test.cc +669 -359
- package/deps/rocksdb/rocksdb/db/db_test2.cc +2110 -304
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +76 -43
- package/deps/rocksdb/rocksdb/db/db_test_util.h +231 -103
- package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +19 -11
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +490 -71
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +980 -349
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +11 -12
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +793 -0
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/dbformat.cc +4 -12
- package/deps/rocksdb/rocksdb/db/dbformat.h +28 -18
- package/deps/rocksdb/rocksdb/db/dbformat_test.cc +3 -0
- package/deps/rocksdb/rocksdb/db/deletefile_test.cc +50 -15
- package/deps/rocksdb/rocksdb/db/error_handler.cc +127 -41
- package/deps/rocksdb/rocksdb/db/error_handler.h +12 -5
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +524 -255
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +136 -11
- package/deps/rocksdb/rocksdb/db/event_helpers.h +27 -2
- package/deps/rocksdb/rocksdb/db/experimental.cc +100 -0
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +307 -4
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +137 -60
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +12 -8
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +86 -55
- package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +86 -5
- package/deps/rocksdb/rocksdb/db/filename_test.cc +63 -0
- package/deps/rocksdb/rocksdb/db/flush_job.cc +619 -64
- package/deps/rocksdb/rocksdb/db/flush_job.h +30 -7
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +33 -16
- package/deps/rocksdb/rocksdb/db/flush_scheduler.h +2 -1
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +18 -17
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +5 -4
- package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +0 -1
- package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +91 -0
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +25 -14
- package/deps/rocksdb/rocksdb/db/import_column_family_job.h +6 -5
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +471 -50
- package/deps/rocksdb/rocksdb/db/internal_stats.h +129 -25
- package/deps/rocksdb/rocksdb/db/job_context.h +22 -9
- package/deps/rocksdb/rocksdb/db/kv_checksum.h +394 -0
- package/deps/rocksdb/rocksdb/db/listener_test.cc +518 -41
- package/deps/rocksdb/rocksdb/db/log_format.h +4 -1
- package/deps/rocksdb/rocksdb/db/log_reader.cc +129 -6
- package/deps/rocksdb/rocksdb/db/log_reader.h +17 -1
- package/deps/rocksdb/rocksdb/db/log_test.cc +161 -11
- package/deps/rocksdb/rocksdb/db/log_writer.cc +92 -13
- package/deps/rocksdb/rocksdb/db/log_writer.h +18 -5
- package/deps/rocksdb/rocksdb/db/logs_with_prep_tracker.h +1 -1
- package/deps/rocksdb/rocksdb/db/lookup_key.h +0 -1
- package/deps/rocksdb/rocksdb/db/malloc_stats.cc +2 -2
- package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +21 -8
- package/deps/rocksdb/rocksdb/db/memtable.cc +144 -54
- package/deps/rocksdb/rocksdb/db/memtable.h +72 -15
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +95 -47
- package/deps/rocksdb/rocksdb/db/memtable_list.h +33 -13
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +61 -31
- package/deps/rocksdb/rocksdb/db/merge_context.h +20 -8
- package/deps/rocksdb/rocksdb/db/merge_helper.cc +54 -11
- package/deps/rocksdb/rocksdb/db/merge_helper.h +17 -6
- package/deps/rocksdb/rocksdb/db/merge_helper_test.cc +13 -7
- package/deps/rocksdb/rocksdb/db/merge_test.cc +40 -19
- package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +14 -25
- package/deps/rocksdb/rocksdb/db/output_validator.cc +3 -0
- package/deps/rocksdb/rocksdb/db/output_validator.h +5 -4
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +32 -28
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +43 -29
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +9 -7
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc +21 -16
- package/deps/rocksdb/rocksdb/db/pinned_iterators_manager.h +1 -1
- package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +29 -36
- package/deps/rocksdb/rocksdb/db/pre_release_callback.h +1 -2
- package/deps/rocksdb/rocksdb/db/prefix_test.cc +4 -4
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +2 -2
- package/deps/rocksdb/rocksdb/db/range_del_aggregator_bench.cc +11 -11
- package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +3 -2
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc +14 -8
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +17 -0
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc +4 -2
- package/deps/rocksdb/rocksdb/db/read_callback.h +1 -0
- package/deps/rocksdb/rocksdb/db/repair.cc +87 -58
- package/deps/rocksdb/rocksdb/db/repair_test.cc +35 -5
- package/deps/rocksdb/rocksdb/db/snapshot_impl.h +2 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +95 -69
- package/deps/rocksdb/rocksdb/db/table_cache.h +63 -53
- package/deps/rocksdb/rocksdb/db/table_properties_collector.cc +4 -4
- package/deps/rocksdb/rocksdb/db/table_properties_collector.h +78 -10
- package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +28 -33
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +30 -51
- package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +12 -8
- package/deps/rocksdb/rocksdb/db/version_builder.cc +564 -341
- package/deps/rocksdb/rocksdb/db/version_builder.h +8 -8
- package/deps/rocksdb/rocksdb/db/version_builder_test.cc +327 -155
- package/deps/rocksdb/rocksdb/db/version_edit.cc +89 -27
- package/deps/rocksdb/rocksdb/db/version_edit.h +42 -17
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +324 -43
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +79 -22
- package/deps/rocksdb/rocksdb/db/version_edit_test.cc +165 -20
- package/deps/rocksdb/rocksdb/db/version_set.cc +935 -1034
- package/deps/rocksdb/rocksdb/db/version_set.h +183 -122
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +556 -138
- package/deps/rocksdb/rocksdb/db/version_util.h +68 -0
- package/deps/rocksdb/rocksdb/db/wal_manager.cc +23 -21
- package/deps/rocksdb/rocksdb/db/wal_manager.h +5 -2
- package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +30 -27
- package/deps/rocksdb/rocksdb/db/write_batch.cc +704 -209
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +135 -2
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +209 -5
- package/deps/rocksdb/rocksdb/db/write_callback_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/write_controller.cc +47 -54
- package/deps/rocksdb/rocksdb/db/write_controller.h +12 -9
- package/deps/rocksdb/rocksdb/db/write_controller_test.cc +215 -103
- package/deps/rocksdb/rocksdb/db/write_thread.cc +11 -0
- package/deps/rocksdb/rocksdb/db/write_thread.h +14 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +7 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +10 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +6 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress.cc +1 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +78 -25
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h +13 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +29 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +5 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +199 -32
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc +188 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +59 -10
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +77 -109
- package/deps/rocksdb/rocksdb/{third-party/folly/folly/synchronization/WaitOptions.cpp → db_stress_tool/db_stress_stat.cc} +9 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +7 -6
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h +1 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +699 -143
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +20 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +49 -39
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +631 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +287 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +1565 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +374 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +149 -18
- package/deps/rocksdb/rocksdb/env/composite_env.cc +464 -0
- package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +98 -646
- package/deps/rocksdb/rocksdb/env/emulated_clock.h +114 -0
- package/deps/rocksdb/rocksdb/env/env.cc +632 -42
- package/deps/rocksdb/rocksdb/env/env_basic_test.cc +84 -36
- package/deps/rocksdb/rocksdb/env/env_chroot.cc +88 -286
- package/deps/rocksdb/rocksdb/env/env_chroot.h +34 -1
- package/deps/rocksdb/rocksdb/env/env_encryption.cc +469 -277
- package/deps/rocksdb/rocksdb/env/env_encryption_ctr.h +9 -30
- package/deps/rocksdb/rocksdb/env/env_posix.cc +110 -119
- package/deps/rocksdb/rocksdb/env/env_test.cc +1128 -39
- package/deps/rocksdb/rocksdb/env/file_system.cc +147 -8
- package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +207 -136
- package/deps/rocksdb/rocksdb/env/file_system_tracer.h +86 -54
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +192 -64
- package/deps/rocksdb/rocksdb/env/fs_readonly.h +107 -0
- package/deps/rocksdb/rocksdb/env/fs_remap.cc +339 -0
- package/deps/rocksdb/rocksdb/env/fs_remap.h +139 -0
- package/deps/rocksdb/rocksdb/env/io_posix.cc +245 -41
- package/deps/rocksdb/rocksdb/env/io_posix.h +66 -1
- package/deps/rocksdb/rocksdb/env/mock_env.cc +147 -149
- package/deps/rocksdb/rocksdb/env/mock_env.h +113 -11
- package/deps/rocksdb/rocksdb/env/mock_env_test.cc +2 -4
- package/deps/rocksdb/rocksdb/env/unique_id_gen.cc +164 -0
- package/deps/rocksdb/rocksdb/env/unique_id_gen.h +71 -0
- package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +9 -5
- package/deps/rocksdb/rocksdb/file/delete_scheduler.h +6 -4
- package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +19 -12
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +459 -70
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +205 -28
- package/deps/rocksdb/rocksdb/file/file_util.cc +39 -28
- package/deps/rocksdb/rocksdb/file/file_util.h +18 -27
- package/deps/rocksdb/rocksdb/file/filename.cc +59 -22
- package/deps/rocksdb/rocksdb/file/filename.h +13 -8
- package/deps/rocksdb/rocksdb/file/line_file_reader.cc +68 -0
- package/deps/rocksdb/rocksdb/file/line_file_reader.h +59 -0
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +1130 -6
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +220 -36
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +69 -17
- package/deps/rocksdb/rocksdb/file/random_access_file_reader_test.cc +13 -12
- package/deps/rocksdb/rocksdb/file/read_write_util.cc +3 -38
- package/deps/rocksdb/rocksdb/file/read_write_util.h +0 -4
- package/deps/rocksdb/rocksdb/file/readahead_file_info.h +33 -0
- package/deps/rocksdb/rocksdb/file/sequence_file_reader.cc +57 -9
- package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +58 -6
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +29 -54
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +22 -29
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +424 -50
- package/deps/rocksdb/rocksdb/file/writable_file_writer.h +66 -19
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +157 -66
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +224 -121
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +333 -30
- package/deps/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h +14 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/cleanable.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +90 -50
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +13 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +20 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h +8 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/configurable.h +53 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +31 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/customizable.h +102 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +51 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +370 -262
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +286 -87
- package/deps/rocksdb/rocksdb/include/rocksdb/env_encryption.h +124 -64
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +27 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +21 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +384 -41
- package/deps/rocksdb/rocksdb/include/rocksdb/filter_policy.h +111 -143
- package/deps/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h +20 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +56 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/io_status.h +15 -33
- package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +37 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +314 -26
- package/deps/rocksdb/rocksdb/include/rocksdb/memory_allocator.h +11 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +50 -15
- package/deps/rocksdb/rocksdb/include/rocksdb/merge_operator.h +10 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +186 -96
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +373 -103
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +13 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/persistent_cache.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +37 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h +6 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +87 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +5 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +59 -30
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h +11 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +22 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h +17 -10
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +121 -41
- package/deps/rocksdb/rocksdb/include/rocksdb/stats_history.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +114 -136
- package/deps/rocksdb/rocksdb/include/rocksdb/system_clock.h +116 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +160 -18
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +57 -15
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h +10 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +247 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/trace_record_result.h +187 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/transaction_log.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/types.h +14 -24
- package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +46 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +14 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/agg_merge.h +138 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +631 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +142 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h +12 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h +368 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +24 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h +4 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h +418 -63
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h +143 -73
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h +87 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +43 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +18 -23
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +26 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +32 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h +1 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +20 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +30 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/wal_filter.h +11 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +89 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch_base.h +11 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h +108 -38
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +40 -23
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger.h +12 -5
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +100 -49
- package/deps/rocksdb/rocksdb/logging/env_logger.h +7 -5
- package/deps/rocksdb/rocksdb/logging/env_logger_test.cc +0 -1
- package/deps/rocksdb/rocksdb/logging/posix_logger.h +3 -9
- package/deps/rocksdb/rocksdb/memory/arena.cc +3 -1
- package/deps/rocksdb/rocksdb/memory/arena.h +1 -1
- package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +171 -106
- package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h +31 -15
- package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc +15 -4
- package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator.h +24 -8
- package/deps/rocksdb/rocksdb/memory/memory_allocator.cc +91 -0
- package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +239 -0
- package/deps/rocksdb/rocksdb/memory/memory_usage.h +14 -1
- package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +72 -9
- package/deps/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc +52 -6
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +53 -0
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +5 -5
- package/deps/rocksdb/rocksdb/memtable/memtablerep_bench.cc +17 -5
- package/deps/rocksdb/rocksdb/memtable/skiplist_test.cc +1 -1
- package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +87 -0
- package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +20 -10
- package/deps/rocksdb/rocksdb/memtable/write_buffer_manager.cc +148 -94
- package/deps/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc +160 -62
- package/deps/rocksdb/rocksdb/microbench/CMakeLists.txt +17 -0
- package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +1360 -0
- package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +153 -0
- package/deps/rocksdb/rocksdb/monitoring/histogram.cc +8 -15
- package/deps/rocksdb/rocksdb/monitoring/histogram.h +0 -1
- package/deps/rocksdb/rocksdb/monitoring/histogram_test.cc +18 -16
- package/deps/rocksdb/rocksdb/monitoring/histogram_windowing.cc +9 -7
- package/deps/rocksdb/rocksdb/monitoring/histogram_windowing.h +5 -3
- package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.cc +7 -5
- package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +37 -12
- package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +26 -6
- package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +6 -10
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +14 -13
- package/deps/rocksdb/rocksdb/monitoring/perf_context_imp.h +19 -20
- package/deps/rocksdb/rocksdb/monitoring/perf_step_timer.h +18 -18
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +84 -2
- package/deps/rocksdb/rocksdb/monitoring/statistics.h +6 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics_test.cc +47 -2
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +67 -54
- package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +4 -1
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +2 -1
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +2 -2
- package/deps/rocksdb/rocksdb/options/cf_options.cc +280 -212
- package/deps/rocksdb/rocksdb/options/cf_options.h +51 -57
- package/deps/rocksdb/rocksdb/options/configurable.cc +242 -138
- package/deps/rocksdb/rocksdb/options/configurable_helper.h +4 -68
- package/deps/rocksdb/rocksdb/options/configurable_test.cc +144 -21
- package/deps/rocksdb/rocksdb/options/configurable_test.h +2 -3
- package/deps/rocksdb/rocksdb/options/customizable.cc +67 -7
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +1773 -151
- package/deps/rocksdb/rocksdb/options/db_options.cc +275 -47
- package/deps/rocksdb/rocksdb/options/db_options.h +36 -7
- package/deps/rocksdb/rocksdb/options/options.cc +49 -17
- package/deps/rocksdb/rocksdb/options/options_helper.cc +369 -352
- package/deps/rocksdb/rocksdb/options/options_helper.h +23 -23
- package/deps/rocksdb/rocksdb/options/options_parser.cc +18 -13
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +67 -54
- package/deps/rocksdb/rocksdb/options/options_test.cc +1162 -187
- package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +1 -1
- package/deps/rocksdb/rocksdb/port/lang.h +52 -0
- package/deps/rocksdb/rocksdb/port/port_example.h +1 -1
- package/deps/rocksdb/rocksdb/port/port_posix.cc +31 -2
- package/deps/rocksdb/rocksdb/port/port_posix.h +20 -2
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +20 -4
- package/deps/rocksdb/rocksdb/port/sys_time.h +2 -2
- package/deps/rocksdb/rocksdb/port/win/env_default.cc +7 -7
- package/deps/rocksdb/rocksdb/port/win/env_win.cc +44 -74
- package/deps/rocksdb/rocksdb/port/win/env_win.h +25 -23
- package/deps/rocksdb/rocksdb/port/win/io_win.cc +32 -34
- package/deps/rocksdb/rocksdb/port/win/io_win.h +12 -6
- package/deps/rocksdb/rocksdb/port/win/port_win.cc +55 -35
- package/deps/rocksdb/rocksdb/port/win/port_win.h +22 -5
- package/deps/rocksdb/rocksdb/port/win/win_logger.cc +3 -3
- package/deps/rocksdb/rocksdb/port/win/win_logger.h +3 -5
- package/deps/rocksdb/rocksdb/port/win/win_thread.cc +7 -1
- package/deps/rocksdb/rocksdb/port/win/win_thread.h +12 -17
- package/deps/rocksdb/rocksdb/python.mk +9 -0
- package/deps/rocksdb/rocksdb/src.mk +82 -34
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -4
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block.cc +158 -80
- package/deps/rocksdb/rocksdb/table/block_based/block.h +64 -36
- package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc +23 -14
- package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.h +13 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc +3 -218
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +603 -328
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +28 -22
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +220 -82
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +8 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +3 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +28 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +598 -492
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +151 -96
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +31 -58
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +330 -92
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +50 -19
- package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +23 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +226 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +56 -22
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +42 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +5 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_type.h +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +34 -20
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +9 -10
- package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +26 -3
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +844 -202
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +281 -81
- package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +62 -2
- package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.h +2 -3
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +28 -7
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +22 -6
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +28 -26
- package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +11 -4
- package/deps/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h +2 -0
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +68 -26
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +44 -9
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +12 -10
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +3 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h +23 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +44 -19
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h +5 -1
- package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +16 -28
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +7 -4
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +2 -2
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +77 -57
- package/deps/rocksdb/rocksdb/table/block_fetcher.h +23 -12
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +43 -56
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc +8 -8
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h +2 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +52 -70
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc +5 -8
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h +1 -1
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +17 -11
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h +2 -3
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +42 -51
- package/deps/rocksdb/rocksdb/table/format.cc +258 -104
- package/deps/rocksdb/rocksdb/table/format.h +120 -109
- package/deps/rocksdb/rocksdb/table/get_context.cc +97 -65
- package/deps/rocksdb/rocksdb/table/get_context.h +19 -12
- package/deps/rocksdb/rocksdb/table/internal_iterator.h +14 -0
- package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +8 -0
- package/deps/rocksdb/rocksdb/table/merger_test.cc +3 -2
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +11 -21
- package/deps/rocksdb/rocksdb/table/merging_iterator.h +3 -3
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +176 -171
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +47 -33
- package/deps/rocksdb/rocksdb/table/mock_table.cc +7 -9
- package/deps/rocksdb/rocksdb/table/mock_table.h +3 -2
- package/deps/rocksdb/rocksdb/table/multiget_context.h +15 -8
- package/deps/rocksdb/rocksdb/table/persistent_cache_helper.cc +22 -29
- package/deps/rocksdb/rocksdb/table/persistent_cache_options.h +6 -3
- package/deps/rocksdb/rocksdb/table/plain/plain_table_bloom.h +5 -8
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +29 -26
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +12 -16
- package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.cc +145 -69
- package/deps/rocksdb/rocksdb/table/plain/plain_table_factory.h +1 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.cc +7 -6
- package/deps/rocksdb/rocksdb/table/plain/plain_table_index.h +3 -4
- package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc +3 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_key_coding.h +1 -1
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +13 -18
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.h +4 -9
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +55 -37
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +10 -5
- package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +11 -8
- package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +222 -16
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +106 -58
- package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +6 -5
- package/deps/rocksdb/rocksdb/table/table_builder.h +68 -44
- package/deps/rocksdb/rocksdb/table/table_factory.cc +37 -10
- package/deps/rocksdb/rocksdb/table/table_properties.cc +109 -54
- package/deps/rocksdb/rocksdb/table/table_properties_internal.h +4 -20
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +33 -32
- package/deps/rocksdb/rocksdb/table/table_reader_caller.h +2 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +989 -326
- package/deps/rocksdb/rocksdb/table/two_level_iterator.cc +4 -0
- package/deps/rocksdb/rocksdb/table/unique_id.cc +166 -0
- package/deps/rocksdb/rocksdb/table/unique_id_impl.h +59 -0
- package/deps/rocksdb/rocksdb/test_util/mock_time_env.cc +1 -1
- package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +13 -10
- package/deps/rocksdb/rocksdb/test_util/sync_point.cc +1 -2
- package/deps/rocksdb/rocksdb/test_util/sync_point.h +35 -16
- package/deps/rocksdb/rocksdb/test_util/sync_point_impl.cc +32 -10
- package/deps/rocksdb/rocksdb/test_util/sync_point_impl.h +31 -4
- package/deps/rocksdb/rocksdb/test_util/testharness.cc +53 -1
- package/deps/rocksdb/rocksdb/test_util/testharness.h +67 -3
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +236 -66
- package/deps/rocksdb/rocksdb/test_util/testutil.h +63 -100
- package/deps/rocksdb/rocksdb/test_util/transaction_test_util.cc +12 -1
- package/deps/rocksdb/rocksdb/tools/blob_dump.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +6 -3
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h +1 -0
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +9 -3
- package/deps/rocksdb/rocksdb/tools/db_bench.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +1420 -611
- package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +11 -8
- package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +11 -1
- package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +4 -2
- package/deps/rocksdb/rocksdb/tools/io_tracer_parser_tool.cc +46 -22
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +655 -179
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +58 -6
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +472 -29
- package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +23 -2
- package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +2 -2
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc +246 -0
- package/deps/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h +126 -0
- package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +83 -29
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +38 -17
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +191 -55
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +219 -296
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.h +87 -53
- package/deps/rocksdb/rocksdb/tools/write_stress.cc +8 -7
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.cc +6 -5
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +5 -4
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer_test.cc +14 -9
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer.cc +134 -60
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer.h +49 -38
- package/deps/rocksdb/rocksdb/trace_replay/io_tracer_test.cc +152 -15
- package/deps/rocksdb/rocksdb/trace_replay/trace_record.cc +206 -0
- package/deps/rocksdb/rocksdb/trace_replay/trace_record_handler.cc +190 -0
- package/deps/rocksdb/rocksdb/trace_replay/trace_record_handler.h +46 -0
- package/deps/rocksdb/rocksdb/trace_replay/trace_record_result.cc +146 -0
- package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +475 -344
- package/deps/rocksdb/rocksdb/trace_replay/trace_replay.h +83 -95
- package/deps/rocksdb/rocksdb/util/autovector.h +38 -18
- package/deps/rocksdb/rocksdb/util/autovector_test.cc +1 -1
- package/deps/rocksdb/rocksdb/util/bloom_impl.h +4 -0
- package/deps/rocksdb/rocksdb/util/bloom_test.cc +276 -94
- package/deps/rocksdb/rocksdb/util/build_version.cc.in +81 -4
- package/deps/rocksdb/rocksdb/util/cast_util.h +22 -0
- package/deps/rocksdb/rocksdb/util/channel.h +2 -0
- package/deps/rocksdb/rocksdb/util/coding.h +1 -33
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +8 -0
- package/deps/rocksdb/rocksdb/util/comparator.cc +163 -3
- package/deps/rocksdb/rocksdb/util/compression.cc +122 -0
- package/deps/rocksdb/rocksdb/util/compression.h +212 -7
- package/deps/rocksdb/rocksdb/util/compression_context_cache.cc +1 -3
- package/deps/rocksdb/rocksdb/util/crc32c.cc +165 -2
- package/deps/rocksdb/rocksdb/util/crc32c.h +6 -0
- package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +14 -0
- package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +3 -0
- package/deps/rocksdb/rocksdb/util/crc32c_test.cc +47 -0
- package/deps/rocksdb/rocksdb/util/defer.h +30 -1
- package/deps/rocksdb/rocksdb/util/defer_test.cc +11 -0
- package/deps/rocksdb/rocksdb/util/duplicate_detector.h +3 -1
- package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +3 -3
- package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +5 -4
- package/deps/rocksdb/rocksdb/util/fastrange.h +2 -0
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +36 -0
- package/deps/rocksdb/rocksdb/util/file_checksum_helper.h +3 -1
- package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +512 -52
- package/deps/rocksdb/rocksdb/util/filter_bench.cc +65 -10
- package/deps/rocksdb/rocksdb/util/gflags_compat.h +6 -1
- package/deps/rocksdb/rocksdb/util/hash.cc +121 -3
- package/deps/rocksdb/rocksdb/util/hash.h +31 -1
- package/deps/rocksdb/rocksdb/util/hash128.h +26 -0
- package/deps/rocksdb/rocksdb/util/hash_containers.h +51 -0
- package/deps/rocksdb/rocksdb/util/hash_test.cc +194 -2
- package/deps/rocksdb/rocksdb/util/heap.h +6 -1
- package/deps/rocksdb/rocksdb/util/kv_map.h +1 -1
- package/deps/rocksdb/rocksdb/util/log_write_bench.cc +8 -6
- package/deps/rocksdb/rocksdb/util/math.h +74 -7
- package/deps/rocksdb/rocksdb/util/math128.h +13 -1
- package/deps/rocksdb/rocksdb/util/murmurhash.h +3 -3
- package/deps/rocksdb/rocksdb/util/random.cc +9 -0
- package/deps/rocksdb/rocksdb/util/random.h +6 -0
- package/deps/rocksdb/rocksdb/util/rate_limiter.cc +298 -144
- package/deps/rocksdb/rocksdb/util/rate_limiter.h +68 -19
- package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +335 -23
- package/deps/rocksdb/rocksdb/util/repeatable_thread.h +10 -12
- package/deps/rocksdb/rocksdb/util/repeatable_thread_test.cc +18 -15
- package/deps/rocksdb/rocksdb/util/ribbon_alg.h +98 -74
- package/deps/rocksdb/rocksdb/util/ribbon_config.cc +506 -0
- package/deps/rocksdb/rocksdb/util/ribbon_config.h +182 -0
- package/deps/rocksdb/rocksdb/util/ribbon_impl.h +154 -79
- package/deps/rocksdb/rocksdb/util/ribbon_test.cc +742 -365
- package/deps/rocksdb/rocksdb/util/set_comparator.h +2 -0
- package/deps/rocksdb/rocksdb/util/slice.cc +198 -35
- package/deps/rocksdb/rocksdb/util/slice_test.cc +30 -1
- package/deps/rocksdb/rocksdb/util/status.cc +32 -29
- package/deps/rocksdb/rocksdb/util/stop_watch.h +18 -18
- package/deps/rocksdb/rocksdb/util/string_util.cc +85 -6
- package/deps/rocksdb/rocksdb/util/string_util.h +47 -2
- package/deps/rocksdb/rocksdb/util/thread_guard.h +41 -0
- package/deps/rocksdb/rocksdb/util/thread_local.h +2 -2
- package/deps/rocksdb/rocksdb/util/thread_local_test.cc +22 -24
- package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +7 -6
- package/deps/rocksdb/rocksdb/util/timer.h +55 -46
- package/deps/rocksdb/rocksdb/util/timer_test.cc +50 -48
- package/deps/rocksdb/rocksdb/util/user_comparator_wrapper.h +4 -0
- package/deps/rocksdb/rocksdb/util/vector_iterator.h +31 -15
- package/deps/rocksdb/rocksdb/util/work_queue.h +2 -0
- package/deps/rocksdb/rocksdb/util/xxhash.cc +35 -1144
- package/deps/rocksdb/rocksdb/util/xxhash.h +5117 -373
- package/deps/rocksdb/rocksdb/util/xxph3.h +1762 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.cc +238 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.h +49 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge_test.cc +134 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.cc +104 -0
- package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.h +47 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +3164 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_impl.h +29 -0
- package/deps/rocksdb/rocksdb/utilities/{backupable/backupable_db_test.cc → backup/backup_engine_test.cc} +1679 -485
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +6 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +14 -9
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +2 -0
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +1 -0
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h +4 -0
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +37 -27
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +8 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_iterator.h +13 -10
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +5 -0
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +44 -25
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +3 -4
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +27 -19
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +4 -2
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load.cc +69 -0
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +489 -0
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +366 -0
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc +67 -4
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.h +21 -6
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +107 -7
- package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_options.h +43 -0
- package/deps/rocksdb/rocksdb/utilities/cassandra/format.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/cassandra/merge_operator.cc +24 -8
- package/deps/rocksdb/rocksdb/utilities/cassandra/merge_operator.h +7 -7
- package/deps/rocksdb/rocksdb/utilities/cassandra/serialize.h +5 -0
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +99 -218
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.h +8 -24
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +114 -1
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h +6 -2
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +0 -4
- package/deps/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +7 -6
- package/deps/rocksdb/rocksdb/utilities/compaction_filters.cc +56 -0
- package/deps/rocksdb/rocksdb/utilities/convenience/info_log_finder.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/counted_fs.cc +355 -0
- package/deps/rocksdb/rocksdb/utilities/counted_fs.h +152 -0
- package/deps/rocksdb/rocksdb/utilities/env_mirror.cc +13 -0
- package/deps/rocksdb/rocksdb/utilities/env_timed.cc +164 -122
- package/deps/rocksdb/rocksdb/utilities/env_timed.h +97 -0
- package/deps/rocksdb/rocksdb/utilities/fault_injection_env.cc +75 -17
- package/deps/rocksdb/rocksdb/utilities/fault_injection_env.h +19 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +539 -126
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +162 -17
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +110 -0
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +94 -0
- package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +5 -2
- package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +104 -0
- package/deps/rocksdb/rocksdb/utilities/merge_operators/bytesxor.h +5 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/max.cc +4 -1
- package/deps/rocksdb/rocksdb/utilities/merge_operators/put.cc +11 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/sortlist.cc +0 -2
- package/deps/rocksdb/rocksdb/utilities/merge_operators/sortlist.h +5 -1
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.cc +29 -10
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.h +6 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.cc +29 -14
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.h +6 -3
- package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +71 -18
- package/deps/rocksdb/rocksdb/utilities/merge_operators/uint64add.cc +15 -9
- package/deps/rocksdb/rocksdb/utilities/merge_operators.cc +120 -0
- package/deps/rocksdb/rocksdb/utilities/merge_operators.h +3 -23
- package/deps/rocksdb/rocksdb/utilities/object_registry.cc +267 -42
- package/deps/rocksdb/rocksdb/utilities/object_registry_test.cc +702 -76
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +26 -5
- package/deps/rocksdb/rocksdb/utilities/options/options_util.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +124 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc +2 -3
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h +8 -9
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +15 -13
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h +4 -4
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc +8 -9
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h +6 -3
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator.cc +3 -0
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator_test.cc +2 -0
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +43 -35
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +20 -18
- package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +107 -2
- package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc +23 -15
- package/deps/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.cc +316 -0
- package/deps/rocksdb/rocksdb/utilities/trace/replayer_impl.h +86 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +4 -5
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +4 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +119 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc +20 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h +20 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h +3 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +4 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +38 -14
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h +17 -10
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +1 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +1 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +423 -34
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +82 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +72 -40
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +32 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +13 -5
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +7 -3
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +207 -43
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +50 -7
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +28 -10
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +11 -6
- package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +516 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +506 -15
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +27 -13
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +14 -14
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +3 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +14 -5
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +305 -27
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +55 -159
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +209 -2
- package/deps/rocksdb/rocksdb/utilities/wal_filter.cc +23 -0
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +157 -88
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +501 -114
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +91 -316
- package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +1212 -672
- package/deps/rocksdb/rocksdb.gyp +425 -446
- package/index.js +5 -87
- package/package-lock.json +23687 -0
- package/package.json +8 -9
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/darwin-x64/node.napi.node +0 -0
- package/prebuilds/{darwin-x64+arm64 → linux-x64}/node.napi.node +0 -0
- package/deps/rocksdb/rocksdb/README.md +0 -32
- package/deps/rocksdb/rocksdb/env/env_hdfs.cc +0 -648
- package/deps/rocksdb/rocksdb/hdfs/README +0 -23
- package/deps/rocksdb/rocksdb/hdfs/env_hdfs.h +0 -386
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h +0 -535
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h +0 -175
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/utility_db.h +0 -34
- package/deps/rocksdb/rocksdb/memory/memkind_kmem_allocator_test.cc +0 -102
- package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.h +0 -49
- package/deps/rocksdb/rocksdb/memtable/hash_skiplist_rep.h +0 -44
- package/deps/rocksdb/rocksdb/options/customizable_helper.h +0 -216
- package/deps/rocksdb/rocksdb/port/README +0 -10
- package/deps/rocksdb/rocksdb/third-party/folly/folly/CPortability.h +0 -27
- package/deps/rocksdb/rocksdb/third-party/folly/folly/ConstexprMath.h +0 -45
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Indestructible.h +0 -166
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Optional.h +0 -570
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Portability.h +0 -92
- package/deps/rocksdb/rocksdb/third-party/folly/folly/ScopeGuard.h +0 -54
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Traits.h +0 -152
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Unit.h +0 -59
- package/deps/rocksdb/rocksdb/third-party/folly/folly/Utility.h +0 -141
- package/deps/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h +0 -33
- package/deps/rocksdb/rocksdb/third-party/folly/folly/container/Array.h +0 -74
- package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex-inl.h +0 -117
- package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp +0 -263
- package/deps/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.h +0 -96
- package/deps/rocksdb/rocksdb/third-party/folly/folly/functional/Invoke.h +0 -40
- package/deps/rocksdb/rocksdb/third-party/folly/folly/hash/Hash.h +0 -29
- package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h +0 -144
- package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Bits.h +0 -30
- package/deps/rocksdb/rocksdb/third-party/folly/folly/lang/Launder.h +0 -51
- package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/Asm.h +0 -28
- package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/SysSyscall.h +0 -10
- package/deps/rocksdb/rocksdb/third-party/folly/folly/portability/SysTypes.h +0 -26
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification-inl.h +0 -138
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification.cpp +0 -23
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicNotification.h +0 -57
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicUtil-inl.h +0 -260
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/AtomicUtil.h +0 -52
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h +0 -328
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h +0 -1703
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex.cpp +0 -16
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex.h +0 -304
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutexSpecializations.h +0 -39
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/ParkingLot.cpp +0 -26
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/ParkingLot.h +0 -318
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/WaitOptions.h +0 -57
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/InlineFunctionRef.h +0 -219
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/ProxyLockable-inl.h +0 -207
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/ProxyLockable.h +0 -164
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/Sleeper.h +0 -57
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/detail/Spin.h +0 -77
- package/deps/rocksdb/rocksdb/third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp +0 -1145
- package/deps/rocksdb/rocksdb/util/build_version.h +0 -15
- package/deps/rocksdb/rocksdb/util/xxh3p.h +0 -1392
- package/deps/rocksdb/rocksdb/utilities/backupable/backupable_db.cc +0 -2354
- package/deps/rocksdb/rocksdb/utilities/env_librados.cc +0 -1497
- package/deps/rocksdb/rocksdb/utilities/env_librados_test.cc +0 -1146
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README +0 -13
- package/deps/snappy/snappy-1.1.7/README.md +0 -149
- package/prebuilds/linux-x64/node.napi.glibc.node +0 -0
|
@@ -1,1703 +0,0 @@
|
|
|
1
|
-
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
2
|
-
// This source code is licensed under both the GPLv2 (found in the
|
|
3
|
-
// COPYING file in the root directory) and Apache 2.0 License
|
|
4
|
-
// (found in the LICENSE.Apache file in the root directory).
|
|
5
|
-
|
|
6
|
-
#include <folly/synchronization/DistributedMutex.h>
|
|
7
|
-
|
|
8
|
-
#include <folly/ConstexprMath.h>
|
|
9
|
-
#include <folly/Portability.h>
|
|
10
|
-
#include <folly/ScopeGuard.h>
|
|
11
|
-
#include <folly/Utility.h>
|
|
12
|
-
#include <folly/chrono/Hardware.h>
|
|
13
|
-
#include <folly/detail/Futex.h>
|
|
14
|
-
#include <folly/lang/Align.h>
|
|
15
|
-
#include <folly/lang/Bits.h>
|
|
16
|
-
#include <folly/portability/Asm.h>
|
|
17
|
-
#include <folly/synchronization/AtomicNotification.h>
|
|
18
|
-
#include <folly/synchronization/AtomicUtil.h>
|
|
19
|
-
#include <folly/synchronization/detail/InlineFunctionRef.h>
|
|
20
|
-
#include <folly/synchronization/detail/Sleeper.h>
|
|
21
|
-
|
|
22
|
-
#include <array>
|
|
23
|
-
#include <atomic>
|
|
24
|
-
#include <cstddef>
|
|
25
|
-
#include <cstdint>
|
|
26
|
-
#include <limits>
|
|
27
|
-
#include <stdexcept>
|
|
28
|
-
#include <thread>
|
|
29
|
-
#include <utility>
|
|
30
|
-
|
|
31
|
-
namespace folly {
|
|
32
|
-
namespace detail {
|
|
33
|
-
namespace distributed_mutex {
|
|
34
|
-
// kUnlocked is used to show unlocked state
|
|
35
|
-
//
|
|
36
|
-
// When locking threads encounter kUnlocked in the underlying storage, they
|
|
37
|
-
// can just acquire the lock without any further effort
|
|
38
|
-
constexpr auto kUnlocked = std::uintptr_t{0b0};
|
|
39
|
-
// kLocked is used to show that the mutex is currently locked, and future
|
|
40
|
-
// attempts to lock the mutex should enqueue on the central storage
|
|
41
|
-
//
|
|
42
|
-
// Locking threads find this on central storage only when there is a
|
|
43
|
-
// contention chain that is undergoing wakeups, in every other case, a locker
|
|
44
|
-
// will either find kUnlocked or an arbitrary address with the kLocked bit set
|
|
45
|
-
constexpr auto kLocked = std::uintptr_t{0b1};
|
|
46
|
-
// kTimedWaiter is set when there is at least one timed waiter on the mutex
|
|
47
|
-
//
|
|
48
|
-
// Timed waiters do not follow the sleeping strategy employed by regular,
|
|
49
|
-
// non-timed threads. They sleep on the central mutex atomic through an
|
|
50
|
-
// extended futex() interface that allows sleeping with the same semantics for
|
|
51
|
-
// non-standard integer widths
|
|
52
|
-
//
|
|
53
|
-
// When a regular non-timed thread unlocks or enqueues on the mutex, and sees
|
|
54
|
-
// a timed waiter, it takes ownership of all the timed waiters. The thread
|
|
55
|
-
// that has taken ownership of the timed waiter releases the timed waiters
|
|
56
|
-
// when it gets a chance at the critical section. At which point it issues a
|
|
57
|
-
// wakeup to single timed waiter, timed waiters always issue wake() calls to
|
|
58
|
-
// other timed waiters
|
|
59
|
-
constexpr auto kTimedWaiter = std::uintptr_t{0b10};
|
|
60
|
-
|
|
61
|
-
// kUninitialized means that the thread has just enqueued, and has not yet
|
|
62
|
-
// gotten to initializing itself with the address of its successor
|
|
63
|
-
//
|
|
64
|
-
// this becomes significant for threads that are trying to wake up the
|
|
65
|
-
// uninitialized thread, if they see that the thread is not yet initialized,
|
|
66
|
-
// they can do nothing but spin, and wait for the thread to get initialized
|
|
67
|
-
//
|
|
68
|
-
// This also plays a role in the functioning of flat combining as implemented
|
|
69
|
-
// in DistributedMutex. When a thread owning the lock goes through the
|
|
70
|
-
// contention chain to either unlock the mutex or combine critical sections
|
|
71
|
-
// from the other end. The presence of kUninitialized means that the
|
|
72
|
-
// combining thread is not able to make progress after this point. So we
|
|
73
|
-
// transfer the lock.
|
|
74
|
-
constexpr auto kUninitialized = std::uint32_t{0b0};
|
|
75
|
-
// kWaiting will be set in the waiter's futex structs while they are spinning
|
|
76
|
-
// while waiting for the mutex
|
|
77
|
-
constexpr auto kWaiting = std::uint32_t{0b1};
|
|
78
|
-
// kWake will be set by threads that are waking up waiters that have enqueued
|
|
79
|
-
constexpr auto kWake = std::uint32_t{0b10};
|
|
80
|
-
// kSkipped will be set by a waker when they see that a waiter has been
|
|
81
|
-
// preempted away by the kernel, in this case the thread that got skipped will
|
|
82
|
-
// have to wake up and put itself back on the queue
|
|
83
|
-
constexpr auto kSkipped = std::uint32_t{0b11};
|
|
84
|
-
// kAboutToWait will be set by a waiter that enqueues itself with the purpose
|
|
85
|
-
// of waiting on a futex
|
|
86
|
-
constexpr auto kAboutToWait = std::uint32_t{0b100};
|
|
87
|
-
// kSleeping will be set by a waiter right before enqueueing on a futex. When
|
|
88
|
-
// a thread wants to wake up a waiter that has enqueued on a futex, it should
|
|
89
|
-
// set the futex to contain kWake
|
|
90
|
-
//
|
|
91
|
-
// a thread that is unlocking and wants to skip over a sleeping thread also
|
|
92
|
-
// calls futex_.exchange(kSleeping) on the sleeping thread's futex word. It
|
|
93
|
-
// does this to 1. detect whether the sleeping thread had actually gone to
|
|
94
|
-
// sleeping on the futex word so it can skip it, and 2. to synchronize with
|
|
95
|
-
// other non atomic writes in the sleeping thread's context (such as the write
|
|
96
|
-
// to track the next waiting thread).
|
|
97
|
-
//
|
|
98
|
-
// We reuse kSleeping instead of say using another constant kEarlyDelivery to
|
|
99
|
-
// avoid situations where a thread has to enter kernel mode due to calling
|
|
100
|
-
// futexWait() twice because of the presence of a waking thread. This
|
|
101
|
-
// situation can arise when an unlocking thread goes to skip over a sleeping
|
|
102
|
-
// thread, sees that the thread has slept and move on, but the sleeping thread
|
|
103
|
-
// had not yet entered futex(). This interleaving causes the thread calling
|
|
104
|
-
// futex() to return spuriously, as the futex word is not what it should be
|
|
105
|
-
constexpr auto kSleeping = std::uint32_t{0b101};
|
|
106
|
-
// kCombined is set by the lock holder to let the waiter thread know that its
|
|
107
|
-
// combine request was successfully completed by the lock holder. A
|
|
108
|
-
// successful combine means that the thread requesting the combine operation
|
|
109
|
-
// does not need to unlock the mutex; in fact, doing so would be an error.
|
|
110
|
-
constexpr auto kCombined = std::uint32_t{0b111};
|
|
111
|
-
// kCombineUninitialized is like kUninitialized but is set by a thread when it
|
|
112
|
-
// enqueues in hopes of getting its critical section combined with the lock
|
|
113
|
-
// holder
|
|
114
|
-
constexpr auto kCombineUninitialized = std::uint32_t{0b1000};
|
|
115
|
-
// kCombineWaiting is set by a thread when it is ready to have its combine
|
|
116
|
-
// record fulfilled by the lock holder. In particular, this signals to the
|
|
117
|
-
// lock holder that the thread has set its next_ pointer in the contention
|
|
118
|
-
// chain
|
|
119
|
-
constexpr auto kCombineWaiting = std::uint32_t{0b1001};
|
|
120
|
-
// kExceptionOccurred is set on the waiter futex when the remote task throws
|
|
121
|
-
// an exception. It is the caller's responsibility to retrieve the exception
|
|
122
|
-
// and rethrow it in their own context. Note that when the caller uses a
|
|
123
|
-
// noexcept function as their critical section, they can avoid checking for
|
|
124
|
-
// this value
|
|
125
|
-
//
|
|
126
|
-
// This allows us to avoid all cost of exceptions in the memory layout of the
|
|
127
|
-
// fast path (no errors) as exceptions are stored as an std::exception_ptr in
|
|
128
|
-
// the same union that stores the return value of the critical section. We
|
|
129
|
-
// also avoid all CPU overhead because the combiner uses a try-catch block
|
|
130
|
-
// without any additional branching to handle exceptions
|
|
131
|
-
constexpr auto kExceptionOccurred = std::uint32_t{0b1010};
|
|
132
|
-
|
|
133
|
-
// The number of spins that we are allowed to do before we resort to marking a
|
|
134
|
-
// thread as having slept
|
|
135
|
-
//
|
|
136
|
-
// This is just a magic number from benchmarks
|
|
137
|
-
constexpr auto kScheduledAwaySpinThreshold = std::chrono::nanoseconds{200};
|
|
138
|
-
// The maximum number of spins before a thread starts yielding its processor
|
|
139
|
-
// in hopes of getting skipped
|
|
140
|
-
constexpr auto kMaxSpins = 4000;
|
|
141
|
-
// The maximum number of contention chains we can resolve with flat combining.
|
|
142
|
-
// After this number of contention chains, the mutex falls back to regular
|
|
143
|
-
// two-phased mutual exclusion to ensure that we don't starve the combiner
|
|
144
|
-
// thread
|
|
145
|
-
constexpr auto kMaxCombineIterations = 2;
|
|
146
|
-
|
|
147
|
-
/**
|
|
148
|
-
* Write only data that is available to the thread that is waking up another.
|
|
149
|
-
* Only the waking thread is allowed to write to this, the thread to be woken
|
|
150
|
-
* is allowed to read from this after a wakeup has been issued
|
|
151
|
-
*/
|
|
152
|
-
template <template <typename> class Atomic>
|
|
153
|
-
class WakerMetadata {
|
|
154
|
-
public:
|
|
155
|
-
explicit WakerMetadata(
|
|
156
|
-
std::uintptr_t waker = 0,
|
|
157
|
-
std::uintptr_t waiters = 0,
|
|
158
|
-
std::uint32_t sleeper = kUninitialized)
|
|
159
|
-
: waker_{waker}, waiters_{waiters}, sleeper_{sleeper} {}
|
|
160
|
-
|
|
161
|
-
// This is the thread that initiated wakeups for the contention chain.
|
|
162
|
-
// There can only ever be one thread that initiates the wakeup for a
|
|
163
|
-
// chain in the spin only version of this mutex. When a thread that just
|
|
164
|
-
// woke up sees this as the next thread to wake up, it knows that it is the
|
|
165
|
-
// terminal node in the contention chain. This means that it was the one
|
|
166
|
-
// that took off the thread that had acquired the mutex off the centralized
|
|
167
|
-
// state. Therefore, the current thread is the last in its contention
|
|
168
|
-
// chain. It will fall back to centralized storage to pick up the next
|
|
169
|
-
// waiter or release the mutex
|
|
170
|
-
//
|
|
171
|
-
// When we move to a full sleeping implementation, this might need to change
|
|
172
|
-
// to a small_vector<> to account for failed wakeups, or we can put threads
|
|
173
|
-
// to sleep on the central futex, which is an easier implementation
|
|
174
|
-
// strategy. Although, since this is allocated on the stack, we can set a
|
|
175
|
-
// prohitively large threshold to avoid heap allocations, this strategy
|
|
176
|
-
// however, might cause increased cache misses on wakeup signalling
|
|
177
|
-
std::uintptr_t waker_{0};
|
|
178
|
-
// the list of threads that the waker had previously seen to be sleeping on
|
|
179
|
-
// a futex(),
|
|
180
|
-
//
|
|
181
|
-
// this is given to the current thread as a means to pass on
|
|
182
|
-
// information. When the current thread goes to unlock the mutex and does
|
|
183
|
-
// not see contention, it should go and wake up the head of this list. If
|
|
184
|
-
// the current thread sees a contention chain on the mutex, it should pass
|
|
185
|
-
// on this list to the next thread that gets woken up
|
|
186
|
-
std::uintptr_t waiters_{0};
|
|
187
|
-
// The futex that this waiter will sleep on
|
|
188
|
-
//
|
|
189
|
-
// how can we reuse futex_ from above for futex management?
|
|
190
|
-
Futex<Atomic> sleeper_{kUninitialized};
|
|
191
|
-
};
|
|
192
|
-
|
|
193
|
-
/**
|
|
194
|
-
* Type of the type-erased callable that is used for combining from the lock
|
|
195
|
-
* holder's end. This has 48 bytes of inline storage that can be used to
|
|
196
|
-
* minimize cache misses when combining
|
|
197
|
-
*/
|
|
198
|
-
using CombineFunction = detail::InlineFunctionRef<void(), 48>;
|
|
199
|
-
|
|
200
|
-
/**
|
|
201
|
-
* Waiter encapsulates the state required for waiting on the mutex, this
|
|
202
|
-
* contains potentially heavy state and is intended to be allocated on the
|
|
203
|
-
* stack as part of a lock() function call
|
|
204
|
-
*
|
|
205
|
-
* To ensure that synchronization does not cause unintended side effects on
|
|
206
|
-
* the rest of the thread stack (eg. metadata in lockImplementation(), or any
|
|
207
|
-
* other data in the user's thread), we aggresively pad this struct and use
|
|
208
|
-
* custom alignment internally to ensure that the relevant data fits within a
|
|
209
|
-
* single cacheline. The added alignment here also gives us some room to
|
|
210
|
-
* wiggle in the bottom few bits of the mutex, where we store extra metadata
|
|
211
|
-
*/
|
|
212
|
-
template <template <typename> class Atomic>
|
|
213
|
-
class Waiter {
|
|
214
|
-
public:
|
|
215
|
-
Waiter() {}
|
|
216
|
-
Waiter(Waiter&&) = delete;
|
|
217
|
-
Waiter(const Waiter&) = delete;
|
|
218
|
-
Waiter& operator=(Waiter&&) = delete;
|
|
219
|
-
Waiter& operator=(const Waiter&) = delete;
|
|
220
|
-
|
|
221
|
-
void initialize(std::uint64_t futex, CombineFunction task) {
|
|
222
|
-
// we only initialize the function if we were actually given a non-null
|
|
223
|
-
// task, otherwise
|
|
224
|
-
if (task) {
|
|
225
|
-
assert(futex == kCombineUninitialized);
|
|
226
|
-
new (&function_) CombineFunction(task);
|
|
227
|
-
} else {
|
|
228
|
-
assert((futex == kUninitialized) || (futex == kAboutToWait));
|
|
229
|
-
new (&metadata_) WakerMetadata<Atomic>{};
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
// this pedantic store is needed to ensure that the waking thread
|
|
233
|
-
// synchronizes with the state in the waiter struct when it loads the
|
|
234
|
-
// value of the futex word
|
|
235
|
-
//
|
|
236
|
-
// on x86, this gets optimized away to just a regular store, it might be
|
|
237
|
-
// needed on platforms where explicit acquire-release barriers are
|
|
238
|
-
// required for synchronization
|
|
239
|
-
//
|
|
240
|
-
// note that we release here at the end of the constructor because
|
|
241
|
-
// construction is complete here, any thread that acquires this release
|
|
242
|
-
// will see a well constructed wait node
|
|
243
|
-
futex_.store(futex, std::memory_order_release);
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
std::array<std::uint8_t, hardware_destructive_interference_size> padding1;
|
|
247
|
-
// the atomic that this thread will spin on while waiting for the mutex to
|
|
248
|
-
// be unlocked
|
|
249
|
-
alignas(hardware_destructive_interference_size) Atomic<std::uint64_t> futex_{
|
|
250
|
-
kUninitialized};
|
|
251
|
-
// The successor of this node. This will be the thread that had its address
|
|
252
|
-
// on the mutex previously
|
|
253
|
-
//
|
|
254
|
-
// We can do without making this atomic since the remote thread synchronizes
|
|
255
|
-
// on the futex variable above. If this were not atomic, the remote thread
|
|
256
|
-
// would only be allowed to read from it after the waiter has moved into the
|
|
257
|
-
// waiting state to avoid risk of a load racing with a write. However, it
|
|
258
|
-
// helps to make this atomic because we can use an unconditional load and make
|
|
259
|
-
// full use of the load buffer to coalesce both reads into a single clock
|
|
260
|
-
// cycle after the line arrives in the combiner core. This is a heavily
|
|
261
|
-
// contended line, so an RFO from the enqueueing thread is highly likely and
|
|
262
|
-
// has the potential to cause an immediate invalidation; blocking the combiner
|
|
263
|
-
// thread from making progress until the line is pulled back to read this
|
|
264
|
-
// value
|
|
265
|
-
//
|
|
266
|
-
// Further, making this atomic prevents the compiler from making an incorrect
|
|
267
|
-
// optimization where it does not load the value as written in the code, but
|
|
268
|
-
// rather dereferences it through a pointer whenever needed (since the value
|
|
269
|
-
// of the pointer to this is readily available on the stack). Doing this
|
|
270
|
-
// causes multiple invalidation requests from the enqueueing thread, blocking
|
|
271
|
-
// remote progress
|
|
272
|
-
//
|
|
273
|
-
// Note that we use relaxed loads and stores, so this should not have any
|
|
274
|
-
// additional overhead compared to a regular load on most architectures
|
|
275
|
-
std::atomic<std::uintptr_t> next_{0};
|
|
276
|
-
// We use an anonymous union for the combined critical section request and
|
|
277
|
-
// the metadata that will be filled in from the leader's end. Only one is
|
|
278
|
-
// active at a time - if a leader decides to combine the requested critical
|
|
279
|
-
// section into its execution, it will not touch the metadata field. If a
|
|
280
|
-
// leader decides to migrate the lock to the waiter, it will not touch the
|
|
281
|
-
// function
|
|
282
|
-
//
|
|
283
|
-
// this allows us to transfer more state when combining a critical section
|
|
284
|
-
// and reduce the cache misses originating from executing an arbitrary
|
|
285
|
-
// lambda
|
|
286
|
-
//
|
|
287
|
-
// note that this is an anonymous union, not an unnamed union, the members
|
|
288
|
-
// leak into the surrounding scope
|
|
289
|
-
union {
|
|
290
|
-
// metadata for the waker
|
|
291
|
-
WakerMetadata<Atomic> metadata_;
|
|
292
|
-
// The critical section that can potentially be combined into the critical
|
|
293
|
-
// section of the locking thread
|
|
294
|
-
//
|
|
295
|
-
// This is kept as a FunctionRef because the original function is preserved
|
|
296
|
-
// until the lock_combine() function returns. A consequence of using
|
|
297
|
-
// FunctionRef here is that we don't need to do any allocations and can
|
|
298
|
-
// allow users to capture unbounded state into the critical section. Flat
|
|
299
|
-
// combining means that the user does not have access to the thread
|
|
300
|
-
// executing the critical section, so assumptions about thread local
|
|
301
|
-
// references can be invalidated. Being able to capture arbitrary state
|
|
302
|
-
// allows the user to do thread local accesses right before the critical
|
|
303
|
-
// section and pass them as state to the callable being referenced here
|
|
304
|
-
CombineFunction function_;
|
|
305
|
-
// The user is allowed to use a combined critical section that returns a
|
|
306
|
-
// value. This buffer is used to implement the value transfer to the
|
|
307
|
-
// waiting thread. We reuse the same union because this helps us combine
|
|
308
|
-
// one synchronization operation with a material value transfer.
|
|
309
|
-
//
|
|
310
|
-
// The waker thread needs to synchronize on this cacheline to issue a
|
|
311
|
-
// wakeup to the waiter, meaning that the entire line needs to be pulled
|
|
312
|
-
// into the remote core in exclusive mode. So we reuse the coherence
|
|
313
|
-
// operation to transfer the return value in addition to the
|
|
314
|
-
// synchronization signal. In the case that the user's data item is
|
|
315
|
-
// small, the data is transferred all inline as part of the same line,
|
|
316
|
-
// which pretty much arrives into the CPU cache in the same clock cycle or
|
|
317
|
-
// two after a read-for-ownership request. This gives us a high chance of
|
|
318
|
-
// coalescing the entire transitive store buffer together into one cache
|
|
319
|
-
// coherence operation from the waker's end. This allows us to make use
|
|
320
|
-
// of the CPU bus bandwidth which would have otherwise gone to waste.
|
|
321
|
-
// Benchmarks prove this theory under a wide range of contention, value
|
|
322
|
-
// sizes, NUMA interactions and processor models
|
|
323
|
-
//
|
|
324
|
-
// The current version of the Intel optimization manual confirms this
|
|
325
|
-
// theory somewhat as well in section 2.3.5.1 (Load and Store Operation
|
|
326
|
-
// Overview)
|
|
327
|
-
//
|
|
328
|
-
// When an instruction writes data to a memory location [...], the
|
|
329
|
-
// processor ensures that it has the line containing this memory location
|
|
330
|
-
// is in its L1d cache [...]. If the cache line is not there, it fetches
|
|
331
|
-
// from the next levels using a RFO request [...] RFO and storing the
|
|
332
|
-
// data happens after instruction retirement. Therefore, the store
|
|
333
|
-
// latency usually does not affect the store instruction itself
|
|
334
|
-
//
|
|
335
|
-
// This gives the user the ability to input up to 48 bytes into the
|
|
336
|
-
// combined critical section through an InlineFunctionRef and output 48
|
|
337
|
-
// bytes from it basically without any cost. The type of the entity
|
|
338
|
-
// stored in the buffer has to be matched by the type erased callable that
|
|
339
|
-
// the caller has used. At this point, the caller is still in the
|
|
340
|
-
// template instantiation leading to the combine request, so it has
|
|
341
|
-
// knowledge of the return type and can apply the appropriate
|
|
342
|
-
// reinterpret_cast and launder operation to safely retrieve the data from
|
|
343
|
-
// this buffer
|
|
344
|
-
_t<std::aligned_storage<48, 8>> storage_;
|
|
345
|
-
};
|
|
346
|
-
std::array<std::uint8_t, hardware_destructive_interference_size> padding2;
|
|
347
|
-
};
|
|
348
|
-
|
|
349
|
-
/**
|
|
350
|
-
* A template that helps us differentiate between the different ways to return
|
|
351
|
-
* a value from a combined critical section. A return value of type void
|
|
352
|
-
* cannot be stored anywhere, so we use specializations and pick the right one
|
|
353
|
-
* switched through std::conditional_t
|
|
354
|
-
*
|
|
355
|
-
* This is then used by CoalescedTask and its family of functions to implement
|
|
356
|
-
* efficient return value transfers to the waiting threads
|
|
357
|
-
*/
|
|
358
|
-
template <typename Func>
|
|
359
|
-
class RequestWithReturn {
|
|
360
|
-
public:
|
|
361
|
-
using F = Func;
|
|
362
|
-
using ReturnType = decltype(std::declval<const Func&>()());
|
|
363
|
-
explicit RequestWithReturn(Func func) : func_{std::move(func)} {}
|
|
364
|
-
|
|
365
|
-
/**
|
|
366
|
-
* We need to define the destructor here because C++ requires (with good
|
|
367
|
-
* reason) that a union with non-default destructor be explicitly destroyed
|
|
368
|
-
* from the surrounding class, as neither the runtime nor compiler have the
|
|
369
|
-
* knowledge of what to do with a union at the time of destruction
|
|
370
|
-
*
|
|
371
|
-
* Each request that has a valid return value set will have the value
|
|
372
|
-
* retrieved from the get() method, where the value is destroyed. So we
|
|
373
|
-
* don't need to destroy it here
|
|
374
|
-
*/
|
|
375
|
-
~RequestWithReturn() {}
|
|
376
|
-
|
|
377
|
-
/**
|
|
378
|
-
* This method can be used to return a value from the request. This returns
|
|
379
|
-
* the underlying value because return type of the function we were
|
|
380
|
-
* instantiated with is not void
|
|
381
|
-
*/
|
|
382
|
-
ReturnType get() && {
|
|
383
|
-
// when the return value has been processed, we destroy the value
|
|
384
|
-
// contained in this request. Using a scope_exit means that we don't have
|
|
385
|
-
// to worry about storing the value somewhere and causing potentially an
|
|
386
|
-
// extra move
|
|
387
|
-
//
|
|
388
|
-
// note that the invariant here is that this function is only called if the
|
|
389
|
-
// requesting thread had it's critical section combined, and the value_
|
|
390
|
-
// member constructed through detach()
|
|
391
|
-
SCOPE_EXIT {
|
|
392
|
-
value_.~ReturnType();
|
|
393
|
-
};
|
|
394
|
-
return std::move(value_);
|
|
395
|
-
}
|
|
396
|
-
|
|
397
|
-
// this contains a copy of the function the waiter had requested to be
|
|
398
|
-
// executed as a combined critical section
|
|
399
|
-
Func func_;
|
|
400
|
-
// this stores the return value used in the request, we use a union here to
|
|
401
|
-
// avoid laundering and allow return types that are not default
|
|
402
|
-
// constructible to be propagated through the execution of the critical
|
|
403
|
-
// section
|
|
404
|
-
//
|
|
405
|
-
// note that this is an anonymous union, the member leaks into the
|
|
406
|
-
// surrounding scope as a member variable
|
|
407
|
-
union {
|
|
408
|
-
ReturnType value_;
|
|
409
|
-
};
|
|
410
|
-
};
|
|
411
|
-
|
|
412
|
-
template <typename Func>
|
|
413
|
-
class RequestWithoutReturn {
|
|
414
|
-
public:
|
|
415
|
-
using F = Func;
|
|
416
|
-
using ReturnType = void;
|
|
417
|
-
explicit RequestWithoutReturn(Func func) : func_{std::move(func)} {}
|
|
418
|
-
|
|
419
|
-
/**
|
|
420
|
-
* In this version of the request class, get() returns nothing as there is
|
|
421
|
-
* no stored value
|
|
422
|
-
*/
|
|
423
|
-
void get() && {}
|
|
424
|
-
|
|
425
|
-
// this contains a copy of the function the waiter had requested to be
|
|
426
|
-
// executed as a combined critical section
|
|
427
|
-
Func func_;
|
|
428
|
-
};
|
|
429
|
-
|
|
430
|
-
// we need to use std::integral_constant::value here as opposed to
|
|
431
|
-
// std::integral_constant::operator T() because MSVC errors out with the
|
|
432
|
-
// implicit conversion
|
|
433
|
-
template <typename Func>
|
|
434
|
-
using Request = _t<std::conditional<
|
|
435
|
-
std::is_void<decltype(std::declval<const Func&>()())>::value,
|
|
436
|
-
RequestWithoutReturn<Func>,
|
|
437
|
-
RequestWithReturn<Func>>>;
|
|
438
|
-
|
|
439
|
-
/**
|
|
440
|
-
* A template that helps us to transform a callable returning a value to one
|
|
441
|
-
* that returns void so it can be type erased and passed on to the waker. If
|
|
442
|
-
* the return value is small enough, it gets coalesced into the wait struct
|
|
443
|
-
* for optimal data transfer. When it's not small enough to fit in the waiter
|
|
444
|
-
* storage buffer, we place it on it's own cacheline with isolation to prevent
|
|
445
|
-
* false-sharing with the on-stack metadata of the waiter thread
|
|
446
|
-
*
|
|
447
|
-
* This helps a combined critical section feel more normal in the case where
|
|
448
|
-
* the user wants to return a value, for example
|
|
449
|
-
*
|
|
450
|
-
* auto value = mutex_.lock_combine([&]() {
|
|
451
|
-
* return data_.value();
|
|
452
|
-
* });
|
|
453
|
-
*
|
|
454
|
-
* Without this, the user would typically create a dummy object that they
|
|
455
|
-
* would then assign to from within the lambda. With return value chaining,
|
|
456
|
-
* this pattern feels more natural
|
|
457
|
-
*
|
|
458
|
-
* Note that it is important to copy the entire callble into this class.
|
|
459
|
-
* Storing something like a reference instead is not desirable because it does
|
|
460
|
-
* not allow InlineFunctionRef to use inline storage to represent the user's
|
|
461
|
-
* callable without extra indirections
|
|
462
|
-
*
|
|
463
|
-
* We use std::conditional_t and switch to the right type of task with the
|
|
464
|
-
* CoalescedTask type alias
|
|
465
|
-
*/
|
|
466
|
-
template <typename Func, typename Waiter>
|
|
467
|
-
class TaskWithCoalesce {
|
|
468
|
-
public:
|
|
469
|
-
using ReturnType = decltype(std::declval<const Func&>()());
|
|
470
|
-
using StorageType = folly::Unit;
|
|
471
|
-
explicit TaskWithCoalesce(Func func, Waiter& waiter)
|
|
472
|
-
: func_{std::move(func)}, waiter_(waiter) {}
|
|
473
|
-
|
|
474
|
-
void operator()() const {
|
|
475
|
-
auto value = func_();
|
|
476
|
-
new (&waiter_.storage_) ReturnType(std::move(value));
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
private:
|
|
480
|
-
Func func_;
|
|
481
|
-
Waiter& waiter_;
|
|
482
|
-
|
|
483
|
-
static_assert(!std::is_void<ReturnType>{}, "");
|
|
484
|
-
static_assert(alignof(decltype(waiter_.storage_)) >= alignof(ReturnType), "");
|
|
485
|
-
static_assert(sizeof(decltype(waiter_.storage_)) >= sizeof(ReturnType), "");
|
|
486
|
-
};
|
|
487
|
-
|
|
488
|
-
template <typename Func, typename Waiter>
|
|
489
|
-
class TaskWithoutCoalesce {
|
|
490
|
-
public:
|
|
491
|
-
using ReturnType = void;
|
|
492
|
-
using StorageType = folly::Unit;
|
|
493
|
-
explicit TaskWithoutCoalesce(Func func, Waiter&) : func_{std::move(func)} {}
|
|
494
|
-
|
|
495
|
-
void operator()() const {
|
|
496
|
-
func_();
|
|
497
|
-
}
|
|
498
|
-
|
|
499
|
-
private:
|
|
500
|
-
Func func_;
|
|
501
|
-
};
|
|
502
|
-
|
|
503
|
-
template <typename Func, typename Waiter>
|
|
504
|
-
class TaskWithBigReturnValue {
|
|
505
|
-
public:
|
|
506
|
-
// Using storage that is aligned on the cacheline boundary helps us avoid a
|
|
507
|
-
// situation where the data ends up being allocated on two separate
|
|
508
|
-
// cachelines. This would require the remote thread to pull in both lines
|
|
509
|
-
// to issue a write.
|
|
510
|
-
//
|
|
511
|
-
// We also isolate the storage by appending some padding to the end to
|
|
512
|
-
// ensure we avoid false-sharing with the metadata used while the waiter
|
|
513
|
-
// waits
|
|
514
|
-
using ReturnType = decltype(std::declval<const Func&>()());
|
|
515
|
-
static const auto kReturnValueAlignment = folly::kIsMsvc
|
|
516
|
-
? 8
|
|
517
|
-
: folly::constexpr_max(
|
|
518
|
-
alignof(ReturnType),
|
|
519
|
-
folly::hardware_destructive_interference_size);
|
|
520
|
-
using StorageType = _t<std::aligned_storage<
|
|
521
|
-
sizeof(
|
|
522
|
-
_t<std::aligned_storage<sizeof(ReturnType), kReturnValueAlignment>>),
|
|
523
|
-
kReturnValueAlignment>>;
|
|
524
|
-
|
|
525
|
-
explicit TaskWithBigReturnValue(Func func, Waiter&)
|
|
526
|
-
: func_{std::move(func)} {}
|
|
527
|
-
|
|
528
|
-
void operator()() const {
|
|
529
|
-
assert(storage_);
|
|
530
|
-
auto value = func_();
|
|
531
|
-
new (storage_) ReturnType(std::move(value));
|
|
532
|
-
}
|
|
533
|
-
|
|
534
|
-
void attach(StorageType* storage) {
|
|
535
|
-
assert(!storage_);
|
|
536
|
-
storage_ = storage;
|
|
537
|
-
}
|
|
538
|
-
|
|
539
|
-
private:
|
|
540
|
-
Func func_;
|
|
541
|
-
StorageType* storage_{nullptr};
|
|
542
|
-
|
|
543
|
-
static_assert(!std::is_void<ReturnType>{}, "");
|
|
544
|
-
static_assert(sizeof(Waiter::storage_) < sizeof(ReturnType), "");
|
|
545
|
-
};
|
|
546
|
-
|
|
547
|
-
template <typename T, bool>
|
|
548
|
-
struct Sizeof_;
|
|
549
|
-
template <typename T>
|
|
550
|
-
struct Sizeof_<T, false> : std::integral_constant<std::size_t, sizeof(T)> {};
|
|
551
|
-
template <typename T>
|
|
552
|
-
struct Sizeof_<T, true> : std::integral_constant<std::size_t, 0> {};
|
|
553
|
-
template <typename T>
|
|
554
|
-
struct Sizeof : Sizeof_<T, std::is_void<T>::value> {};
|
|
555
|
-
|
|
556
|
-
// we need to use std::integral_constant::value here as opposed to
|
|
557
|
-
// std::integral_constant::operator T() because MSVC errors out with the
|
|
558
|
-
// implicit conversion
|
|
559
|
-
template <typename Func, typename Waiter>
|
|
560
|
-
using CoalescedTask = _t<std::conditional<
|
|
561
|
-
std::is_void<decltype(std::declval<const Func&>()())>::value,
|
|
562
|
-
TaskWithoutCoalesce<Func, Waiter>,
|
|
563
|
-
_t<std::conditional<
|
|
564
|
-
Sizeof<decltype(std::declval<const Func&>()())>::value <=
|
|
565
|
-
sizeof(Waiter::storage_),
|
|
566
|
-
TaskWithCoalesce<Func, Waiter>,
|
|
567
|
-
TaskWithBigReturnValue<Func, Waiter>>>>>;
|
|
568
|
-
|
|
569
|
-
/**
|
|
570
|
-
* Given a request and a wait node, coalesce them into a CoalescedTask that
|
|
571
|
-
* coalesces the return value into the wait node when invoked from a remote
|
|
572
|
-
* thread
|
|
573
|
-
*
|
|
574
|
-
* When given a null request through nullptr_t, coalesce() returns null as well
|
|
575
|
-
*/
|
|
576
|
-
template <typename Waiter>
|
|
577
|
-
std::nullptr_t coalesce(std::nullptr_t&, Waiter&) {
|
|
578
|
-
return nullptr;
|
|
579
|
-
}
|
|
580
|
-
|
|
581
|
-
template <
|
|
582
|
-
typename Request,
|
|
583
|
-
typename Waiter,
|
|
584
|
-
typename Func = typename Request::F>
|
|
585
|
-
CoalescedTask<Func, Waiter> coalesce(Request& request, Waiter& waiter) {
|
|
586
|
-
static_assert(!std::is_same<Request, std::nullptr_t>{}, "");
|
|
587
|
-
return CoalescedTask<Func, Waiter>{request.func_, waiter};
|
|
588
|
-
}
|
|
589
|
-
|
|
590
|
-
/**
 * Given a task, create storage for the return value.
 *
 * This overload handles the null-task case (plain lock()): no return value
 * will ever be transferred, so the "storage" is just nullptr.
 */
inline std::nullptr_t makeReturnValueStorageFor(std::nullptr_t& /* task */) {
  return nullptr;
}
|
|
598
|
-
|
|
599
|
-
/**
 * Create return-value storage for a real coalesced task: an instance of the
 * task's own StorageType, value-initialized on the caller's stack.  The
 * storage is only actually used by TaskWithBigReturnValue, where the result
 * cannot fit in the wait node itself.
 */
template <
    typename CoalescedTask,
    typename StorageType = typename CoalescedTask::StorageType>
StorageType makeReturnValueStorageFor(CoalescedTask& /* task */) {
  return StorageType{};
}
|
|
605
|
-
|
|
606
|
-
/**
|
|
607
|
-
* Given a task and storage, attach them together if needed. This only helps
|
|
608
|
-
* when we have a task that returns a value bigger than can be coalesced. In
|
|
609
|
-
* that case, we need to attach the storage with the task so the return value
|
|
610
|
-
* can be transferred to this thread from the remote thread
|
|
611
|
-
*/
|
|
612
|
-
template <typename Task, typename Storage>
|
|
613
|
-
void attach(Task&, Storage&) {
|
|
614
|
-
static_assert(
|
|
615
|
-
std::is_same<Storage, std::nullptr_t>{} ||
|
|
616
|
-
std::is_same<Storage, folly::Unit>{},
|
|
617
|
-
"");
|
|
618
|
-
}
|
|
619
|
-
|
|
620
|
-
template <
|
|
621
|
-
typename R,
|
|
622
|
-
typename W,
|
|
623
|
-
typename StorageType = typename TaskWithBigReturnValue<R, W>::StorageType>
|
|
624
|
-
void attach(TaskWithBigReturnValue<R, W>& task, StorageType& storage) {
|
|
625
|
-
task.attach(&storage);
|
|
626
|
-
}
|
|
627
|
-
|
|
628
|
-
// If the remote (combining) thread reported that the user's callable threw,
// recover the std::exception_ptr that was placement-constructed into the
// waiter's inline storage and rethrow it on this thread.
//
// The Request parameter is unused at runtime; it only carries the callable
// type F so we can skip the check entirely for noexcept callables.
template <typename Request, typename Waiter>
void throwIfExceptionOccurred(Request&, Waiter& waiter, bool exception) {
  using Storage = decltype(waiter.storage_);
  using F = typename Request::F;
  // the waiter's raw storage doubles as space for the exception_ptr, so it
  // must be big enough and suitably aligned for one
  static_assert(sizeof(Storage) >= sizeof(std::exception_ptr), "");
  static_assert(alignof(Storage) >= alignof(std::exception_ptr), "");

  // we only need to check for an exception in the waiter struct if the passed
  // callable is not noexcept
  //
  // we need to make another instance of the exception with automatic storage
  // duration and destroy the exception held in the storage *before throwing* to
  // avoid leaks. If we don't destroy the exception_ptr in storage, the
  // refcount for the internal exception will never hit zero, thereby leaking
  // memory
  if ((!noexcept(std::declval<const F&>()()) && exception)) {
    auto storage = &waiter.storage_;
    // launder: the exception_ptr was placement-new'd into raw storage
    auto exc = folly::launder(reinterpret_cast<std::exception_ptr*>(storage));
    auto copy = std::move(*exc);
    // end the lifetime of the in-storage exception_ptr before we throw
    exc->std::exception_ptr::~exception_ptr();
    std::rethrow_exception(std::move(copy));
  }
}
|
|
651
|
-
|
|
652
|
-
/**
 * Given a CoalescedTask, a wait node and a request, detach the return value
 * into the request from the wait node and task.
 *
 * Null-request overload: a plain lock() never gets combined, so there can be
 * no return value and, crucially, no remotely-thrown exception either.
 */
template <typename Waiter>
void detach(
    std::nullptr_t& /* request */,
    Waiter& /* waiter */,
    bool exception,
    std::nullptr_t& /* storage */) {
  // a combiner only runs user code for real requests; seeing an exception
  // signal here would indicate protocol corruption
  assert(!exception);
}
|
|
660
|
-
|
|
661
|
-
template <typename Waiter, typename F>
|
|
662
|
-
void detach(
|
|
663
|
-
RequestWithoutReturn<F>& request,
|
|
664
|
-
Waiter& waiter,
|
|
665
|
-
bool exception,
|
|
666
|
-
folly::Unit&) {
|
|
667
|
-
throwIfExceptionOccurred(request, waiter, exception);
|
|
668
|
-
}
|
|
669
|
-
|
|
670
|
-
// Detach overload for value-returning requests whose result fit inline in
// the wait node: move the result out of the waiter's raw storage into the
// request, ending the lifetime of the in-storage copy.
template <typename Waiter, typename F>
void detach(
    RequestWithReturn<F>& request,
    Waiter& waiter,
    bool exception,
    folly::Unit&) {
  // an exception from the combiner preempts any return value
  throwIfExceptionOccurred(request, waiter, exception);

  using ReturnType = typename RequestWithReturn<F>::ReturnType;
  static_assert(!std::is_same<ReturnType, void>{}, "");
  // the inline path is only selected when the value fits in storage_
  static_assert(sizeof(waiter.storage_) >= sizeof(ReturnType), "");

  // launder: the value was placement-constructed into raw storage by the
  // remote thread; move it into the request and destroy the original
  auto& val = *folly::launder(reinterpret_cast<ReturnType*>(&waiter.storage_));
  new (&request.value_) ReturnType(std::move(val));
  val.~ReturnType();
}
|
|
686
|
-
|
|
687
|
-
// Detach overload for value-returning requests whose result was too big to
// coalesce inline: the value lives in the external storage that attach()
// wired to the task, not in the wait node.  Move it into the request and
// destroy the original.
template <typename Waiter, typename F, typename Storage>
void detach(
    RequestWithReturn<F>& request,
    Waiter& waiter,
    bool exception,
    Storage& storage) {
  // an exception from the combiner preempts any return value
  throwIfExceptionOccurred(request, waiter, exception);

  using ReturnType = typename RequestWithReturn<F>::ReturnType;
  static_assert(!std::is_same<ReturnType, void>{}, "");
  static_assert(sizeof(storage) >= sizeof(ReturnType), "");

  // launder: the remote thread placement-constructed the value into this
  // raw storage buffer
  auto& val = *folly::launder(reinterpret_cast<ReturnType*>(&storage));
  new (&request.value_) ReturnType(std::move(val));
  val.~ReturnType();
}
|
|
703
|
-
|
|
704
|
-
/**
|
|
705
|
-
* Get the time since epoch in nanoseconds
|
|
706
|
-
*
|
|
707
|
-
* This is faster than std::chrono::steady_clock because it avoids a VDSO
|
|
708
|
-
* access to get the timestamp counter
|
|
709
|
-
*
|
|
710
|
-
* Note that the hardware timestamp counter on x86, like std::steady_clock is
|
|
711
|
-
* guaranteed to be monotonically increasing -
|
|
712
|
-
* https://c9x.me/x86/html/file_module_x86_id_278.html
|
|
713
|
-
*/
|
|
714
|
-
inline std::chrono::nanoseconds time() {
|
|
715
|
-
return std::chrono::nanoseconds{hardware_timestamp()};
|
|
716
|
-
}
|
|
717
|
-
|
|
718
|
-
/**
|
|
719
|
-
* Zero out the other bits used by the implementation and return just an
|
|
720
|
-
* address from a uintptr_t
|
|
721
|
-
*/
|
|
722
|
-
template <typename Type>
|
|
723
|
-
Type* extractPtr(std::uintptr_t from) {
|
|
724
|
-
// shift one bit off the end, to get all 1s followed by a single 0
|
|
725
|
-
auto mask = std::numeric_limits<std::uintptr_t>::max();
|
|
726
|
-
mask >>= 1;
|
|
727
|
-
mask <<= 1;
|
|
728
|
-
assert(!(mask & 0b1));
|
|
729
|
-
|
|
730
|
-
return folly::bit_cast<Type*>(from & mask);
|
|
731
|
-
}
|
|
732
|
-
|
|
733
|
-
/**
 * Strips the given nanoseconds into only the least significant 56 bits by
 * moving the least significant 56 bits over by 8 zeroing out the bottom 8
 * bits to be used as a medium of information transfer for the thread wait
 * nodes
 */
inline std::uint64_t strip(std::chrono::nanoseconds t) {
  // shift the tick count up by one byte; the freed-up low byte carries the
  // wait-mode signal alongside the timestamp in a single futex word
  return static_cast<std::uint64_t>(t.count()) << 8;
}
|
|
743
|
-
|
|
744
|
-
/**
 * Recover the timestamp value from an integer that has the timestamp encoded
 * in it
 *
 * Inverse of strip(): discards the bottom byte (the wait-mode signal) and
 * returns the 56-bit tick count.
 */
inline std::uint64_t recover(std::uint64_t from) {
  auto const ticks = from >> 8;
  return ticks;
}
|
|
751
|
-
|
|
752
|
-
// Move-only token returned by lock()/try_lock() and consumed by unlock().
// It carries everything the unlocking thread needs to continue the wakeup
// protocol; operator bool reports whether the acquisition succeeded.
template <template <typename> class Atomic, bool TimePublishing>
class DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy {
 public:
  // DistributedMutexStateProxy is move constructible and assignable for
  // convenience
  DistributedMutexStateProxy(DistributedMutexStateProxy&& other) {
    *this = std::move(other);
  }

  DistributedMutexStateProxy& operator=(DistributedMutexStateProxy&& other) {
    // moving over a still-valid (lock-holding) proxy would silently drop the
    // lock, so it is forbidden
    assert(!(*this));

    // take each field and reset the source to its "empty" value so the
    // moved-from proxy reads as invalid
    next_ = folly::exchange(other.next_, nullptr);
    expected_ = folly::exchange(other.expected_, 0);
    timedWaiters_ = folly::exchange(other.timedWaiters_, false);
    combined_ = folly::exchange(other.combined_, false);
    waker_ = folly::exchange(other.waker_, 0);
    waiters_ = folly::exchange(other.waiters_, nullptr);
    ready_ = folly::exchange(other.ready_, nullptr);

    return *this;
  }

  // The proxy is valid when a mutex acquisition attempt was successful,
  // lock() is guaranteed to return a valid proxy, try_lock() is not
  explicit operator bool() const {
    return expected_;
  }

  // private:
  // friend the mutex class, since that will be accessing state private to
  // this class
  friend class DistributedMutex<Atomic, TimePublishing>;

  DistributedMutexStateProxy(
      Waiter<Atomic>* next,
      std::uintptr_t expected,
      bool timedWaiter = false,
      bool combined = false,
      std::uintptr_t waker = 0,
      Waiter<Atomic>* waiters = nullptr,
      Waiter<Atomic>* ready = nullptr)
      : next_{next},
        expected_{expected},
        timedWaiters_{timedWaiter},
        combined_{combined},
        waker_{waker},
        waiters_{waiters},
        ready_{ready} {}

  // the next thread that is to be woken up, this being null at the time of
  // unlock() shows that the current thread acquired the mutex without
  // contention or it was the terminal thread in the queue of threads waking up
  Waiter<Atomic>* next_{nullptr};
  // this is the value that the current thread should expect to find on
  // unlock, and if this value is not there on unlock, the current thread
  // should assume that other threads are enqueued waiting for the mutex
  //
  // note that if the mutex has the same state set at unlock time, and this is
  // set to an address (and not say kLocked in the case of a terminal waker)
  // then it must have been the case that no other thread had enqueued itself,
  // since threads in the domain of this mutex do not share stack space
  //
  // if we want to support stack sharing, we can solve the problem by looping
  // at lock time, and setting a variable that says whether we have acquired
  // the lock or not perhaps
  std::uintptr_t expected_{0};
  // a boolean that will be set when the mutex has timed waiters that the
  // current thread is responsible for waking, in such a case, the current
  // thread will issue an atomic_notify_one() call after unlocking the mutex
  //
  // note that a timed waiter will itself always have this flag set. This is
  // done so we can avoid having to issue a atomic_notify_all() call (and
  // subsequently a thundering herd) when waking up timed-wait threads
  bool timedWaiters_{false};
  // a boolean that contains true if the state proxy is not meant to be passed
  // to the unlock() function. This is set only when there is contention and
  // a thread had asked for its critical section to be combined
  bool combined_{false};
  // metadata passed along from the thread that woke this thread up
  std::uintptr_t waker_{0};
  // the list of threads that are waiting on a futex
  //
  // the current threads is meant to wake up this list of waiters if it is
  // able to commit an unlock() on the mutex without seeing a contention chain
  Waiter<Atomic>* waiters_{nullptr};
  // after a thread has woken up from a futex() call, it will have the rest of
  // the threads that it were waiting behind it in this list, a thread that
  // unlocks has to wake up threads from this list if it has any, before it
  // goes to sleep to prevent pathological unfairness
  Waiter<Atomic>* ready_{nullptr};
};
|
|
844
|
-
|
|
845
|
-
// Default constructor: the mutex starts out unlocked; the single atomic
// word state_ holds the entire centralized lock state.
template <template <typename> class Atomic, bool TimePublishing>
DistributedMutex<Atomic, TimePublishing>::DistributedMutex()
    : state_{kUnlocked} {}
|
|
848
|
-
|
|
849
|
-
// Publish this waiter's liveness timestamp (and wait mode) into its futex
// word so the lock holder can decide whether to hand off, combine, or skip
// this thread.  Returns the low byte of the previous futex word, which is
// the signal (if any) the remote thread left for us.
template <typename Waiter>
std::uint64_t publish(
    std::uint64_t spins,
    bool& shouldPublish,
    std::chrono::nanoseconds& previous,
    Waiter& waiter,
    std::uint32_t waitMode) {
  // time publishing has some overhead because it executes an atomic exchange on
  // the futex word. If this line is in a remote thread (eg. the combiner),
  // then each time we publish a timestamp, this thread has to submit an RFO to
  // the remote core for the cacheline, blocking progress for both threads.
  //
  // the remote core uses a store in the fast path - why then does an RFO make a
  // difference? The only educated guess we have here is that the added
  // roundtrip delays draining of the store buffer, which essentially exerts
  // backpressure on future stores, preventing parallelization
  //
  // if we have requested a combine, time publishing is less important as it
  // only comes into play when the combiner has exhausted their max combine
  // passes. So we defer time publishing to the point when the current thread
  // gets preempted
  auto current = time();
  if ((current - previous) >= kScheduledAwaySpinThreshold) {
    shouldPublish = true;
  }
  previous = current;

  // if we have requested a combine, and this is the first iteration of the
  // wait-loop, we publish a max timestamp to optimistically convey that we have
  // not yet been preempted (the remote knows the meaning of max timestamps)
  //
  // then if we are under the maximum number of spins allowed before sleeping,
  // we publish the exact timestamp, otherwise we publish the minimum possible
  // timestamp to force the waking thread to skip us
  auto now = ((waitMode == kCombineWaiting) && !spins)
      ? decltype(time())::max()
      : (spins < kMaxSpins) ? previous : decltype(time())::zero();

  // the wait mode information is published in the bottom 8 bits of the futex
  // word, the rest contains time information as computed above. Overflows are
  // not really a correctness concern because time publishing is only a
  // heuristic. This leaves us 56 bits of nanoseconds (2 years) before we hit
  // two consecutive wraparounds, so the lack of bits to respresent time is
  // neither a performance nor correctness concern
  auto data = strip(now) | waitMode;
  // only pay for the exchange (and the RFO it causes on the remote core) when
  // we actually need to publish; otherwise a plain acquire load suffices to
  // pick up any signal
  auto signal = (shouldPublish || !spins || (waitMode != kCombineWaiting))
      ? waiter.futex_.exchange(data, std::memory_order_acq_rel)
      : waiter.futex_.load(std::memory_order_acquire);
  return signal & std::numeric_limits<std::uint8_t>::max();
}
|
|
899
|
-
|
|
900
|
-
// Busy-wait on this waiter's futex word until the lock holder leaves a
// terminal signal.  Writes the observed signal into sig and returns true if
// the thread was woken/combined, false if it was skipped (in which case the
// caller must retry, possibly via a real futex sleep).
template <typename Waiter>
bool spin(Waiter& waiter, std::uint32_t& sig, std::uint32_t mode) {
  auto spins = std::uint64_t{0};
  auto waitMode = (mode == kCombineUninitialized) ? kCombineWaiting : kWaiting;
  auto previous = time();
  auto shouldPublish = false;
  while (true) {
    // publish liveness and read back any signal left by the remote thread
    auto signal = publish(spins++, shouldPublish, previous, waiter, waitMode);

    // if we got skipped, make a note of it and return if we got a skipped
    // signal or a signal to wake up
    auto skipped = (signal == kSkipped);
    auto combined = (signal == kCombined);
    auto exceptionOccurred = (signal == kExceptionOccurred);
    auto woken = (signal == kWake);
    if (skipped || woken || combined || exceptionOccurred) {
      sig = static_cast<std::uint32_t>(signal);
      return !skipped;
    }

    // if we are under the spin threshold, pause to allow the other
    // hyperthread to run. If not, then sleep
    if (spins < kMaxSpins) {
      asm_volatile_pause();
    } else {
      Sleeper::sleep();
    }
  }
}
|
|
929
|
-
|
|
930
|
-
template <typename Waiter>
|
|
931
|
-
void doFutexWake(Waiter* waiter) {
|
|
932
|
-
if (waiter) {
|
|
933
|
-
// We can use a simple store operation here and not worry about checking
|
|
934
|
-
// to see if the thread had actually started waiting on the futex, that is
|
|
935
|
-
// already done in tryWake() when a sleeping thread is collected
|
|
936
|
-
//
|
|
937
|
-
// We now do not know whether the waiter had already enqueued on the futex
|
|
938
|
-
// or whether it had just stored kSleeping in its futex and was about to
|
|
939
|
-
// call futexWait(). We treat both these scenarios the same
|
|
940
|
-
//
|
|
941
|
-
// the below can theoretically cause a problem if we set the
|
|
942
|
-
// wake signal and the waiter was in between setting kSleeping in its
|
|
943
|
-
// futex and enqueueing on the futex. In this case the waiter will just
|
|
944
|
-
// return from futexWait() immediately. This leaves the address that the
|
|
945
|
-
// waiter was using for futexWait() possibly dangling, and the thread that
|
|
946
|
-
// we woke in the exchange above might have used that address for some
|
|
947
|
-
// other object
|
|
948
|
-
//
|
|
949
|
-
// however, even if the thread had indeed woken up simply becasue of the
|
|
950
|
-
// above exchange(), the futexWake() below is not incorrect. It is not
|
|
951
|
-
// incorrect because futexWake() does not actually change the memory of
|
|
952
|
-
// the futex word. It just uses the address to do a lookup in the kernel
|
|
953
|
-
// futex table. And even if we call futexWake() on some other address,
|
|
954
|
-
// and that address was being used to wait on futex() that thread will
|
|
955
|
-
// protect itself from spurious wakeups, check the value in the futex word
|
|
956
|
-
// and enqueue itself back on the futex
|
|
957
|
-
//
|
|
958
|
-
// this dangilng pointer possibility is why we use a pointer to the futex
|
|
959
|
-
// word, and avoid dereferencing after the store() operation
|
|
960
|
-
auto sleeper = &waiter->metadata_.sleeper_;
|
|
961
|
-
sleeper->store(kWake, std::memory_order_release);
|
|
962
|
-
futexWake(sleeper, 1);
|
|
963
|
-
}
|
|
964
|
-
}
|
|
965
|
-
|
|
966
|
-
// Park this waiter on its per-node futex until a waker stores kWake.
// Returns true if an unlocking thread handed us the lock before we could
// sleep ("early delivery"); returns false after a real wakeup, in which case
// next is repopulated with the rest of the sleeper list we are responsible
// for.
template <typename Waiter>
bool doFutexWait(Waiter* waiter, Waiter*& next) {
  // first we get ready to sleep by calling exchange() on the futex with a
  // kSleeping value
  assert(waiter->futex_.load(std::memory_order_relaxed) == kAboutToWait);

  // note the semantics of using a futex here, when we exchange the sleeper_
  // with kSleeping, we are getting ready to sleep, but before sleeping we get
  // ready to sleep, and we return from futexWait() when the value of
  // sleeper_ might have changed. We can also wake up because of a spurious
  // wakeup, so we always check against the value in sleeper_ after returning
  // from futexWait(), if the value is not kWake, then we continue
  auto pre =
      waiter->metadata_.sleeper_.exchange(kSleeping, std::memory_order_acq_rel);

  // Seeing a kSleeping on a futex word before we set it ourselves means only
  // one thing - an unlocking thread caught us before we went to futex(), and
  // we now have the lock, so we abort
  //
  // if we were given an early delivery, we can return from this function with
  // a true, meaning that we now have the lock
  if (pre == kSleeping) {
    return true;
  }

  // if we reach here then were were not given an early delivery, and any
  // thread that goes to wake us up will see a consistent view of the rest of
  // the contention chain (since the next_ variable is set before the
  // kSleeping exchange above)
  while (pre != kWake) {
    // before enqueueing on the futex, we wake any waiters that we were
    // possibly responsible for
    doFutexWake(folly::exchange(next, nullptr));

    // then we wait on the futex
    //
    // note that we have to protect ourselves against spurious wakeups here.
    // Because the corresponding futexWake() above does not synchronize
    // wakeups around the futex word. Because doing so would become
    // inefficient
    futexWait(&waiter->metadata_.sleeper_, kSleeping);
    pre = waiter->metadata_.sleeper_.load(std::memory_order_acquire);
    assert((pre == kSleeping) || (pre == kWake));
  }

  // when coming out of a futex, we might have some other sleeping threads
  // that we were supposed to wake up, assign that to the next pointer
  assert(next == nullptr);
  next = extractPtr<Waiter>(waiter->next_.load(std::memory_order_relaxed));
  return false;
}
|
|
1017
|
-
|
|
1018
|
-
template <typename Waiter>
|
|
1019
|
-
bool wait(Waiter* waiter, std::uint32_t mode, Waiter*& next, uint32_t& signal) {
|
|
1020
|
-
if (mode == kAboutToWait) {
|
|
1021
|
-
return doFutexWait(waiter, next);
|
|
1022
|
-
}
|
|
1023
|
-
|
|
1024
|
-
return spin(*waiter, signal, mode);
|
|
1025
|
-
}
|
|
1026
|
-
|
|
1027
|
-
inline void recordTimedWaiterAndClearTimedBit(
|
|
1028
|
-
bool& timedWaiter,
|
|
1029
|
-
std::uintptr_t& previous) {
|
|
1030
|
-
// the previous value in the mutex can never be kTimedWaiter, timed waiters
|
|
1031
|
-
// always set (kTimedWaiter | kLocked) in the mutex word when they try and
|
|
1032
|
-
// acquire the mutex
|
|
1033
|
-
assert(previous != kTimedWaiter);
|
|
1034
|
-
|
|
1035
|
-
if ((previous & kTimedWaiter)) {
|
|
1036
|
-
// record whether there was a timed waiter in the previous mutex state, and
|
|
1037
|
-
// clear the timed bit from the previous state
|
|
1038
|
-
timedWaiter = true;
|
|
1039
|
-
previous = previous & (~kTimedWaiter);
|
|
1040
|
-
}
|
|
1041
|
-
}
|
|
1042
|
-
|
|
1043
|
-
/**
 * Notify a single timed waiter parked on the central mutex word, if the
 * caller inherited responsibility for one.  One notify (not notify-all) is
 * enough because every timed waiter re-arms the flag for the next.
 */
template <typename Atomic>
void wakeTimedWaiters(Atomic* state, bool timedWaiters) {
  if (timedWaiters) {
    atomic_notify_one(state);
  }
}
|
|
1049
|
-
|
|
1050
|
-
// Run func as a combined critical section: under contention the current
// lock holder may execute func on our behalf; otherwise we acquire the lock,
// run func locally, and unlock.  Returns func's result either way.
template <template <typename> class Atomic, bool TimePublishing>
template <typename Func>
auto DistributedMutex<Atomic, TimePublishing>::lock_combine(Func func)
    -> decltype(std::declval<const Func&>()()) {
  // invoke the lock implementation function and check whether we came out of
  // it with our task executed as a combined critical section. This usually
  // happens when the mutex is contended.
  //
  // In the absence of contention, we just return from the try_lock() function
  // with the lock acquired. So we need to invoke the task and unlock
  // the mutex before returning
  auto&& task = Request<Func>{func};
  auto&& state = lockImplementation(*this, state_, task);
  if (!state.combined_) {
    // to avoid having to play a return-value dance when the combinable
    // returns void, we use a scope exit to perform the unlock after the
    // function return has been processed
    SCOPE_EXIT {
      unlock(std::move(state));
    };
    return func();
  }

  // if we are here, that means we were able to get our request combined, we
  // can return the value that was transferred to us
  //
  // each thread that enqueues as a part of a contention chain takes up the
  // responsibility of any timed waiter that had come immediately before it,
  // so we wake up timed waiters before exiting the lock function. Another
  // strategy might be to add the timed waiter information to the metadata and
  // let a single leader wake up a timed waiter for better concurrency. But
  // this has proven not to be useful in benchmarks beyond a small 5% delta,
  // so we avoid taking the complexity hit and branch to wake up timed waiters
  // from each thread
  wakeTimedWaiters(&state_, state.timedWaiters_);
  return std::move(task).get();
}
|
|
1087
|
-
|
|
1088
|
-
template <template <typename> class Atomic, bool TimePublishing>
|
|
1089
|
-
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
|
|
1090
|
-
DistributedMutex<Atomic, TimePublishing>::lock() {
|
|
1091
|
-
auto null = nullptr;
|
|
1092
|
-
return lockImplementation(*this, state_, null);
|
|
1093
|
-
}
|
|
1094
|
-
|
|
1095
|
-
template <typename Atomic, template <typename> class A, bool T>
|
|
1096
|
-
auto tryLockNoLoad(Atomic& atomic, DistributedMutex<A, T>&)
|
|
1097
|
-
-> typename DistributedMutex<A, T>::DistributedMutexStateProxy {
|
|
1098
|
-
// Try and set the least significant bit of the centralized lock state to 1,
|
|
1099
|
-
// if this succeeds, it must have been the case that we had a kUnlocked (or
|
|
1100
|
-
// 0) in the central storage before, since that is the only case where a 0
|
|
1101
|
-
// can be found in the least significant bit
|
|
1102
|
-
//
|
|
1103
|
-
// If this fails, then it is a no-op
|
|
1104
|
-
using Proxy = typename DistributedMutex<A, T>::DistributedMutexStateProxy;
|
|
1105
|
-
auto previous = atomic_fetch_set(atomic, 0, std::memory_order_acquire);
|
|
1106
|
-
if (!previous) {
|
|
1107
|
-
return Proxy{nullptr, kLocked};
|
|
1108
|
-
}
|
|
1109
|
-
|
|
1110
|
-
return Proxy{nullptr, 0};
|
|
1111
|
-
}
|
|
1112
|
-
|
|
1113
|
-
template <template <typename> class Atomic, bool TimePublishing>
|
|
1114
|
-
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
|
|
1115
|
-
DistributedMutex<Atomic, TimePublishing>::try_lock() {
|
|
1116
|
-
// The lock attempt below requires an expensive atomic fetch-and-mutate or
|
|
1117
|
-
// an even more expensive atomic compare-and-swap loop depending on the
|
|
1118
|
-
// platform. These operations require pulling the lock cacheline into the
|
|
1119
|
-
// current core in exclusive mode and are therefore hard to parallelize
|
|
1120
|
-
//
|
|
1121
|
-
// This probabilistically avoids the expense by first checking whether the
|
|
1122
|
-
// mutex is currently locked
|
|
1123
|
-
if (state_.load(std::memory_order_relaxed) != kUnlocked) {
|
|
1124
|
-
return DistributedMutexStateProxy{nullptr, 0};
|
|
1125
|
-
}
|
|
1126
|
-
|
|
1127
|
-
return tryLockNoLoad(state_, *this);
|
|
1128
|
-
}
|
|
1129
|
-
|
|
1130
|
-
template <
|
|
1131
|
-
template <typename> class Atomic,
|
|
1132
|
-
bool TimePublishing,
|
|
1133
|
-
typename State,
|
|
1134
|
-
typename Request>
|
|
1135
|
-
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
|
|
1136
|
-
lockImplementation(
|
|
1137
|
-
DistributedMutex<Atomic, TimePublishing>& mutex,
|
|
1138
|
-
State& atomic,
|
|
1139
|
-
Request& request) {
|
|
1140
|
-
// first try and acquire the lock as a fast path, the underlying
|
|
1141
|
-
// implementation is slightly faster than using std::atomic::exchange() as
|
|
1142
|
-
// is used in this function. So we get a small perf boost in the
|
|
1143
|
-
// uncontended case
|
|
1144
|
-
//
|
|
1145
|
-
// We only go through this fast path for the lock/unlock usage and avoid this
|
|
1146
|
-
// for combined critical sections. This check adds unnecessary overhead in
|
|
1147
|
-
// that case as it causes an extra cacheline bounce
|
|
1148
|
-
constexpr auto combineRequested = !std::is_same<Request, std::nullptr_t>{};
|
|
1149
|
-
if (!combineRequested) {
|
|
1150
|
-
if (auto state = tryLockNoLoad(atomic, mutex)) {
|
|
1151
|
-
return state;
|
|
1152
|
-
}
|
|
1153
|
-
}
|
|
1154
|
-
|
|
1155
|
-
auto previous = std::uintptr_t{0};
|
|
1156
|
-
auto waitMode = combineRequested ? kCombineUninitialized : kUninitialized;
|
|
1157
|
-
auto nextWaitMode = kAboutToWait;
|
|
1158
|
-
auto timedWaiter = false;
|
|
1159
|
-
Waiter<Atomic>* nextSleeper = nullptr;
|
|
1160
|
-
while (true) {
|
|
1161
|
-
// construct the state needed to wait
|
|
1162
|
-
//
|
|
1163
|
-
// We can't use auto here because MSVC errors out due to a missing copy
|
|
1164
|
-
// constructor
|
|
1165
|
-
Waiter<Atomic> state{};
|
|
1166
|
-
auto&& task = coalesce(request, state);
|
|
1167
|
-
auto&& storage = makeReturnValueStorageFor(task);
|
|
1168
|
-
auto&& address = folly::bit_cast<std::uintptr_t>(&state);
|
|
1169
|
-
attach(task, storage);
|
|
1170
|
-
state.initialize(waitMode, std::move(task));
|
|
1171
|
-
assert(!(address & 0b1));
|
|
1172
|
-
|
|
1173
|
-
// set the locked bit in the address we will be persisting in the mutex
|
|
1174
|
-
address |= kLocked;
|
|
1175
|
-
|
|
1176
|
-
// attempt to acquire the mutex, mutex acquisition is successful if the
|
|
1177
|
-
// previous value is zeroed out
|
|
1178
|
-
//
|
|
1179
|
-
// we use memory_order_acq_rel here because we want the read-modify-write
|
|
1180
|
-
// operation to be both acquire and release. Acquire becasue if this is a
|
|
1181
|
-
// successful lock acquisition, we want to acquire state any other thread
|
|
1182
|
-
// has released from a prior unlock. We want release semantics becasue
|
|
1183
|
-
// other threads that read the address of this value should see the full
|
|
1184
|
-
// well-initialized node we are going to wait on if the mutex acquisition
|
|
1185
|
-
// was unsuccessful
|
|
1186
|
-
previous = atomic.exchange(address, std::memory_order_acq_rel);
|
|
1187
|
-
recordTimedWaiterAndClearTimedBit(timedWaiter, previous);
|
|
1188
|
-
state.next_.store(previous, std::memory_order_relaxed);
|
|
1189
|
-
if (previous == kUnlocked) {
|
|
1190
|
-
return {/* next */ nullptr,
|
|
1191
|
-
/* expected */ address,
|
|
1192
|
-
/* timedWaiter */ timedWaiter,
|
|
1193
|
-
/* combined */ false,
|
|
1194
|
-
/* waker */ 0,
|
|
1195
|
-
/* waiters */ nullptr,
|
|
1196
|
-
/* ready */ nextSleeper};
|
|
1197
|
-
}
|
|
1198
|
-
assert(previous & kLocked);
|
|
1199
|
-
|
|
1200
|
-
// wait until we get a signal from another thread, if this returns false,
|
|
1201
|
-
// we got skipped and had probably been scheduled out, so try again
|
|
1202
|
-
auto signal = kUninitialized;
|
|
1203
|
-
if (!wait(&state, waitMode, nextSleeper, signal)) {
|
|
1204
|
-
std::swap(waitMode, nextWaitMode);
|
|
1205
|
-
continue;
|
|
1206
|
-
}
|
|
1207
|
-
|
|
1208
|
-
// at this point it is safe to access the other fields in the waiter state,
|
|
1209
|
-
// since the thread that woke us up is gone and nobody will be touching this
|
|
1210
|
-
// state again, note that this requires memory ordering, and this is why we
|
|
1211
|
-
// use memory_order_acquire (among other reasons) in the above wait
|
|
1212
|
-
//
|
|
1213
|
-
// first we see if the value we took off the mutex state was the thread that
|
|
1214
|
-
// initated the wakeups, if so, we are the terminal node of the current
|
|
1215
|
-
// contention chain. If we are the terminal node, then we should expect to
|
|
1216
|
-
// see a kLocked in the mutex state when we unlock, if we see that, we can
|
|
1217
|
-
// commit the unlock to the centralized mutex state. If not, we need to
|
|
1218
|
-
// continue wakeups
|
|
1219
|
-
//
|
|
1220
|
-
// a nice consequence of passing kLocked as the current address if we are
|
|
1221
|
-
// the terminal node is that it naturally just works with the algorithm. If
|
|
1222
|
-
// we get a contention chain when coming out of a contention chain, the tail
|
|
1223
|
-
// of the new contention chain will have kLocked set as the previous, which,
|
|
1224
|
-
// as it happens "just works", since we have now established a recursive
|
|
1225
|
-
// relationship until broken
|
|
1226
|
-
auto next = previous;
|
|
1227
|
-
auto expected = address;
|
|
1228
|
-
if (previous == state.metadata_.waker_) {
|
|
1229
|
-
next = 0;
|
|
1230
|
-
expected = kLocked;
|
|
1231
|
-
}
|
|
1232
|
-
|
|
1233
|
-
// if we were given a combine signal, detach the return value from the
|
|
1234
|
-
// wait struct into the request, so the current thread can access it
|
|
1235
|
-
// outside this function
|
|
1236
|
-
auto combined = (signal == kCombined);
|
|
1237
|
-
auto exceptionOccurred = (signal == kExceptionOccurred);
|
|
1238
|
-
if (combined || exceptionOccurred) {
|
|
1239
|
-
detach(request, state, exceptionOccurred, storage);
|
|
1240
|
-
}
|
|
1241
|
-
|
|
1242
|
-
// if we are just coming out of a futex call, then it means that the next
|
|
1243
|
-
// waiter we are responsible for is also a waiter waiting on a futex, so
|
|
1244
|
-
// we return that list in the list of ready threads. We wlil be waking up
|
|
1245
|
-
// the ready threads on unlock no matter what
|
|
1246
|
-
return {/* next */ extractPtr<Waiter<Atomic>>(next),
|
|
1247
|
-
/* expected */ expected,
|
|
1248
|
-
/* timedWaiter */ timedWaiter,
|
|
1249
|
-
/* combined */ combineRequested && (combined || exceptionOccurred),
|
|
1250
|
-
/* waker */ state.metadata_.waker_,
|
|
1251
|
-
/* waiters */ extractPtr<Waiter<Atomic>>(state.metadata_.waiters_),
|
|
1252
|
-
/* ready */ nextSleeper};
|
|
1253
|
-
}
|
|
1254
|
-
}
|
|
1255
|
-
|
|
1256
|
-
inline bool preempted(std::uint64_t value, std::chrono::nanoseconds now) {
|
|
1257
|
-
auto currentTime = recover(strip(now));
|
|
1258
|
-
auto nodeTime = recover(value);
|
|
1259
|
-
auto preempted =
|
|
1260
|
-
(currentTime > nodeTime + kScheduledAwaySpinThreshold.count()) &&
|
|
1261
|
-
(nodeTime != recover(strip(std::chrono::nanoseconds::max())));
|
|
1262
|
-
|
|
1263
|
-
// we say that the thread has been preempted if its timestamp says so, and
|
|
1264
|
-
// also if it is neither uninitialized nor skipped
|
|
1265
|
-
assert(value != kSkipped);
|
|
1266
|
-
return (preempted) && (value != kUninitialized) &&
|
|
1267
|
-
(value != kCombineUninitialized);
|
|
1268
|
-
}
|
|
1269
|
-
|
|
1270
|
-
inline bool isSleeper(std::uintptr_t value) {
|
|
1271
|
-
return (value == kAboutToWait);
|
|
1272
|
-
}
|
|
1273
|
-
|
|
1274
|
-
inline bool isInitialized(std::uintptr_t value) {
|
|
1275
|
-
return (value != kUninitialized) && (value != kCombineUninitialized);
|
|
1276
|
-
}
|
|
1277
|
-
|
|
1278
|
-
inline bool isCombiner(std::uintptr_t value) {
|
|
1279
|
-
auto mode = (value & 0xff);
|
|
1280
|
-
return (mode == kCombineWaiting) || (mode == kCombineUninitialized);
|
|
1281
|
-
}
|
|
1282
|
-
|
|
1283
|
-
inline bool isWaitingCombiner(std::uintptr_t value) {
|
|
1284
|
-
return (value & 0xff) == kCombineWaiting;
|
|
1285
|
-
}
|
|
1286
|
-
|
|
1287
|
-
template <typename Waiter>
|
|
1288
|
-
CombineFunction loadTask(Waiter* current, std::uintptr_t value) {
|
|
1289
|
-
// if we know that the waiter is a combiner of some sort, it is safe to read
|
|
1290
|
-
// and copy the value of the function in the waiter struct, since we know
|
|
1291
|
-
// that a waiter would have set it before enqueueing
|
|
1292
|
-
if (isCombiner(value)) {
|
|
1293
|
-
return current->function_;
|
|
1294
|
-
}
|
|
1295
|
-
|
|
1296
|
-
return nullptr;
|
|
1297
|
-
}
|
|
1298
|
-
|
|
1299
|
-
template <typename Waiter>
|
|
1300
|
-
void transferCurrentException(Waiter* waiter) {
|
|
1301
|
-
assert(std::current_exception());
|
|
1302
|
-
new (&waiter->storage_) std::exception_ptr(std::current_exception());
|
|
1303
|
-
waiter->futex_.store(kExceptionOccurred, std::memory_order_release);
|
|
1304
|
-
}
|
|
1305
|
-
|
|
1306
|
-
template <template <typename> class Atomic>
|
|
1307
|
-
inline std::uintptr_t tryCombine(
|
|
1308
|
-
Waiter<Atomic>* waiter,
|
|
1309
|
-
std::uintptr_t value,
|
|
1310
|
-
std::uintptr_t next,
|
|
1311
|
-
std::uint64_t iteration,
|
|
1312
|
-
std::chrono::nanoseconds now,
|
|
1313
|
-
CombineFunction task) {
|
|
1314
|
-
#ifndef ROCKSDB_LITE
|
|
1315
|
-
// if the waiter has asked for a combine operation, we should combine its
|
|
1316
|
-
// critical section and move on to the next waiter
|
|
1317
|
-
//
|
|
1318
|
-
// the waiter is combinable if the following conditions are satisfied
|
|
1319
|
-
//
|
|
1320
|
-
// 1) the state in the futex word is not uninitialized (kUninitialized)
|
|
1321
|
-
// 2) it has a valid combine function
|
|
1322
|
-
// 3) we are not past the limit of the number of combines we can perform
|
|
1323
|
-
// or the waiter thread been preempted. If the waiter gets preempted,
|
|
1324
|
-
// its better to just execute their critical section before moving on.
|
|
1325
|
-
// As they will have to re-queue themselves after preemption anyway,
|
|
1326
|
-
// leading to further delays in critical section completion
|
|
1327
|
-
//
|
|
1328
|
-
// if all the above are satisfied, then we can combine the critical section.
|
|
1329
|
-
// Note that if the waiter is in a combineable state, that means that it had
|
|
1330
|
-
// finished its writes to both the task and the next_ value. And observing
|
|
1331
|
-
// a waiting state also means that we have acquired the writes to the other
|
|
1332
|
-
// members of the waiter struct, so it's fine to use those values here
|
|
1333
|
-
if (isWaitingCombiner(value) &&
|
|
1334
|
-
(iteration <= kMaxCombineIterations || preempted(value, now))) {
|
|
1335
|
-
try {
|
|
1336
|
-
task();
|
|
1337
|
-
waiter->futex_.store(kCombined, std::memory_order_release);
|
|
1338
|
-
} catch (...) {
|
|
1339
|
-
transferCurrentException(waiter);
|
|
1340
|
-
}
|
|
1341
|
-
return next;
|
|
1342
|
-
}
|
|
1343
|
-
#endif // ROCKSDB_LITE
|
|
1344
|
-
return 0;
|
|
1345
|
-
}
|
|
1346
|
-
|
|
1347
|
-
template <typename Waiter>
|
|
1348
|
-
inline std::uintptr_t tryWake(
|
|
1349
|
-
bool publishing,
|
|
1350
|
-
Waiter* waiter,
|
|
1351
|
-
std::uintptr_t value,
|
|
1352
|
-
std::uintptr_t next,
|
|
1353
|
-
std::uintptr_t waker,
|
|
1354
|
-
Waiter*& sleepers,
|
|
1355
|
-
std::uint64_t iteration,
|
|
1356
|
-
CombineFunction task) {
|
|
1357
|
-
// try and combine the waiter's request first, if that succeeds that means
|
|
1358
|
-
// we have successfully executed their critical section and can move on to
|
|
1359
|
-
// the rest of the chain
|
|
1360
|
-
auto now = time();
|
|
1361
|
-
if (tryCombine(waiter, value, next, iteration, now, task)) {
|
|
1362
|
-
return next;
|
|
1363
|
-
}
|
|
1364
|
-
|
|
1365
|
-
// first we see if we can wake the current thread that is spinning
|
|
1366
|
-
if ((!publishing || !preempted(value, now)) && !isSleeper(value)) {
|
|
1367
|
-
// the Metadata class should be trivially destructible as we use placement
|
|
1368
|
-
// new to set the relevant metadata without calling any destructor. We
|
|
1369
|
-
// need to use placement new because the class contains a futex, which is
|
|
1370
|
-
// non-movable and non-copyable
|
|
1371
|
-
using Metadata = _t<std::decay<decltype(waiter->metadata_)>>;
|
|
1372
|
-
static_assert(std::is_trivially_destructible<Metadata>{}, "");
|
|
1373
|
-
|
|
1374
|
-
// we need release here because of the write to waker_ and also because we
|
|
1375
|
-
// are unlocking the mutex, the thread we do the handoff to here should
|
|
1376
|
-
// see the modified data
|
|
1377
|
-
new (&waiter->metadata_) Metadata(waker, bit_cast<uintptr_t>(sleepers));
|
|
1378
|
-
waiter->futex_.store(kWake, std::memory_order_release);
|
|
1379
|
-
return 0;
|
|
1380
|
-
}
|
|
1381
|
-
|
|
1382
|
-
// if the thread is not a sleeper, and we were not able to catch it before
|
|
1383
|
-
// preemption, we can just return a false, it is safe to read next_ because
|
|
1384
|
-
// the thread was preempted. Preemption signals can only come after the
|
|
1385
|
-
// thread has set the next_ pointer, since the timestamp writes only start
|
|
1386
|
-
// occurring after that point
|
|
1387
|
-
//
|
|
1388
|
-
// if a thread was preempted it must have stored next_ in the waiter struct,
|
|
1389
|
-
// as the store to futex_ that resets the value from kUninitialized happens
|
|
1390
|
-
// after the write to next
|
|
1391
|
-
assert(publishing);
|
|
1392
|
-
if (!isSleeper(value)) {
|
|
1393
|
-
// go on to the next one
|
|
1394
|
-
//
|
|
1395
|
-
// Also, we need a memory_order_release here to prevent missed wakeups. A
|
|
1396
|
-
// missed wakeup here can happen when we see that a thread had been
|
|
1397
|
-
// preempted and skip it. Then go on to release the lock, and then when
|
|
1398
|
-
// the thread which got skipped does an exchange on the central storage,
|
|
1399
|
-
// still sees the locked bit, and never gets woken up
|
|
1400
|
-
//
|
|
1401
|
-
// Can we relax this?
|
|
1402
|
-
assert(preempted(value, now));
|
|
1403
|
-
assert(!isCombiner(value));
|
|
1404
|
-
next = waiter->next_.load(std::memory_order_relaxed);
|
|
1405
|
-
waiter->futex_.store(kSkipped, std::memory_order_release);
|
|
1406
|
-
return next;
|
|
1407
|
-
}
|
|
1408
|
-
|
|
1409
|
-
// if we are here the thread is a sleeper
|
|
1410
|
-
//
|
|
1411
|
-
// we attempt to catch the thread before it goes to futex(). If we are able
|
|
1412
|
-
// to catch the thread before it sleeps on a futex, we are done, and don't
|
|
1413
|
-
// need to go any further
|
|
1414
|
-
//
|
|
1415
|
-
// if we are not able to catch the thread before it goes to futex, we
|
|
1416
|
-
// collect the current thread in the list of sleeping threads represented by
|
|
1417
|
-
// sleepers, and return the next thread in the list and return false along
|
|
1418
|
-
// with the previous next value
|
|
1419
|
-
//
|
|
1420
|
-
// it is safe to read the next_ pointer in the waiter struct if we were
|
|
1421
|
-
// unable to catch the thread before it went to futex() because we use
|
|
1422
|
-
// acquire-release ordering for the exchange operation below. And if we see
|
|
1423
|
-
// that the thread was already sleeping, we have synchronized with the write
|
|
1424
|
-
// to next_ in the context of the sleeping thread
|
|
1425
|
-
//
|
|
1426
|
-
// Also we need to set the value of waiters_ and waker_ in the thread before
|
|
1427
|
-
// doing the exchange because we need to pass on the list of sleepers in the
|
|
1428
|
-
// event that we were able to catch the thread before it went to futex().
|
|
1429
|
-
// If we were unable to catch the thread before it slept, these fields will
|
|
1430
|
-
// be ignored when the thread wakes up anyway
|
|
1431
|
-
assert(isSleeper(value));
|
|
1432
|
-
waiter->metadata_.waker_ = waker;
|
|
1433
|
-
waiter->metadata_.waiters_ = folly::bit_cast<std::uintptr_t>(sleepers);
|
|
1434
|
-
auto pre =
|
|
1435
|
-
waiter->metadata_.sleeper_.exchange(kSleeping, std::memory_order_acq_rel);
|
|
1436
|
-
|
|
1437
|
-
// we were able to catch the thread before it went to sleep, return true
|
|
1438
|
-
if (pre != kSleeping) {
|
|
1439
|
-
return 0;
|
|
1440
|
-
}
|
|
1441
|
-
|
|
1442
|
-
// otherwise return false, with the value of next_, it is safe to read next
|
|
1443
|
-
// because of the same logic as when a thread was preempted
|
|
1444
|
-
//
|
|
1445
|
-
// we also need to collect this sleeper in the list of sleepers being built
|
|
1446
|
-
// up
|
|
1447
|
-
next = waiter->next_.load(std::memory_order_relaxed);
|
|
1448
|
-
auto head = folly::bit_cast<std::uintptr_t>(sleepers);
|
|
1449
|
-
waiter->next_.store(head, std::memory_order_relaxed);
|
|
1450
|
-
sleepers = waiter;
|
|
1451
|
-
return next;
|
|
1452
|
-
}
|
|
1453
|
-
|
|
1454
|
-
template <typename Waiter>
|
|
1455
|
-
bool wake(
|
|
1456
|
-
bool publishing,
|
|
1457
|
-
Waiter& waiter,
|
|
1458
|
-
std::uintptr_t waker,
|
|
1459
|
-
Waiter*& sleepers,
|
|
1460
|
-
std::uint64_t iter) {
|
|
1461
|
-
// loop till we find a node that is either at the end of the list (as
|
|
1462
|
-
// specified by waker) or we find a node that is active (as specified by
|
|
1463
|
-
// the last published timestamp of the node)
|
|
1464
|
-
auto current = &waiter;
|
|
1465
|
-
while (current) {
|
|
1466
|
-
// it is important that we load the value of function and next_ after the
|
|
1467
|
-
// initial acquire load. This is required because we need to synchronize
|
|
1468
|
-
// with the construction of the waiter struct before reading from it
|
|
1469
|
-
//
|
|
1470
|
-
// the load from the next_ variable is an optimistic load that assumes
|
|
1471
|
-
// that the waiting thread has probably gone to the waiting state. If the
|
|
1472
|
-
// waiitng thread is in the waiting state (as revealed by the acquire load
|
|
1473
|
-
// from the futex word), we will see a well formed next_ value because it
|
|
1474
|
-
// happens-before the release store to the futex word. The atomic load from
|
|
1475
|
-
// next_ is an optimization to avoid branching before loading and prevent
|
|
1476
|
-
// the compiler from eliding the load altogether (and using a pointer
|
|
1477
|
-
// dereference when needed)
|
|
1478
|
-
auto value = current->futex_.load(std::memory_order_acquire);
|
|
1479
|
-
auto next = current->next_.load(std::memory_order_relaxed);
|
|
1480
|
-
auto task = loadTask(current, value);
|
|
1481
|
-
next =
|
|
1482
|
-
tryWake(publishing, current, value, next, waker, sleepers, iter, task);
|
|
1483
|
-
|
|
1484
|
-
// if there is no next node, we have managed to wake someone up and have
|
|
1485
|
-
// successfully migrated the lock to another thread
|
|
1486
|
-
if (!next) {
|
|
1487
|
-
return true;
|
|
1488
|
-
}
|
|
1489
|
-
|
|
1490
|
-
// we need to read the value of the next node in the list before skipping
|
|
1491
|
-
// it, this is because after we skip it the node might wake up and enqueue
|
|
1492
|
-
// itself, and thereby gain a new next node
|
|
1493
|
-
assert(publishing);
|
|
1494
|
-
current = (next == waker) ? nullptr : extractPtr<Waiter>(next);
|
|
1495
|
-
}
|
|
1496
|
-
|
|
1497
|
-
return false;
|
|
1498
|
-
}
|
|
1499
|
-
|
|
1500
|
-
template <typename Atomic, typename Proxy, typename Sleepers>
|
|
1501
|
-
bool tryUnlockClean(Atomic& state, Proxy& proxy, Sleepers sleepers) {
|
|
1502
|
-
auto expected = proxy.expected_;
|
|
1503
|
-
while (true) {
|
|
1504
|
-
if (state.compare_exchange_strong(
|
|
1505
|
-
expected,
|
|
1506
|
-
kUnlocked,
|
|
1507
|
-
std::memory_order_release,
|
|
1508
|
-
std::memory_order_relaxed)) {
|
|
1509
|
-
// if we were able to commit an unlocked, we need to wake up the futex
|
|
1510
|
-
// waiters, if any
|
|
1511
|
-
doFutexWake(sleepers);
|
|
1512
|
-
return true;
|
|
1513
|
-
}
|
|
1514
|
-
|
|
1515
|
-
// if we failed the compare_exchange_strong() above, we check to see if
|
|
1516
|
-
// the failure was because of the presence of a timed waiter. If that
|
|
1517
|
-
// was the case then we try one more time with the kTimedWaiter bit set
|
|
1518
|
-
if (expected == (proxy.expected_ | kTimedWaiter)) {
|
|
1519
|
-
proxy.timedWaiters_ = true;
|
|
1520
|
-
continue;
|
|
1521
|
-
}
|
|
1522
|
-
|
|
1523
|
-
// otherwise break, we have a contention chain
|
|
1524
|
-
return false;
|
|
1525
|
-
}
|
|
1526
|
-
}
|
|
1527
|
-
|
|
1528
|
-
template <template <typename> class Atomic, bool Publish>
|
|
1529
|
-
void DistributedMutex<Atomic, Publish>::unlock(
|
|
1530
|
-
typename DistributedMutex::DistributedMutexStateProxy proxy) {
|
|
1531
|
-
// we always wake up ready threads and timed waiters if we saw either
|
|
1532
|
-
assert(proxy);
|
|
1533
|
-
assert(!proxy.combined_);
|
|
1534
|
-
SCOPE_EXIT {
|
|
1535
|
-
doFutexWake(proxy.ready_);
|
|
1536
|
-
wakeTimedWaiters(&state_, proxy.timedWaiters_);
|
|
1537
|
-
};
|
|
1538
|
-
|
|
1539
|
-
// if there is a wait queue we are responsible for, try and start wakeups,
|
|
1540
|
-
// don't bother with the mutex state
|
|
1541
|
-
auto sleepers = proxy.waiters_;
|
|
1542
|
-
if (proxy.next_) {
|
|
1543
|
-
if (wake(Publish, *proxy.next_, proxy.waker_, sleepers, 0)) {
|
|
1544
|
-
return;
|
|
1545
|
-
}
|
|
1546
|
-
|
|
1547
|
-
// At this point, if are in the if statement, we were not the terminal
|
|
1548
|
-
// node of the wakeup chain. Terminal nodes have the next_ pointer set to
|
|
1549
|
-
// null in lock()
|
|
1550
|
-
//
|
|
1551
|
-
// So we need to pretend we were the end of the contention chain. Coming
|
|
1552
|
-
// out of a contention chain always has the kLocked state set in the
|
|
1553
|
-
// mutex. Unless there is another contention chain lined up, which does
|
|
1554
|
-
// not matter since we are the terminal node anyway
|
|
1555
|
-
proxy.expected_ = kLocked;
|
|
1556
|
-
}
|
|
1557
|
-
|
|
1558
|
-
for (std::uint64_t i = 0; true; ++i) {
|
|
1559
|
-
// otherwise, since we don't have anyone we need to wake up, we try and
|
|
1560
|
-
// release the mutex just as is
|
|
1561
|
-
//
|
|
1562
|
-
// if this is successful, we can return, the unlock was successful, we have
|
|
1563
|
-
// committed a nice kUnlocked to the central storage, yay
|
|
1564
|
-
if (tryUnlockClean(state_, proxy, sleepers)) {
|
|
1565
|
-
return;
|
|
1566
|
-
}
|
|
1567
|
-
|
|
1568
|
-
// here we have a contention chain built up on the mutex. We grab the
|
|
1569
|
-
// wait queue and start executing wakeups. We leave a locked bit on the
|
|
1570
|
-
// centralized storage and handoff control to the head of the queue
|
|
1571
|
-
//
|
|
1572
|
-
// we use memory_order_acq_rel here because we want to see the
|
|
1573
|
-
// full well-initialized node that the other thread is waiting on
|
|
1574
|
-
//
|
|
1575
|
-
// If we are unable to wake the contention chain, it is possible that when
|
|
1576
|
-
// we come back to looping here, a new contention chain will form. In
|
|
1577
|
-
// that case we need to use kLocked as the waker_ value because the
|
|
1578
|
-
// terminal node of the new chain will see kLocked in the central storage
|
|
1579
|
-
auto head = state_.exchange(kLocked, std::memory_order_acq_rel);
|
|
1580
|
-
recordTimedWaiterAndClearTimedBit(proxy.timedWaiters_, head);
|
|
1581
|
-
auto next = extractPtr<Waiter<Atomic>>(head);
|
|
1582
|
-
auto expected = folly::exchange(proxy.expected_, kLocked);
|
|
1583
|
-
assert((head & kLocked) && (head != kLocked));
|
|
1584
|
-
if (wake(Publish, *next, expected, sleepers, i)) {
|
|
1585
|
-
break;
|
|
1586
|
-
}
|
|
1587
|
-
}
|
|
1588
|
-
}
|
|
1589
|
-
|
|
1590
|
-
template <typename Atomic, typename Deadline, typename MakeProxy>
|
|
1591
|
-
auto timedLock(Atomic& state, Deadline deadline, MakeProxy proxy)
|
|
1592
|
-
-> decltype(std::declval<MakeProxy&>()(nullptr, kLocked, true)) {
|
|
1593
|
-
while (true) {
|
|
1594
|
-
// we put a bit on the central state to show that there is a timed waiter
|
|
1595
|
-
// and go to sleep on the central state
|
|
1596
|
-
//
|
|
1597
|
-
// when this thread goes to unlock the mutex, it will expect a 0b1 in the
|
|
1598
|
-
// mutex state (0b1, not 0b11), but then it will see that the value in the
|
|
1599
|
-
// mutex state is 0b11 and not 0b1, meaning that there might have been
|
|
1600
|
-
// another timed waiter. Even though there might not have been another
|
|
1601
|
-
// timed waiter in the time being. This sort of missed wakeup is
|
|
1602
|
-
// desirable for timed waiters; it helps avoid thundering herds of timed
|
|
1603
|
-
// waiters. Because the mutex is packed in 8 bytes, and we need an
|
|
1604
|
-
// address to be stored in those 8 bytes, we don't have much room to play
|
|
1605
|
-
// with. The only other solution is to issue a futexWake(INT_MAX) to wake
|
|
1606
|
-
// up all waiters when a clean unlock is committed, when a thread saw a
|
|
1607
|
-
// timed waiter in the mutex previously.
|
|
1608
|
-
//
|
|
1609
|
-
// putting a 0b11 here works for a set of reasons that is a superset of
|
|
1610
|
-
// the set of reasons that make it okay to put a kLocked (0b1) in the
|
|
1611
|
-
// mutex state. Now that the thread has put (kTimedWaiter | kLocked)
|
|
1612
|
-
// (0b11) in the mutex state and it expects a kLocked (0b1), there are two
|
|
1613
|
-
// scenarios possible. The first being when there is no contention chain
|
|
1614
|
-
// formation in the mutex from the time a timed waiter got a lock to
|
|
1615
|
-
// unlock. In this case, the unlocker sees a 0b11 in the mutex state,
|
|
1616
|
-
// adjusts to the presence of a timed waiter and cleanly unlocks with a
|
|
1617
|
-
// kUnlocked (0b0). The second is when there is a contention chain.
|
|
1618
|
-
// When a thread puts its address in the mutex and sees the timed bit, it
|
|
1619
|
-
// records the presence of a timed waiter, and then pretends as if it
|
|
1620
|
-
// hadn't seen the timed bit. So future contention chain releases, will
|
|
1621
|
-
// terminate with a kLocked (0b1) and not a (kLocked | kTimedWaiter)
|
|
1622
|
-
// (0b11). This just works naturally with the rest of the algorithm
|
|
1623
|
-
// without incurring a perf hit for the regular non-timed case
|
|
1624
|
-
//
|
|
1625
|
-
// this strategy does however mean, that when threads try to acquire the
|
|
1626
|
-
// mutex and all time out, there will be a wasteful syscall to issue wakeups
|
|
1627
|
-
// to waiting threads. We don't do anything to try and minimize this
|
|
1628
|
-
//
|
|
1629
|
-
// we need to use a fetch_or() here because we need to convey two bits of
|
|
1630
|
-
// information - 1, whether the mutex is locked or not, and 2, whether
|
|
1631
|
-
// there is a timed waiter. The alternative here is to use the second bit
|
|
1632
|
-
// to convey information only, we can use a fetch_set() on the second bit
|
|
1633
|
-
// to make this faster, but that comes at the expense of requiring regular
|
|
1634
|
-
// fast path lock attempts. Which use a single bit read-modify-write for
|
|
1635
|
-
// better performance
|
|
1636
|
-
auto data = kTimedWaiter | kLocked;
|
|
1637
|
-
auto previous = state.fetch_or(data, std::memory_order_acquire);
|
|
1638
|
-
if (!(previous & 0b1)) {
|
|
1639
|
-
assert(!previous);
|
|
1640
|
-
return proxy(nullptr, kLocked, true);
|
|
1641
|
-
}
|
|
1642
|
-
|
|
1643
|
-
// wait on the futex until signalled, if we get a timeout, the try_lock
|
|
1644
|
-
// fails
|
|
1645
|
-
auto result = atomic_wait_until(&state, previous | data, deadline);
|
|
1646
|
-
if (result == std::cv_status::timeout) {
|
|
1647
|
-
return proxy(nullptr, std::uintptr_t{0}, false);
|
|
1648
|
-
}
|
|
1649
|
-
}
|
|
1650
|
-
}
|
|
1651
|
-
|
|
1652
|
-
template <template <typename> class Atomic, bool TimePublishing>
|
|
1653
|
-
template <typename Clock, typename Duration>
|
|
1654
|
-
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
|
|
1655
|
-
DistributedMutex<Atomic, TimePublishing>::try_lock_until(
|
|
1656
|
-
const std::chrono::time_point<Clock, Duration>& deadline) {
|
|
1657
|
-
// fast path for the uncontended case
|
|
1658
|
-
//
|
|
1659
|
-
// we get the time after trying to acquire the mutex because in the
|
|
1660
|
-
// uncontended case, the price of getting the time is about 1/3 of the
|
|
1661
|
-
// actual mutex acquisition. So we only pay the price of that extra bit of
|
|
1662
|
-
// latency when needed
|
|
1663
|
-
//
|
|
1664
|
-
// this is even higher when VDSO is involved on architectures that do not
|
|
1665
|
-
// offer a direct interface to the timestamp counter
|
|
1666
|
-
if (auto state = try_lock()) {
|
|
1667
|
-
return state;
|
|
1668
|
-
}
|
|
1669
|
-
|
|
1670
|
-
// fall back to the timed locking algorithm
|
|
1671
|
-
using Proxy = DistributedMutexStateProxy;
|
|
1672
|
-
return timedLock(
|
|
1673
|
-
state_,
|
|
1674
|
-
deadline,
|
|
1675
|
-
[](Waiter<Atomic>* next, std::uintptr_t expected, bool timedWaiter) {
|
|
1676
|
-
return Proxy{next, expected, timedWaiter};
|
|
1677
|
-
});
|
|
1678
|
-
}
|
|
1679
|
-
|
|
1680
|
-
template <template <typename> class Atomic, bool TimePublishing>
|
|
1681
|
-
template <typename Rep, typename Period>
|
|
1682
|
-
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
|
|
1683
|
-
DistributedMutex<Atomic, TimePublishing>::try_lock_for(
|
|
1684
|
-
const std::chrono::duration<Rep, Period>& duration) {
|
|
1685
|
-
// fast path for the uncontended case. Reasoning for doing this here is the
|
|
1686
|
-
// same as in try_lock_until()
|
|
1687
|
-
if (auto state = try_lock()) {
|
|
1688
|
-
return state;
|
|
1689
|
-
}
|
|
1690
|
-
|
|
1691
|
-
// fall back to the timed locking algorithm
|
|
1692
|
-
using Proxy = DistributedMutexStateProxy;
|
|
1693
|
-
auto deadline = std::chrono::steady_clock::now() + duration;
|
|
1694
|
-
return timedLock(
|
|
1695
|
-
state_,
|
|
1696
|
-
deadline,
|
|
1697
|
-
[](Waiter<Atomic>* next, std::uintptr_t expected, bool timedWaiter) {
|
|
1698
|
-
return Proxy{next, expected, timedWaiter};
|
|
1699
|
-
});
|
|
1700
|
-
}
|
|
1701
|
-
} // namespace distributed_mutex
|
|
1702
|
-
} // namespace detail
|
|
1703
|
-
} // namespace folly
|