@nxtedition/rocksdb 7.1.14 → 7.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +1 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +72 -18
- package/deps/rocksdb/rocksdb/Makefile +91 -11
- package/deps/rocksdb/rocksdb/TARGETS +8 -4
- package/deps/rocksdb/rocksdb/cache/cache.cc +5 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +13 -8
- package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +116 -57
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +958 -459
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +407 -622
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +104 -40
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +23 -8
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +350 -184
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +12 -2
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +2 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +130 -43
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +24 -2
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +423 -98
- package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +19 -2
- package/deps/rocksdb/rocksdb/cache/sharded_cache.h +10 -7
- package/deps/rocksdb/rocksdb/crash_test.mk +2 -2
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +46 -26
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +9 -3
- package/deps/rocksdb/rocksdb/db/blob/blob_contents.cc +90 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +56 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -10
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +64 -59
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +11 -8
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +92 -62
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +159 -136
- package/deps/rocksdb/rocksdb/db/blob/blob_source.h +13 -13
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +129 -57
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +81 -3
- package/deps/rocksdb/rocksdb/db/c.cc +29 -0
- package/deps/rocksdb/rocksdb/db/column_family.cc +10 -1
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +21 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +42 -36
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +344 -102
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +163 -28
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +52 -17
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +35 -30
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +167 -11
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +8 -8
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +10 -13
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +0 -117
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +6 -49
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +29 -4
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +18 -11
- package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +4 -10
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +12 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +144 -93
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +28 -32
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -9
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +2 -33
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +3 -5
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +11 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +8 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +2 -1
- package/deps/rocksdb/rocksdb/db/db_iter.cc +76 -138
- package/deps/rocksdb/rocksdb/db/db_iter.h +26 -23
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +931 -0
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +2 -2
- package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -0
- package/deps/rocksdb/rocksdb/db/db_test2.cc +44 -22
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +6 -14
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +155 -0
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +45 -0
- package/deps/rocksdb/rocksdb/db/dbformat.h +2 -1
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +8 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +5 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +24 -12
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +7 -1
- package/deps/rocksdb/rocksdb/db/internal_stats.h +3 -0
- package/deps/rocksdb/rocksdb/db/memtable.cc +79 -18
- package/deps/rocksdb/rocksdb/db/memtable.h +5 -0
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +26 -4
- package/deps/rocksdb/rocksdb/db/memtable_list.h +2 -1
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +113 -0
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.h +110 -0
- package/deps/rocksdb/rocksdb/db/{periodic_work_scheduler_test.cc → periodic_task_scheduler_test.cc} +33 -39
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +12 -20
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +6 -5
- package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +12 -8
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc +20 -5
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +14 -0
- package/deps/rocksdb/rocksdb/db/repair.cc +17 -8
- package/deps/rocksdb/rocksdb/db/repair_test.cc +2 -1
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +49 -66
- package/deps/rocksdb/rocksdb/db/table_cache.cc +92 -63
- package/deps/rocksdb/rocksdb/db/table_cache.h +16 -9
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -2
- package/deps/rocksdb/rocksdb/db/table_properties_collector.cc +2 -2
- package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -3
- package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/version_builder.cc +1 -1
- package/deps/rocksdb/rocksdb/db/version_edit.h +1 -2
- package/deps/rocksdb/rocksdb/db/version_set.cc +379 -145
- package/deps/rocksdb/rocksdb/db/version_set.h +26 -24
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +9 -9
- package/deps/rocksdb/rocksdb/db/version_util.h +3 -2
- package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +10 -2
- package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +2 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +5 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +5 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress.cc +2 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +71 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +14 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +23 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +26 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +105 -34
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +16 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +6 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +4 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +282 -25
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +6 -4
- package/deps/rocksdb/rocksdb/env/io_posix.cc +3 -1
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +367 -177
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +144 -56
- package/deps/rocksdb/rocksdb/file/filename.cc +3 -3
- package/deps/rocksdb/rocksdb/file/filename.h +4 -2
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +415 -0
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +2 -0
- package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +36 -45
- package/deps/rocksdb/rocksdb/file/writable_file_writer.h +21 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +11 -11
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +15 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +163 -68
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +26 -12
- package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +23 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +21 -17
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +17 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/persistent_cache.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +17 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +20 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/option_change_migration.h +4 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +2 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch_base.h +2 -1
- package/deps/rocksdb/rocksdb/logging/env_logger.h +2 -2
- package/deps/rocksdb/rocksdb/monitoring/histogram.cc +4 -2
- package/deps/rocksdb/rocksdb/monitoring/histogram.h +2 -0
- package/deps/rocksdb/rocksdb/monitoring/histogram_test.cc +15 -1
- package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.cc +17 -0
- package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +14 -3
- package/deps/rocksdb/rocksdb/monitoring/iostats_context_imp.h +3 -0
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +50 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +1 -0
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +31 -32
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -1
- package/deps/rocksdb/rocksdb/options/options.cc +2 -2
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -1
- package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +1 -0
- package/deps/rocksdb/rocksdb/src.mk +4 -2
- package/deps/rocksdb/rocksdb/table/block_based/block.h +9 -8
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +110 -99
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +12 -10
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +11 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +138 -83
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +25 -24
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +31 -30
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +16 -13
- package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +4 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +3 -3
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +17 -19
- package/deps/rocksdb/rocksdb/table/block_fetcher.h +1 -1
- package/deps/rocksdb/rocksdb/table/format.cc +26 -29
- package/deps/rocksdb/rocksdb/table/format.h +44 -26
- package/deps/rocksdb/rocksdb/table/get_context.cc +17 -12
- package/deps/rocksdb/rocksdb/table/internal_iterator.h +7 -0
- package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +4 -0
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +950 -104
- package/deps/rocksdb/rocksdb/table/merging_iterator.h +28 -1
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +3 -2
- package/deps/rocksdb/rocksdb/table/meta_blocks.h +1 -1
- package/deps/rocksdb/rocksdb/table/persistent_cache_helper.cc +10 -9
- package/deps/rocksdb/rocksdb/table/persistent_cache_helper.h +22 -20
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +1 -1
- package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +1 -1
- package/deps/rocksdb/rocksdb/table/table_builder.h +9 -21
- package/deps/rocksdb/rocksdb/table/table_test.cc +12 -12
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py +4 -4
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +1 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +116 -34
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +6 -1
- package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +1 -1
- package/deps/rocksdb/rocksdb/util/autovector.h +12 -0
- package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +3 -2
- package/deps/rocksdb/rocksdb/util/stderr_logger.cc +30 -0
- package/deps/rocksdb/rocksdb/util/stderr_logger.h +5 -18
- package/deps/rocksdb/rocksdb/util/timer.h +2 -3
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +9 -2
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +34 -53
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +9 -14
- package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -4
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +4 -0
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +4 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +3 -1
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +26 -8
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +114 -16
- package/deps/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +59 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +3 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +39 -0
- package/deps/rocksdb/rocksdb.gyp +0 -1
- package/index.js +6 -10
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +0 -168
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +0 -90
package/deps/rocksdb/rocksdb/cache/clock_cache.h

@@ -9,10 +9,9 @@

 #pragma once

-#include <sys/types.h>
-
 #include <array>
 #include <atomic>
+#include <cstddef>
 #include <cstdint>
 #include <memory>
 #include <string>
@@ -28,145 +27,267 @@

 namespace ROCKSDB_NAMESPACE {

-namespace
+namespace hyper_clock_cache {

 // Forward declaration of friend class.
 class ClockCacheTest;

-//
-//
-
-//
-//
+// HyperClockCache is an experimental alternative to LRUCache.
+//
+// Benefits
+// --------
+// * Fully lock free (no waits or spins) for efficiency under high concurrency
+// * Optimized for hot path reads. For concurrency control, most Lookup() and
+// essentially all Release() are a single atomic add operation.
+// * Eviction on insertion is fully parallel and lock-free.
+// * Uses a generalized + aging variant of CLOCK eviction that might outperform
+// LRU in some cases. (For background, see
+// https://en.wikipedia.org/wiki/Page_replacement_algorithm)
+//
+// Costs
+// -----
+// * Hash table is not resizable (for lock-free efficiency) so capacity is not
+// dynamically changeable. Rely on an estimated average value (block) size for
+// space+time efficiency. (See estimated_entry_charge option details.)
+// * Insert usually does not (but might) overwrite a previous entry associated
+// with a cache key. This is OK for RocksDB uses of Cache.
+// * Only supports keys of exactly 16 bytes, which is what RocksDB uses for
+// block cache (not row cache or table cache).
+// * SecondaryCache is not supported.
+// * Cache priorities are less aggressively enforced. Unlike LRUCache, enough
+// transient LOW or BOTTOM priority items can evict HIGH priority entries that
+// are not referenced recently (or often) enough.
+// * If pinned entries leave little or nothing eligible for eviction,
+// performance can degrade substantially, because of clock eviction eating
+// CPU looking for evictable entries and because Release does not
+// pro-actively delete unreferenced entries when the cache is over-full.
+// Specifically, this makes this implementation more susceptible to the
+// following combination:
+// * num_shard_bits is high (e.g. 6)
+// * capacity small (e.g. some MBs)
+// * some large individual entries (e.g. non-partitioned filters)
+// where individual entries occupy a large portion of their shard capacity.
+// This should be mostly mitigated by the implementation picking a lower
+// number of cache shards than LRUCache for a given capacity (when
+// num_shard_bits is not overridden; see calls to GetDefaultCacheShardBits()).
+// * With strict_capacity_limit=false, respecting the capacity limit is not as
+// aggressive as LRUCache. The limit might be transiently exceeded by a very
+// small number of entries even when not strictly necessary, and slower to
+// recover after pinning forces limit to be substantially exceeded. (Even with
+// strict_capacity_limit=true, RocksDB will nevertheless transiently allocate
+// memory before discovering it is over the block cache capacity, so this
+// should not be a detectable regression in respecting memory limits, except
+// on exceptionally small caches.)
+// * In some cases, erased or duplicated entries might not be freed
+// immediately. They will eventually be freed by eviction from further Inserts.
+// * Internal metadata can overflow if the number of simultaneous references
+// to a cache handle reaches many millions.
+//
+// High-level eviction algorithm
+// -----------------------------
+// A score (or "countdown") is maintained for each entry, initially determined
+// by priority. The score is incremented on each Lookup, up to a max of 3,
+// though is easily returned to previous state if useful=false with Release.
+// During CLOCK-style eviction iteration, entries with score > 0 are
+// decremented if currently unreferenced and entries with score == 0 are
+// evicted if currently unreferenced. Note that scoring might not be perfect
+// because entries can be referenced transiently within the cache even when
+// there are no outside references to the entry.
+//
+// Cache sharding like LRUCache is used to reduce contention on usage+eviction
+// state, though here the performance improvement from more shards is small,
+// and (as noted above) potentially detrimental if shard capacity is too close
+// to largest entry size. Here cache sharding mostly only affects cache update
+// (Insert / Erase) performance, not read performance.
+//
+// Read efficiency (hot path)
+// --------------------------
+// Mostly to minimize the cost of accessing metadata blocks with
+// cache_index_and_filter_blocks=true, we focus on optimizing Lookup and
+// Release. In terms of concurrency, at a minimum, these operations have
+// to do reference counting (and Lookup has to compare full keys in a safe
+// way). Can we fold in all the other metadata tracking *for free* with
+// Lookup and Release doing a simple atomic fetch_add/fetch_sub? (Assume
+// for the moment that Lookup succeeds on the first probe.)
+//
+// We have a clever way of encoding an entry's reference count and countdown
+// clock so that Lookup and Release are each usually a single atomic addition.
+// In a single metadata word we have both an "acquire" count, incremented by
+// Lookup, and a "release" count, incremented by Release. If useful=false,
+// Release can instead decrement the acquire count. Thus the current ref
+// count is (acquires - releases), and the countdown clock is min(3, acquires).
+// Note that only unreferenced entries (acquires == releases) are eligible
+// for CLOCK manipulation and eviction. We tolerate use of more expensive
+// compare_exchange operations for cache writes (insertions and erasures).
+//
+// In a cache receiving many reads and little or no writes, it is possible
+// for the acquire and release counters to overflow. Assuming the *current*
+// refcount never reaches to many millions, we only have to correct for
+// overflow in both counters in Release, not in Lookup. The overflow check
+// should be only 1-2 CPU cycles per Release because it is a predictable
+// branch on a simple condition on data already in registers.
+//
+// Slot states
+// -----------
+// We encode a state indicator into the same metadata word with the
+// acquire and release counters. This allows bigger state transitions to
+// be atomic. States:
 //
-//
-//
-//
-//
-//
-//
-//
-//
+// * Empty - slot is not in use and unowned. All other metadata and data is
+// in an undefined state.
+// * Construction - slot is exclusively owned by one thread, the thread
+// successfully entering this state, for populating or freeing data.
+// * Shareable (group) - slot holds an entry with counted references for
+// pinning and reading, including
+// * Visible - slot holds an entry that can be returned by Lookup
+// * Invisible - slot holds an entry that is not visible to Lookup
+// (erased by user) but can be read by existing references, and ref count
+// changed by Ref and Release.
 //
+// A special case is "detached" entries, which are heap-allocated handles
+// not in the table. They are always Invisible and freed on zero refs.
 //
-//
+// State transitions:
+// Empty -> Construction (in Insert): The encoding of state enables Insert to
+// perform an optimistic atomic bitwise-or to take ownership if a slot is
+// empty, or otherwise make no state change.
 //
-//
-//
-//
-// the users. ClockHandles have two members to support external references:
-// - EXTERNAL_REFS counter: The number of external refs. When EXTERNAL_REFS > 0,
-// the handle is externally referenced. Updates that intend to modify the
-// handle will refrain from doing so. Eventually, when all references are
-// released, we have EXTERNAL_REFS == 0, and updates can operate normally on
-// the handle.
-// - WILL_BE_DELETED flag: An handle is marked for deletion when an operation
-// decides the handle should be deleted. This happens either when the last
-// reference to a handle is released (and the release operation is instructed
-// to delete on last reference) or on when a delete operation is called on
-// the item. This flag is needed because an externally referenced handle
-// can't be immediately deleted. In these cases, the flag will be later read
-// and acted upon by the eviction algorithm. Importantly, WILL_BE_DELETED is
-// used not only to defer deletions, but also as a barrier for external
-// references: once WILL_BE_DELETED is set, lookups (which are the most
-// common way to acquire new external references) will ignore the handle.
-// For this reason, when WILL_BE_DELETED is set, we say the handle is
-// invisible (and, otherwise, that it's visible).
+// Construction -> Visible (in Insert): This can be a simple assignment to the
+// metadata word because the current thread has exclusive ownership and other
+// metadata is meaningless.
 //
+// Visible -> Invisible (in Erase): This can be a bitwise-and while holding
+// a shared reference, which is safe because the change is idempotent (in case
+// of parallel Erase). By the way, we never go Invisible->Visible.
 //
-//
+// Shareable -> Construction (in Evict part of Insert, in Erase, and in
+// Release if Invisible): This is for starting to freeing/deleting an
+// unreferenced entry. We have to use compare_exchange to ensure we only make
+// this transition when there are zero refs.
 //
-//
-//
-//
-//
-// (although the code can be easily modified to use other probing schemes, like
-// linear probing).
+// Construction -> Empty (in same places): This is for completing free/delete
+// of an entry. A "release" atomic store suffices, as we have exclusive
+// ownership of the slot but have to ensure none of the data member reads are
+// re-ordered after committing the state transition.
 //
-//
-//
-//
-//
-//
-//
-//
-//
-//
+// Insert
+// ------
+// If Insert were to guarantee replacing an existing entry for a key, there
+// would be complications for concurrency and efficiency. First, consider how
+// many probes to get to an entry. To ensure Lookup never waits and
+// availability of a key is uninterrupted, we would need to use a different
+// slot for a new entry for the same key. This means it is most likely in a
+// later probing position than the old version, which should soon be removed.
+// (Also, an entry is too big to replace atomically, even if no current refs.)
 //
-//
-//
-//
-// empty or a tombstone. As soon as a KV pair is written into the slot, it
-// becomes a visible element. At some point, the handle will be deleted
-// by an explicit delete operation, the eviction algorithm, or an overwriting
-// insert. In either case, the handle is marked for deletion. When the an
-// attempt to delete the element finally succeeds, the slot is freed up
-// and becomes available again.
+// However, overwrite capability is not really needed by RocksDB. Also, we
+// know from our "redundant" stats that overwrites are very rare for the block
+// cache, so we should not spend much to make them effective.
 //
+// So instead we Insert as soon as we find an empty slot in the probing
+// sequence without seeing an existing (visible) entry for the same key. This
+// way we only insert if we can improve the probing performance, and we don't
+// need to probe beyond our insert position, assuming we are willing to let
+// the previous entry for the same key die of old age (eventual eviction from
+// not being used). We can reach a similar state with concurrent insertions,
+// where one will pass over the other while it is "under construction."
+// This temporary duplication is acceptable for RocksDB block cache because
+// we know redundant insertion is rare.
 //
-//
+// Another problem to solve is what to return to the caller when we find an
+// existing entry whose probing position we cannot improve on, or when the
+// table occupancy limit has been reached. If strict_capacity_limit=false,
+// we must never fail Insert, and if a Handle* is provided, we have to return
+// a usable Cache handle on success. The solution to this (typically rare)
+// problem is "detached" handles, which are usable by the caller but not
+// actually available for Lookup in the Cache. Detached handles are allocated
+// independently on the heap and specially marked so that they are freed on
+// the heap when their last reference is released.
 //
-//
-//
-//
-//
-//
-//
-//
+// Usage on capacity
+// -----------------
+// Insert takes different approaches to usage tracking depending on
+// strict_capacity_limit setting. If true, we enforce a kind of strong
+// consistency where compare-exchange is used to ensure the usage number never
+// exceeds its limit, and provide threads with an authoritative signal on how
+// much "usage" they have taken ownership of. With strict_capacity_limit=false,
+// we use a kind of "eventual consistency" where all threads Inserting to the
+// same cache shard might race on reserving the same space, but the
+// over-commitment will be worked out in later insertions. It is kind of a
+// dance because we don't want threads racing each other too much on paying
+// down the over-commitment (with eviction) either.
 //
-//
-//
-//
-//
-//
-//
+// Eviction
+// --------
+// A key part of Insert is evicting some entries currently unreferenced to
+// make room for new entries. The high-level eviction algorithm is described
+// above, but the details are also interesting. A key part is parallelizing
+// eviction with a single CLOCK pointer. This works by each thread working on
+// eviction pre-emptively incrementing the CLOCK pointer, and then CLOCK-
+// updating or evicting the incremented-over slot(s). To reduce contention at
+// the cost of possibly evicting too much, each thread increments the clock
+// pointer by 4, so commits to updating at least 4 slots per batch. As
+// described above, a CLOCK update will decrement the "countdown" of
+// unreferenced entries, or evict unreferenced entries with zero countdown.
+// Referenced entries are not updated, because we (presumably) don't want
+// long-referenced entries to age while referenced. Note however that we
+// cannot distinguish transiently referenced entries from cache user
+// references, so some CLOCK updates might be somewhat arbitrarily skipped.
+// This is OK as long as it is rare enough that eviction order is still
+// pretty good.
 //
-//
-//
-//
-//
-//
-//
-//
-//
-//
-// reference avoids the deadlock, but then the handle may change inbetween.
-// One of the key observations we use in our implementation is that we can
-// make up for this lack of atomicity using IS_ELEMENT and WILL_BE_DELETED.
+// There is no synchronization on the completion of the CLOCK updates, so it
+// is theoretically possible for another thread to cycle back around and have
+// two threads racing on CLOCK updates to the same slot. Thus, we cannot rely
+// on any implied exclusivity to make the updates or eviction more efficient.
+// These updates use an opportunistic compare-exchange (no loop), where a
+// racing thread might cause the update to be skipped without retry, but in
+// such case the update is likely not needed because the most likely update
+// to an entry is that it has become referenced. (TODO: test efficiency of
+// avoiding compare-exchange loop)
 //
-//
-//
-//
-//
-//
-// - We can precisely determine when there are no more external references to a
-// handle, and proceed to mark it for deletion. This is useful when users
-// release external references.
+// Release
+// -------
+// In the common case, Release is a simple atomic increment of the release
+// counter. There is a simple overflow check that only does another atomic
+// update in extremely rare cases, so costs almost nothing.
 //
+// If the Release specifies "not useful", we can instead decrement the
+// acquire counter, which returns to the same CLOCK state as before Lookup
+// or Ref.
 //
-//
+// Adding a check for over-full cache on every release to zero-refs would
+// likely be somewhat expensive, increasing read contention on cache shard
+// metadata. Instead we are less aggressive about deleting entries right
+// away in those cases.
 //
-//
-//
-//
-//
-//
-// the slot
-//
-//
-//
-//
-//
-//
-//
+// However Release tries to immediately delete entries reaching zero refs
+// if (a) erase_if_last_ref is set by the caller, or (b) the entry is already
+// marked invisible. Both of these are checks on values already in CPU
+// registers so do not increase cross-CPU contention when not applicable.
+// When applicable, they use a compare-exchange loop to take exclusive
+// ownership of the slot for freeing the entry. These are rare cases
+// that should not usually affect performance.
+//
+// Erase
+// -----
+// Searches for an entry like Lookup but moves it to Invisible state if found.
+// This state transition is with bit operations so is idempotent and safely
+// done while only holding a shared "read" reference. Like Release, it makes
+// a best effort to immediately release an Invisible entry that reaches zero
+// refs, but there are some corner cases where it will only be freed by the
+// clock eviction process.
+
+// ----------------------------------------------------------------------- //

 // The load factor p is a real number in (0, 1) such that at all
 // times at most a fraction p of all slots, without counting tombstones,
-// are occupied by elements. This means that the probability that a
-//
+// are occupied by elements. This means that the probability that a random
+// probe hits an occupied slot is at most p, and thus at most 1/p probes
 // are required on average. For example, p = 70% implies that between 1 and 2
 // probes are needed on average (bear in mind that this reasoning doesn't
-// consider the effects of clustering over time
+// consider the effects of clustering over time, which should be negligible
+// with double hashing).
 // Because the size of the hash table is always rounded up to the next
 // power of 2, p is really an upper bound on the actual load factor---the
 // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
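
The new comments above center on one trick: the entry's reference count and its CLOCK countdown are packed into a single atomic word, so a Lookup hit and a normal Release are each one atomic add. Below is a minimal standalone sketch of that encoding. It is illustrative only, not code from this package: the bit widths follow the documented layout, but the names are invented here and the state marker, overflow correction, and priorities are omitted.

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <iostream>

// Acquire counter in the low 30 bits, release counter in the next 30 bits.
constexpr uint32_t kCounterNumBits = 30;
constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;
constexpr uint64_t kAcquireIncrement = uint64_t{1};
constexpr uint64_t kReleaseIncrement = uint64_t{1} << kCounterNumBits;
constexpr uint64_t kMaxCountdown = 3;

struct SlotMeta {
  std::atomic<uint64_t> word{0};

  // Lookup hit: one atomic add pins the entry (refs = acquires - releases > 0)
  // and, once it is released as useful, leaves the countdown one higher.
  void LookupHit() { word.fetch_add(kAcquireIncrement, std::memory_order_acquire); }

  // Release: count the release if the entry was useful; otherwise undo the
  // acquire so the countdown returns to what it was before the Lookup.
  void Release(bool useful) {
    if (useful) {
      word.fetch_add(kReleaseIncrement, std::memory_order_release);
    } else {
      word.fetch_sub(kAcquireIncrement, std::memory_order_release);
    }
  }

  uint64_t Acquires() const { return word.load() & kCounterMask; }
  uint64_t Releases() const { return (word.load() >> kCounterNumBits) & kCounterMask; }
  uint64_t RefCount() const { return Acquires() - Releases(); }
  // Countdown clock; only meaningful while unreferenced (acquires == releases).
  uint64_t Countdown() const { return std::min(kMaxCountdown, Acquires()); }
};

int main() {
  SlotMeta m;
  m.LookupHit();
  m.Release(true);   // useful: refs back to 0, countdown now 1
  m.LookupHit();
  m.Release(false);  // not useful: refs back to 0, countdown still 1
  std::cout << "refs=" << m.RefCount() << " countdown=" << m.Countdown() << "\n";
}
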
@@ -174,440 +295,119 @@ class ClockCacheTest;
|
|
|
174
295
|
// Since space cost is dominated by the values (the LSM blocks),
|
|
175
296
|
// overprovisioning the table with metadata only increases the total cache space
|
|
176
297
|
// usage by a tiny fraction.
|
|
177
|
-
constexpr double kLoadFactor = 0.
|
|
298
|
+
constexpr double kLoadFactor = 0.7;
|
|
178
299
|
|
|
179
300
|
// The user can exceed kLoadFactor if the sizes of the inserted values don't
|
|
180
|
-
// match estimated_value_size, or
|
|
181
|
-
// avoid
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
// Maximum number of spins when trying to acquire a ref.
|
|
185
|
-
// TODO(Guido) This value was set arbitrarily. Is it appropriate?
|
|
186
|
-
// What's the best way to bound the spinning?
|
|
187
|
-
constexpr uint32_t kSpinsPerTry = 100000;
|
|
188
|
-
|
|
189
|
-
// Arbitrary seeds.
|
|
190
|
-
constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
|
|
191
|
-
constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
|
|
192
|
-
|
|
193
|
-
struct ClockHandle {
|
|
194
|
-
void* value;
|
|
195
|
-
Cache::DeleterFn deleter;
|
|
196
|
-
uint32_t hash;
|
|
197
|
-
size_t total_charge;
|
|
198
|
-
std::array<char, kCacheKeySize> key_data;
|
|
199
|
-
|
|
200
|
-
static constexpr uint8_t kIsElementOffset = 0;
|
|
201
|
-
static constexpr uint8_t kClockPriorityOffset = 1;
|
|
202
|
-
static constexpr uint8_t kIsHitOffset = 3;
|
|
203
|
-
static constexpr uint8_t kCachePriorityOffset = 4;
|
|
204
|
-
|
|
205
|
-
enum Flags : uint8_t {
|
|
206
|
-
// Whether the slot is in use by an element.
|
|
207
|
-
IS_ELEMENT = 1 << kIsElementOffset,
|
|
208
|
-
// Clock priorities. Represents how close a handle is from being evictable.
|
|
209
|
-
CLOCK_PRIORITY = 3 << kClockPriorityOffset,
|
|
210
|
-
// Whether the handle has been looked up after its insertion.
|
|
211
|
-
HAS_HIT = 1 << kIsHitOffset,
|
|
212
|
-
// The value of Cache::Priority of the handle.
|
|
213
|
-
CACHE_PRIORITY = 1 << kCachePriorityOffset,
|
|
214
|
-
};
|
|
215
|
-
|
|
216
|
-
std::atomic<uint8_t> flags;
|
|
217
|
-
|
|
218
|
-
enum ClockPriority : uint8_t {
|
|
219
|
-
NONE = (0 << kClockPriorityOffset),
|
|
220
|
-
LOW = (1 << kClockPriorityOffset),
|
|
221
|
-
MEDIUM = (2 << kClockPriorityOffset),
|
|
222
|
-
HIGH = (3 << kClockPriorityOffset)
|
|
223
|
-
};
|
|
224
|
-
|
|
225
|
-
// The number of elements that hash to this slot or a lower one, but wind
|
|
226
|
-
// up in this slot or a higher one.
|
|
227
|
-
std::atomic<uint32_t> displacements;
|
|
228
|
-
|
|
229
|
-
static constexpr uint8_t kExternalRefsOffset = 0;
|
|
230
|
-
static constexpr uint8_t kSharedRefsOffset = 15;
|
|
231
|
-
static constexpr uint8_t kExclusiveRefOffset = 30;
|
|
232
|
-
static constexpr uint8_t kWillBeDeletedOffset = 31;
|
|
233
|
-
|
|
234
|
-
enum Refs : uint32_t {
|
|
235
|
-
// Synchronization model:
|
|
236
|
-
// - An external reference guarantees that hash, value, key_data
|
|
237
|
-
// and the IS_ELEMENT flag are not modified. Doesn't allow
|
|
238
|
-
// any writes.
|
|
239
|
-
// - An internal reference has the same guarantees as an
|
|
240
|
-
// external reference, and additionally allows the following
|
|
241
|
-
// idempotent updates on the handle:
|
|
242
|
-
// * set CLOCK_PRIORITY to NONE;
|
|
243
|
-
// * set the HAS_HIT bit;
|
|
244
|
-
// * set the WILL_BE_DELETED bit.
|
|
245
|
-
// - A shared reference is either an external reference or an
|
|
246
|
-
// internal reference.
|
|
247
|
-
// - An exclusive reference guarantees that no other thread has a shared
|
|
248
|
-
// or exclusive reference to the handle, and allows writes
|
|
249
|
-
// on the handle.
|
|
250
|
-
|
|
251
|
-
// Number of external references to the slot.
|
|
252
|
-
EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
|
|
253
|
-
<< kExternalRefsOffset, // Bits 0, ..., 14
|
|
254
|
-
// Number of internal references plus external references to the slot.
|
|
255
|
-
SHARED_REFS = ((uint32_t{1} << 15) - 1)
|
|
256
|
-
<< kSharedRefsOffset, // Bits 15, ..., 29
|
|
257
|
-
// Whether a thread has an exclusive reference to the slot.
|
|
258
|
-
EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30
|
|
259
|
-
// Whether the handle will be deleted soon. When this bit is set, new
|
|
260
|
-
// internal references to this handle stop being accepted.
|
|
261
|
-
// External references may still be granted---they can be created from
|
|
262
|
-
// existing external references, or converting from existing internal
|
|
263
|
-
// references.
|
|
264
|
-
WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31
|
|
265
|
-
|
|
266
|
-
// Having these 4 fields in a single variable allows us to support the
|
|
267
|
-
// following operations efficiently:
|
|
268
|
-
// - Convert an internal reference into an external reference in a single
|
|
269
|
-
// atomic arithmetic operation.
|
|
270
|
-
// - Attempt to take a shared reference using a single atomic arithmetic
|
|
271
|
-
// operation. This is because we can increment the internal ref count
|
|
272
|
-
// as well as checking whether the entry is marked for deletion using a
|
|
273
|
-
// single atomic arithmetic operation (and one non-atomic comparison).
|
|
274
|
-
};
|
|
275
|
-
|
|
276
|
-
static constexpr uint32_t kOneInternalRef = 0x8000;
|
|
277
|
-
static constexpr uint32_t kOneExternalRef = 0x8001;
|
|
278
|
-
|
|
279
|
-
std::atomic<uint32_t> refs;
|
|
301
|
+
// match estimated_value_size, or in some rare cases with
|
|
302
|
+
// strict_capacity_limit == false. To avoid degenerate performance, we set a
|
|
303
|
+
// strict upper bound on the load factor.
|
|
304
|
+
constexpr double kStrictLoadFactor = 0.84;
|
|
280
305
|
|
|
281
|
-
|
|
282
|
-
bool detached;
|
|
283
|
-
|
|
284
|
-
ClockHandle()
|
|
285
|
-
: value(nullptr),
|
|
286
|
-
deleter(nullptr),
|
|
287
|
-
hash(0),
|
|
288
|
-
total_charge(0),
|
|
289
|
-
flags(0),
|
|
290
|
-
displacements(0),
|
|
291
|
-
refs(0),
|
|
292
|
-
detached(false) {
|
|
293
|
-
SetWillBeDeleted(false);
|
|
294
|
-
SetIsElement(false);
|
|
295
|
-
SetClockPriority(ClockPriority::NONE);
|
|
296
|
-
SetCachePriority(Cache::Priority::LOW);
|
|
297
|
-
key_data.fill(0);
|
|
298
|
-
}
|
|
306
|
+
using CacheKeyBytes = std::array<char, kCacheKeySize>;
|
|
299
307
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
ClockHandle(const ClockHandle& other) { *this = other; }
|
|
306
|
-
|
|
307
|
-
void operator=(const ClockHandle& other) {
|
|
308
|
-
value = other.value;
|
|
309
|
-
deleter = other.deleter;
|
|
310
|
-
key_data = other.key_data;
|
|
311
|
-
hash = other.hash;
|
|
312
|
-
total_charge = other.total_charge;
|
|
313
|
-
}
|
|
308
|
+
struct ClockHandleBasicData {
|
|
309
|
+
void* value = nullptr;
|
|
310
|
+
Cache::DeleterFn deleter = nullptr;
|
|
311
|
+
CacheKeyBytes key = {};
|
|
312
|
+
size_t total_charge = 0;
|
|
314
313
|
|
|
315
|
-
Slice
|
|
314
|
+
Slice KeySlice() const { return Slice(key.data(), kCacheKeySize); }
|
|
316
315
|
|
|
317
|
-
void FreeData() {
|
|
316
|
+
void FreeData() const {
|
|
318
317
|
if (deleter) {
|
|
319
|
-
(*deleter)(
|
|
318
|
+
(*deleter)(KeySlice(), value);
|
|
320
319
|
}
|
|
321
320
|
}
|
|
321
|
+
};
|
|
322
|
+
|
|
323
|
+
struct ClockHandleMoreData : public ClockHandleBasicData {
|
|
324
|
+
uint32_t hash = 0;
|
|
325
|
+
};
|
|
326
|
+
|
|
327
|
+
// Target size to be exactly a common cache line size (see static_assert in
|
|
328
|
+
// clock_cache.cc)
|
|
329
|
+
struct ALIGN_AS(64U) ClockHandle : public ClockHandleMoreData {
|
|
330
|
+
// Constants for handling the atomic `meta` word, which tracks most of the
|
|
331
|
+
// state of the handle. The meta word looks like this:
|
|
332
|
+
// low bits high bits
|
|
333
|
+
// -----------------------------------------------------------------------
|
|
334
|
+
// | acquire counter | release counter | state marker |
|
|
335
|
+
// -----------------------------------------------------------------------
|
|
336
|
+
|
|
337
|
+
// For reading or updating counters in meta word.
|
|
338
|
+
static constexpr uint8_t kCounterNumBits = 30;
|
|
339
|
+
static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;
|
|
340
|
+
|
|
341
|
+
static constexpr uint8_t kAcquireCounterShift = 0;
|
|
342
|
+
static constexpr uint64_t kAcquireIncrement = uint64_t{1}
|
|
343
|
+
<< kAcquireCounterShift;
|
|
344
|
+
static constexpr uint8_t kReleaseCounterShift = kCounterNumBits;
|
|
345
|
+
static constexpr uint64_t kReleaseIncrement = uint64_t{1}
|
|
346
|
+
<< kReleaseCounterShift;
|
|
347
|
+
|
|
348
|
+
// For reading or updating the state marker in meta word
|
|
349
|
+
static constexpr uint8_t kStateShift = 2U * kCounterNumBits;
|
|
350
|
+
|
|
351
|
+
// Bits contribution to state marker.
|
|
352
|
+
// Occupied means any state other than empty
|
|
353
|
+
static constexpr uint8_t kStateOccupiedBit = 0b100;
|
|
354
|
+
// Shareable means the entry is reference counted (visible or invisible)
|
|
355
|
+
// (only set if also occupied)
|
|
356
|
+
static constexpr uint8_t kStateShareableBit = 0b010;
|
|
357
|
+
// Visible is only set if also shareable
|
|
358
|
+
static constexpr uint8_t kStateVisibleBit = 0b001;
|
|
359
|
+
|
|
360
|
+
// Complete state markers (not shifted into full word)
|
|
361
|
+
static constexpr uint8_t kStateEmpty = 0b000;
|
|
362
|
+
static constexpr uint8_t kStateConstruction = kStateOccupiedBit;
|
|
363
|
+
static constexpr uint8_t kStateInvisible =
|
|
364
|
+
kStateOccupiedBit | kStateShareableBit;
|
|
365
|
+
static constexpr uint8_t kStateVisible =
|
|
366
|
+
kStateOccupiedBit | kStateShareableBit | kStateVisibleBit;
|
|
367
|
+
|
|
368
|
+
// Constants for initializing the countdown clock. (Countdown clock is only
|
|
369
|
+
// in effect with zero refs, acquire counter == release counter, and in that
|
|
370
|
+
// case the countdown clock == both of those counters.)
|
|
371
|
+
static constexpr uint8_t kHighCountdown = 3;
|
|
372
|
+
static constexpr uint8_t kLowCountdown = 2;
|
|
373
|
+
static constexpr uint8_t kBottomCountdown = 1;
|
|
374
|
+
// During clock update, treat any countdown clock value greater than this
|
|
375
|
+
// value the same as this value.
|
|
376
|
+
static constexpr uint8_t kMaxCountdown = kHighCountdown;
|
|
377
|
+
// TODO: make these coundown values tuning parameters for eviction?
|
|
378
|
+
|
|
379
|
+
// See above
|
|
380
|
+
std::atomic<uint64_t> meta{};
|
|
381
|
+
// The number of elements that hash to this slot or a lower one, but wind
|
|
382
|
+
// up in this slot or a higher one.
|
|
383
|
+
std::atomic<uint32_t> displacements{};
|
|
322
384
|
|
|
323
|
-
//
|
|
324
|
-
|
|
325
|
-
CacheMetadataChargePolicy metadata_charge_policy) const {
|
|
326
|
-
if (metadata_charge_policy != kFullChargeCacheMetadata) {
|
|
327
|
-
return 0;
|
|
328
|
-
} else {
|
|
329
|
-
// #ifdef ROCKSDB_MALLOC_USABLE_SIZE
|
|
330
|
-
// return malloc_usable_size(
|
|
331
|
-
// const_cast<void*>(static_cast<const void*>(this)));
|
|
332
|
-
// #else
|
|
333
|
-
// TODO(Guido) malloc_usable_size only works when we call it on
|
|
334
|
-
// a pointer allocated with malloc. Because our handles are all
|
|
335
|
-
// allocated in a single shot as an array, the user can't call
|
|
336
|
-
// CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
|
|
337
|
-
// pointer returned by the cache. Moreover, malloc_usable_size
|
|
338
|
-
// expects a heap-allocated handle, but sometimes in our code we
|
|
339
|
-
// wish to pass a stack-allocated handle (this is only a performance
|
|
340
|
-
// concern).
|
|
341
|
-
// What is the right way to compute metadata charges with pre-allocated
|
|
342
|
-
// handles?
|
|
343
|
-
return sizeof(ClockHandle);
|
|
344
|
-
// #endif
|
|
345
|
-
}
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
inline void CalcTotalCharge(
|
|
349
|
-
size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
|
|
350
|
-
total_charge = charge + CalcMetaCharge(metadata_charge_policy);
|
|
351
|
-
}
|
|
352
|
-
|
|
353
|
-
inline size_t GetCharge(
|
|
354
|
-
CacheMetadataChargePolicy metadata_charge_policy) const {
|
|
355
|
-
size_t meta_charge = CalcMetaCharge(metadata_charge_policy);
|
|
356
|
-
assert(total_charge >= meta_charge);
|
|
357
|
-
return total_charge - meta_charge;
|
|
358
|
-
}
|
|
359
|
-
|
|
360
|
-
// flags functions.
|
|
361
|
-
|
|
362
|
-
bool IsElement() const { return flags & Flags::IS_ELEMENT; }
|
|
363
|
-
|
|
364
|
-
void SetIsElement(bool is_element) {
|
|
365
|
-
if (is_element) {
|
|
366
|
-
flags |= Flags::IS_ELEMENT;
|
|
367
|
-
} else {
|
|
368
|
-
flags &= static_cast<uint8_t>(~Flags::IS_ELEMENT);
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
bool HasHit() const { return flags & HAS_HIT; }
|
|
373
|
-
|
|
374
|
-
void SetHit() { flags |= HAS_HIT; }
|
|
375
|
-
|
|
376
|
-
Cache::Priority GetCachePriority() const {
|
|
377
|
-
return static_cast<Cache::Priority>(flags & CACHE_PRIORITY);
|
|
378
|
-
}
|
|
379
|
-
|
|
380
|
-
void SetCachePriority(Cache::Priority priority) {
|
|
381
|
-
if (priority == Cache::Priority::HIGH) {
|
|
382
|
-
flags |= Flags::CACHE_PRIORITY;
|
|
383
|
-
} else {
|
|
384
|
-
flags &= static_cast<uint8_t>(~Flags::CACHE_PRIORITY);
|
|
385
|
-
}
|
|
386
|
-
}
|
|
387
|
-
|
|
388
|
-
bool IsInClock() const {
|
|
389
|
-
return GetClockPriority() != ClockHandle::ClockPriority::NONE;
|
|
390
|
-
}
|
|
391
|
-
|
|
392
|
-
ClockPriority GetClockPriority() const {
|
|
393
|
-
return static_cast<ClockPriority>(flags & Flags::CLOCK_PRIORITY);
|
|
394
|
-
}
|
|
395
|
-
|
|
396
|
-
void SetClockPriority(ClockPriority priority) {
|
|
397
|
-
flags &= static_cast<uint8_t>(~Flags::CLOCK_PRIORITY);
|
|
398
|
-
flags |= priority;
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
void DecreaseClockPriority() {
|
|
402
|
-
uint8_t p = static_cast<uint8_t>(flags & Flags::CLOCK_PRIORITY) >>
|
|
403
|
-
kClockPriorityOffset;
|
|
404
|
-
assert(p > 0);
|
|
405
|
-
p--;
|
|
406
|
-
flags &= static_cast<uint8_t>(~Flags::CLOCK_PRIORITY);
|
|
407
|
-
ClockPriority new_priority =
|
|
408
|
-
static_cast<ClockPriority>(p << kClockPriorityOffset);
|
|
409
|
-
flags |= new_priority;
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
bool IsDetached() { return detached; }
|
|
413
|
-
|
|
414
|
-
void SetDetached() { detached = true; }
|
|
415
|
-
|
|
416
|
-
inline bool IsEmpty() const {
|
|
417
|
-
return !this->IsElement() && this->displacements == 0;
|
|
418
|
-
}
|
|
419
|
-
|
|
420
|
-
inline bool IsTombstone() const {
|
|
421
|
-
return !this->IsElement() && this->displacements > 0;
|
|
422
|
-
}
|
|
423
|
-
|
|
424
|
-
inline bool Matches(const Slice& some_key, uint32_t some_hash) const {
|
|
425
|
-
return this->hash == some_hash && this->key() == some_key;
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
// refs functions.
|
|
429
|
-
|
|
430
|
-
inline bool WillBeDeleted() const { return refs & WILL_BE_DELETED; }
|
|
431
|
-
|
|
432
|
-
void SetWillBeDeleted(bool will_be_deleted) {
|
|
433
|
-
if (will_be_deleted) {
|
|
434
|
-
refs |= WILL_BE_DELETED;
|
|
435
|
-
} else {
|
|
436
|
-
refs &= ~WILL_BE_DELETED;
|
|
437
|
-
}
|
|
438
|
-
}
|
|
439
|
-
|
|
440
|
-
uint32_t ExternalRefs() const {
|
|
441
|
-
return (refs & EXTERNAL_REFS) >> kExternalRefsOffset;
|
|
442
|
-
}
|
|
443
|
-
|
|
444
|
-
// Tries to take an internal ref. Returns true iff it succeeds.
|
|
445
|
-
inline bool TryInternalRef() {
|
|
446
|
-
if (!((refs += kOneInternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
|
|
447
|
-
return true;
|
|
448
|
-
}
|
|
449
|
-
refs -= kOneInternalRef;
|
|
450
|
-
return false;
|
|
451
|
-
}
|
|
452
|
-
|
|
453
|
-
// Tries to take an external ref. Returns true iff it succeeds.
|
|
454
|
-
inline bool TryExternalRef() {
|
|
455
|
-
if (!((refs += kOneExternalRef) & EXCLUSIVE_REF)) {
|
|
456
|
-
return true;
|
|
457
|
-
}
|
|
458
|
-
refs -= kOneExternalRef;
|
|
459
|
-
return false;
|
|
460
|
-
}
|
|
461
|
-
|
|
462
|
-
// Tries to take an exclusive ref. Returns true iff it succeeds.
|
|
463
|
-
// TODO(Guido) After every TryExclusiveRef call, we always call
|
|
464
|
-
// WillBeDeleted(). We could save an atomic read by having an output parameter
|
|
465
|
-
// with the last value of refs.
|
|
466
|
-
inline bool TryExclusiveRef() {
|
|
467
|
-
uint32_t will_be_deleted = refs & WILL_BE_DELETED;
|
|
468
|
-
uint32_t expected = will_be_deleted;
|
|
469
|
-
return refs.compare_exchange_strong(expected,
|
|
470
|
-
EXCLUSIVE_REF | will_be_deleted);
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
// Repeatedly tries to take an exclusive reference, but aborts as soon
|
|
474
|
-
// as an external or exclusive reference is detected (since the wait
|
|
475
|
-
// would presumably be too long).
|
|
476
|
-
inline bool SpinTryExclusiveRef() {
|
|
477
|
-
uint32_t expected = 0;
|
|
478
|
-
uint32_t will_be_deleted = 0;
|
|
479
|
-
uint32_t spins = kSpinsPerTry;
|
|
480
|
-
while (!refs.compare_exchange_strong(expected,
|
|
481
|
-
EXCLUSIVE_REF | will_be_deleted) &&
|
|
482
|
-
spins--) {
|
|
483
|
-
std::this_thread::yield();
|
|
484
|
-
if (expected & (EXTERNAL_REFS | EXCLUSIVE_REF)) {
|
|
485
|
-
return false;
|
|
486
|
-
}
|
|
487
|
-
will_be_deleted = expected & WILL_BE_DELETED;
|
|
488
|
-
expected = will_be_deleted;
|
|
489
|
-
}
|
|
490
|
-
return true;
|
|
491
|
-
}
|
|
492
|
-
|
|
493
|
-
// Take an external ref, assuming there is already one external ref
|
|
494
|
-
// to the handle.
|
|
495
|
-
void Ref() {
|
|
496
|
-
// TODO(Guido) Is it okay to assume that the existing external reference
|
|
497
|
-
// survives until this function returns?
|
|
498
|
-
refs += kOneExternalRef;
|
|
499
|
-
}
|
|
500
|
-
|
|
501
|
-
inline void ReleaseExternalRef() { refs -= kOneExternalRef; }
|
|
502
|
-
|
|
503
|
-
inline void ReleaseInternalRef() { refs -= kOneInternalRef; }
|
|
504
|
-
|
|
505
|
-
inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); }
|
|
506
|
-
|
|
507
|
-
// Downgrade an exclusive ref to external.
|
|
508
|
-
inline void ExclusiveToExternalRef() {
|
|
509
|
-
refs += kOneExternalRef;
|
|
510
|
-
ReleaseExclusiveRef();
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
// Convert an internal ref into external.
|
|
514
|
-
inline void InternalToExternalRef() {
|
|
515
|
-
refs += kOneExternalRef - kOneInternalRef;
|
|
516
|
-
}
|
|
517
|
-
|
|
385
|
+
// True iff the handle is allocated separately from hash table.
|
|
386
|
+
bool detached = false;
|
|
518
387
|
}; // struct ClockHandle
 
 class ClockHandleTable {
  public:
-  explicit ClockHandleTable(
+  explicit ClockHandleTable(int hash_bits, bool initial_charge_metadata);
   ~ClockHandleTable();
 
-
-
-
-  ClockHandle* Lookup(const Slice& key, uint32_t hash);
-
-  // Inserts a copy of h into the hash table. Returns a pointer to the
-  // inserted handle, or nullptr if no available slot was found. Every
-  // existing visible handle matching the key is already present in the
-  // hash table is marked as WILL_BE_DELETED. The deletion is also attempted,
-  // and, if the attempt is successful, the handle is inserted into the
-  // autovector deleted. When take_reference is true, the function hands
-  // over an external reference on the handle, and otherwise no reference is
-  // produced.
-  ClockHandle* Insert(ClockHandle* h, autovector<ClockHandle>* deleted,
-                      bool take_reference);
-
-  // Assigns h the appropriate clock priority, making it evictable.
-  void ClockOn(ClockHandle* h);
-
-  // Makes h non-evictable.
-  void ClockOff(ClockHandle* h);
-
-  // Runs the clock eviction algorithm until usage_ + charge is at most
-  // capacity_.
-  void ClockRun(size_t charge);
-
-  // Remove h from the hash table. Requires an exclusive ref to h.
-  void Remove(ClockHandle* h, autovector<ClockHandle>* deleted);
-
-  // Remove from the hash table all handles with matching key/hash along a
-  // probe sequence, starting from the given probe number. Doesn't
-  // require any references.
-  void RemoveAll(const Slice& key, uint32_t hash, uint32_t& probe,
-                 autovector<ClockHandle>* deleted);
-
-  void RemoveAll(const Slice& key, uint32_t hash,
-                 autovector<ClockHandle>* deleted) {
-    uint32_t probe = 0;
-    RemoveAll(key, hash, probe, deleted);
-  }
+  Status Insert(const ClockHandleMoreData& proto, ClockHandle** handle,
+                Cache::Priority priority, size_t capacity,
+                bool strict_capacity_limit);
 
-
-
-  bool
-
-  // Similar to TryRemove, except that it spins, increasing the chances of
-  // success. Requires that the caller thread has no shared ref to h.
-  bool SpinTryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);
-
-  // Call this function after an Insert, Remove, RemoveAll, TryRemove
-  // or SpinTryRemove. It frees the deleted values and updates the hash table
-  // metadata.
-  void Free(autovector<ClockHandle>* deleted);
-
-  void ApplyToEntriesRange(std::function<void(ClockHandle*)> func,
-                           uint32_t index_begin, uint32_t index_end,
-                           bool apply_if_will_be_deleted) {
-    for (uint32_t i = index_begin; i < index_end; i++) {
-      ClockHandle* h = &array_[i];
-      if (h->TryExclusiveRef()) {
-        if (h->IsElement() &&
-            (apply_if_will_be_deleted || !h->WillBeDeleted())) {
-          func(h);
-        }
-        h->ReleaseExclusiveRef();
-      }
-    }
-  }
+  ClockHandle* Lookup(const CacheKeyBytes& key, uint32_t hash);
+
+  bool Release(ClockHandle* handle, bool useful, bool erase_if_last_ref);
 
-  void
+  void Ref(ClockHandle& handle);
+
+  void Erase(const CacheKeyBytes& key, uint32_t hash);
+
+  void ConstApplyToEntriesRange(std::function<void(const ClockHandle&)> func,
                                 uint32_t index_begin, uint32_t index_end,
-                                bool apply_if_will_be_deleted) const
-
-
-    // We take an external ref because we are handing over control
-    // to a user-defined function, and because the handle will not be
-    // modified.
-    if (h->TryExternalRef()) {
-      if (h->IsElement() &&
-          (apply_if_will_be_deleted || !h->WillBeDeleted())) {
-        func(h);
-      }
-      h->ReleaseExternalRef();
-    }
-  }
-  }
+                                bool apply_if_will_be_deleted) const;
+
+  void EraseUnRefEntries();
 
   uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; }
 
@@ -615,22 +415,29 @@ class ClockHandleTable {
 
   uint32_t GetOccupancyLimit() const { return occupancy_limit_; }
 
-  uint32_t GetOccupancy() const {
+  uint32_t GetOccupancy() const {
+    return occupancy_.load(std::memory_order_relaxed);
+  }
 
-  size_t GetUsage() const { return usage_; }
+  size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
 
-  size_t
+  size_t GetDetachedUsage() const {
+    return detached_usage_.load(std::memory_order_relaxed);
+  }
 
-
+  // Acquire/release N references
+  void TEST_RefN(ClockHandle& handle, size_t n);
+  void TEST_ReleaseN(ClockHandle* handle, size_t n);
 
+ private: // functions
   // Returns x mod 2^{length_bits_}.
   uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }
 
-
-  //
-  //
-
-
+  // Runs the clock eviction algorithm trying to reclaim at least
+  // requested_charge. Returns how much is evicted, which could be less
+  // if it appears impossible to evict the requested amount without blocking.
+  void Evict(size_t requested_charge, size_t* freed_charge,
+             uint32_t* freed_count);
 
   // Returns the first slot in the probe sequence, starting from the given
   // probe number, with a handle e such that match(e) is true. At every
@@ -643,26 +450,17 @@ class ClockHandleTable {
   // value of probe is one more than the last non-aborting probe during the
   // call. This is so that that the variable can be used to keep track of
   // progress across consecutive calls to FindSlot.
-  inline ClockHandle* FindSlot(
+  inline ClockHandle* FindSlot(uint32_t hash,
                                std::function<bool(ClockHandle*)> match,
                                std::function<bool(ClockHandle*)> stop,
                                std::function<void(ClockHandle*)> update,
                                uint32_t& probe);
 
-  //
-  //
-
-  // attempted, and when the attempt succeeds the slot is assigned to
-  // the new copy of the element.
-  ClockHandle* FindAvailableSlot(const Slice& key, uint32_t hash,
-                                 uint32_t& probe,
-                                 autovector<ClockHandle>* deleted);
-
-  // After a failed FindSlot call (i.e., with answer -1) in
-  // FindAvailableSlot, this function fixes all displacements's
-  // starting from the 0-th probe, until the given probe.
-  void Rollback(const Slice& key, uint32_t probe);
+  // Re-decrement all displacements in probe path starting from beginning
+  // until (not including) the given handle
+  void Rollback(uint32_t hash, const ClockHandle* h);
 
+ private: // data
   // Number of hash bits used for table index.
   // The size of the table is 1 << length_bits_.
   const int length_bits_;
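FindSlot keeps per-slot displacement counters up to date through its update callback, and the reworked Rollback walks the same probe path to undo those increments when an insertion is abandoned. The following is an illustrative, self-contained sketch of that displacement/rollback idea under simplified assumptions (single-threaded, power-of-two table, made-up names); it is not the RocksDB implementation.

```cpp
#include <cstdint>
#include <functional>
#include <memory>

// Illustrative only: open-addressing probe with per-slot displacement
// counters, in the spirit of FindSlot/Rollback above.
struct Slot {
  uint32_t displacements = 0;  // probe paths that passed over this slot
  bool occupied = false;
};

class ProbeTable {
 public:
  explicit ProbeTable(uint32_t size)  // size must be a power of two
      : size_(size), slots_(new Slot[size]) {}

  // Walks the probe path for `hash`; returns the first slot accepted by
  // `match`, or nullptr if `stop` aborts or the table is exhausted. Every
  // slot passed over gets its displacement bumped; `passed` (caller
  // initializes it to 0) counts how many were bumped.
  Slot* FindSlot(uint32_t hash, const std::function<bool(Slot*)>& match,
                 const std::function<bool(Slot*)>& stop, uint32_t& passed) {
    const uint32_t increment = (hash << 1) | 1;  // odd step covers all slots
    for (uint32_t i = 0; i < size_; ++i) {
      Slot* s = &slots_[(hash + i * increment) & (size_ - 1)];
      if (match(s)) return s;
      if (stop(s)) return nullptr;
      ++s->displacements;
      ++passed;
    }
    return nullptr;
  }

  // Undoes the displacement bumps recorded by an abandoned FindSlot walk,
  // i.e. re-decrements the first `passed` slots on the same probe path.
  void Rollback(uint32_t hash, uint32_t passed) {
    const uint32_t increment = (hash << 1) | 1;
    for (uint32_t i = 0; i < passed; ++i) {
      --slots_[(hash + i * increment) & (size_ - 1)].displacements;
    }
  }

 private:
  uint32_t size_;
  std::unique_ptr<Slot[]> slots_;
};
```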
@@ -673,27 +471,26 @@ class ClockHandleTable {
   // Maximum number of elements the user can store in the table.
   const uint32_t occupancy_limit_;
 
-  //
-
+  // Array of slots comprising the hash table.
+  const std::unique_ptr<ClockHandle[]> array_;
 
   // We partition the following members into different cache lines
   // to avoid false sharing among Lookup, Release, Erase and Insert
   // operations in ClockCacheShard.
 
-  ALIGN_AS(CACHE_LINE_SIZE)
-  // Array of slots comprising the hash table.
-  std::unique_ptr<ClockHandle[]> array_;
-
   ALIGN_AS(CACHE_LINE_SIZE)
   // Clock algorithm sweep pointer.
-  std::atomic<
+  std::atomic<uint64_t> clock_pointer_{};
 
   ALIGN_AS(CACHE_LINE_SIZE)
   // Number of elements in the table.
-  std::atomic<uint32_t> occupancy_;
+  std::atomic<uint32_t> occupancy_{};
+
+  // Memory usage by entries tracked by the cache (including detached)
+  std::atomic<size_t> usage_{};
 
-  //
-  std::atomic<size_t>
+  // Part of usage by detached entries (not in table)
+  std::atomic<size_t> detached_usage_{};
 }; // class ClockHandleTable
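The new Evict replaces the old ClockRun: instead of evicting until usage plus an incoming charge fits under capacity, it sweeps the table trying to reclaim at least requested_charge and reports what it actually freed rather than blocking. A rough single-threaded sketch of such a CLOCK sweep follows; types, the countdown scheme, and the give-up rule are hypothetical, and the real table does this lock-free on packed per-handle state.

```cpp
#include <atomic>
#include <cstddef>
#include <cstdint>

// Illustrative CLOCK-style eviction sweep, not the RocksDB code.
struct Entry {
  std::atomic<uint32_t> countdown{0};   // >0 means "recently used"
  std::atomic<bool> referenced{false};  // pinned by a caller, cannot evict
  size_t charge = 0;
  bool occupied = false;
};

class ClockTableSketch {
 public:
  ClockTableSketch(Entry* entries, size_t n) : entries_(entries), n_(n) {}

  // Sweep until at least requested_charge is reclaimed or a bounded number
  // of slots has been examined (instead of blocking indefinitely).
  void Evict(size_t requested_charge, size_t* freed_charge,
             uint32_t* freed_count) {
    *freed_charge = 0;
    *freed_count = 0;
    size_t examined = 0;
    while (*freed_charge < requested_charge && examined < 2 * n_) {
      Entry& e = entries_[clock_pointer_.fetch_add(1) % n_];
      ++examined;
      if (!e.occupied || e.referenced.load()) continue;
      uint32_t c = e.countdown.load();
      if (c > 0) {
        e.countdown.store(c - 1);  // give the entry another chance
        continue;
      }
      // Unreferenced and out of chances: evict and account for the charge.
      *freed_charge += e.charge;
      *freed_count += 1;
      e.occupied = false;
      e.charge = 0;
    }
  }

 private:
  Entry* entries_;
  size_t n_;
  std::atomic<uint64_t> clock_pointer_{0};
};
```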
 
 // A single shard of sharded cache.
@@ -704,58 +501,34 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
                   CacheMetadataChargePolicy metadata_charge_policy);
   ~ClockCacheShard() override = default;
 
-  //
-  // if current usage is more than new capacity, the function will attempt to
-  // free the needed space.
+  // TODO: document limitations
   void SetCapacity(size_t capacity) override;
 
-  // Set the flag to reject insertion if cache if full.
   void SetStrictCapacityLimit(bool strict_capacity_limit) override;
 
-  // Like Cache methods, but with an extra "hash" parameter.
-  // Insert an item into the hash table and, if handle is null, make it
-  // evictable by the clock algorithm. Older items are evicted as necessary.
-  // If the cache is full and free_handle_on_fail is true, the item is deleted
-  // and handle is set to nullptr.
   Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
                 Cache::DeleterFn deleter, Cache::Handle** handle,
                 Cache::Priority priority) override;
 
-  Status Insert(const Slice& key, uint32_t hash, void* value,
-                const Cache::CacheItemHelper* helper, size_t charge,
-                Cache::Handle** handle, Cache::Priority priority) override {
-    return Insert(key, hash, value, charge, helper->del_cb, handle, priority);
-  }
-
-  Cache::Handle* Lookup(const Slice& key, uint32_t hash,
-                        const Cache::CacheItemHelper* /*helper*/,
-                        const Cache::CreateCallback& /*create_cb*/,
-                        Cache::Priority /*priority*/, bool /*wait*/,
-                        Statistics* /*stats*/) override {
-    return Lookup(key, hash);
-  }
-
   Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
 
-  bool Release(Cache::Handle* handle, bool
-               bool erase_if_last_ref) override
-    return Release(handle, erase_if_last_ref);
-  }
-
-  bool IsReady(Cache::Handle* /*handle*/) override { return true; }
+  bool Release(Cache::Handle* handle, bool useful,
+               bool erase_if_last_ref) override;
 
-
+  bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
 
   bool Ref(Cache::Handle* handle) override;
 
-  bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
-
   void Erase(const Slice& key, uint32_t hash) override;
 
   size_t GetUsage() const override;
 
   size_t GetPinnedUsage() const override;
 
+  size_t GetOccupancyCount() const override;
+
+  size_t GetTableAddressCount() const override;
+
   void ApplyToSomeEntries(
       const std::function<void(const Slice& key, void* value, size_t charge,
                                DeleterFn deleter)>& callback,
@@ -765,45 +538,64 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
 
   std::string GetPrintableOptions() const override { return std::string{}; }
 
-
+  // SecondaryCache not yet supported
+  Status Insert(const Slice& key, uint32_t hash, void* value,
+                const Cache::CacheItemHelper* helper, size_t charge,
+                Cache::Handle** handle, Cache::Priority priority) override {
+    return Insert(key, hash, value, charge, helper->del_cb, handle, priority);
+  }
+
+  Cache::Handle* Lookup(const Slice& key, uint32_t hash,
+                        const Cache::CacheItemHelper* /*helper*/,
+                        const Cache::CreateCallback& /*create_cb*/,
+                        Cache::Priority /*priority*/, bool /*wait*/,
+                        Statistics* /*stats*/) override {
+    return Lookup(key, hash);
+  }
+
+  bool IsReady(Cache::Handle* /*handle*/) override { return true; }
+
+  void Wait(Cache::Handle* /*handle*/) override {}
+
+  // Acquire/release N references
+  void TEST_RefN(Cache::Handle* handle, size_t n);
+  void TEST_ReleaseN(Cache::Handle* handle, size_t n);
+
+ private: // functions
   friend class ClockCache;
   friend class ClockCacheTest;
 
-  ClockHandle* DetachedInsert(
-
-  // Returns the charge of a single handle.
-  static size_t CalcEstimatedHandleCharge(
-      size_t estimated_value_size,
-      CacheMetadataChargePolicy metadata_charge_policy);
+  ClockHandle* DetachedInsert(const ClockHandleMoreData& h);
 
   // Returns the number of bits used to hash an element in the hash
   // table.
   static int CalcHashBits(size_t capacity, size_t estimated_value_size,
                           CacheMetadataChargePolicy metadata_charge_policy);
 
-  //
-
+ private: // data
+  ClockHandleTable table_;
 
-  //
-  std::atomic<size_t>
+  // Maximum total charge of all elements stored in the table.
+  std::atomic<size_t> capacity_;
 
-
+  // Whether to reject insertion if cache reaches its full capacity.
+  std::atomic<bool> strict_capacity_limit_;
 }; // class ClockCacheShard
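Several counters and settings that used to be plain fields are now std::atomic and are accessed with memory_order_relaxed (occupancy_, usage_, detached_usage_ in the table; capacity_ and strict_capacity_limit_ in the shard). Relaxed ordering is sufficient because these getters only need an approximately current value, not synchronization with other memory operations. A small sketch of the pattern, with hypothetical names:

```cpp
#include <atomic>
#include <cstddef>

// Statistics/config fields read and written with relaxed ordering: callers
// tolerate slightly stale values, so no memory fences are required.
class ShardStatsSketch {
 public:
  size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
  size_t GetCapacity() const {
    return capacity_.load(std::memory_order_relaxed);
  }
  void AddUsage(size_t charge) {
    usage_.fetch_add(charge, std::memory_order_relaxed);
  }
  void SetCapacity(size_t c) {
    capacity_.store(c, std::memory_order_relaxed);
  }

 private:
  std::atomic<size_t> usage_{0};
  std::atomic<size_t> capacity_{0};
};
```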
 
-class
+class HyperClockCache
 #ifdef NDEBUG
     final
 #endif
     : public ShardedCache {
  public:
-
-
-
-
+  HyperClockCache(size_t capacity, size_t estimated_value_size,
+                  int num_shard_bits, bool strict_capacity_limit,
+                  CacheMetadataChargePolicy metadata_charge_policy =
+                      kDontChargeCacheMetadata);
 
-  ~
+  ~HyperClockCache() override;
 
-  const char* Name() const override { return "
+  const char* Name() const override { return "HyperClockCache"; }
 
   CacheShard* GetShard(uint32_t shard) override;
@@ -823,15 +615,8 @@ class ClockCache
   ClockCacheShard* shards_ = nullptr;
 
   int num_shards_;
-}; // class
-
-} // namespace clock_cache
+}; // class HyperClockCache
 
-//
-// TODO(Guido) Remove once NewClockCache constructs a ClockCache again.
-extern std::shared_ptr<Cache> ExperimentalNewClockCache(
-    size_t capacity, size_t estimated_value_size, int num_shard_bits,
-    bool strict_capacity_limit,
-    CacheMetadataChargePolicy metadata_charge_policy);
+} // namespace hyper_clock_cache
 
 } // namespace ROCKSDB_NAMESPACE
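The class is renamed to HyperClockCache (namespace hyper_clock_cache) and the ExperimentalNewClockCache factory is dropped, leaving the constructor declared above as the direct construction path. A hypothetical sketch, purely to illustrate the constructor parameters; this is an internal header, and applications would normally obtain a Cache through RocksDB's public factory APIs rather than instantiating this type.

```cpp
#include <memory>

#include "cache/clock_cache.h"  // internal header shown in this diff
#include "rocksdb/cache.h"

// Hypothetical helper; parameter values are arbitrary examples.
std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeHyperClockCacheForTest() {
  using ROCKSDB_NAMESPACE::hyper_clock_cache::HyperClockCache;
  return std::make_shared<HyperClockCache>(
      /*capacity=*/size_t{64} << 20,     // 64 MiB total charge
      /*estimated_value_size=*/4 << 10,  // sizes the handle table
      /*num_shard_bits=*/6,              // 2^6 = 64 shards
      /*strict_capacity_limit=*/false);  // metadata charge policy defaulted
}
```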