@nxtedition/rocksdb 7.1.5 → 7.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +32 -14
- package/deps/rocksdb/rocksdb/cache/cache.cc +4 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +6 -8
- package/deps/rocksdb/rocksdb/cache/cache_key.cc +184 -164
- package/deps/rocksdb/rocksdb/cache/cache_key.h +38 -29
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +4 -4
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +4 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +11 -9
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +1 -1
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +28 -18
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +86 -17
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +48 -8
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +356 -153
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +3 -7
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +4 -5
- package/deps/rocksdb/rocksdb/db/blob/blob_source.h +2 -3
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +12 -4
- package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +69 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +6 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +4 -1
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +222 -182
- package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +239 -23
- package/deps/rocksdb/rocksdb/db/db_test2.cc +6 -2
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +2 -1
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +6 -0
- package/deps/rocksdb/rocksdb/db/import_column_family_job.h +6 -0
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +6 -0
- package/deps/rocksdb/rocksdb/db/kv_checksum.h +8 -4
- package/deps/rocksdb/rocksdb/db/memtable.cc +173 -33
- package/deps/rocksdb/rocksdb/db/memtable.h +10 -0
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -1
- package/deps/rocksdb/rocksdb/db/version_set.cc +37 -18
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +6 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +15 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +31 -6
- package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +1 -1
- package/deps/rocksdb/rocksdb/options/cf_options.cc +4 -0
- package/deps/rocksdb/rocksdb/options/cf_options.h +4 -0
- package/deps/rocksdb/rocksdb/options/options_helper.cc +2 -0
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -6
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +1 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +2 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -7
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +2 -1
- package/deps/rocksdb/rocksdb/table/unique_id.cc +22 -24
- package/deps/rocksdb/rocksdb/table/unique_id_impl.h +2 -1
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +7 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +27 -3
- package/deps/rocksdb/rocksdb/util/async_file_reader.cc +2 -1
- package/deps/rocksdb/rocksdb/util/async_file_reader.h +3 -3
- package/deps/rocksdb/rocksdb/util/coro_utils.h +2 -1
- package/deps/rocksdb/rocksdb/util/hash_test.cc +67 -0
- package/deps/rocksdb/rocksdb/util/math.h +41 -0
- package/deps/rocksdb/rocksdb/util/math128.h +6 -0
- package/deps/rocksdb/rocksdb/util/single_thread_executor.h +2 -1
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +3 -6
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +5 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_lock_manager.h +6 -0
- package/index.js +15 -6
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/darwin-x64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
package/binding.cc
CHANGED
@@ -1477,7 +1477,7 @@ NAPI_METHOD(batch_clear) {
 }

 NAPI_METHOD(batch_write) {
-  NAPI_ARGV(
+  NAPI_ARGV(4);

   Database* database;
   NAPI_STATUS_THROWS(napi_get_value_external(env, argv[0], reinterpret_cast<void**>(&database)));
@@ -1485,8 +1485,34 @@ NAPI_METHOD(batch_write) {
   rocksdb::WriteBatch* batch;
   NAPI_STATUS_THROWS(napi_get_value_external(env, argv[1], reinterpret_cast<void**>(&batch)));

-
-
+  auto options = argv[2];
+  auto callback = argv[3];
+
+  std::optional<bool> sync;
+  NAPI_STATUS_THROWS(GetProperty(env, options, "sync", sync));
+
+  if (sync) {
+    napi_ref batchRef;
+    NAPI_STATUS_THROWS(napi_create_reference(env, argv[1], 1, &batchRef));
+
+    struct State {};
+    runAsync<State>(
+        "leveldown.batch.write", env, callback,
+        [=](auto& state) {
+          rocksdb::WriteOptions writeOptions;
+          writeOptions.sync = *sync;
+          return database->db->Write(writeOptions, batch);
+        },
+        [=](auto& state, auto env, auto& argv) { return napi_delete_reference(env, batchRef); });
+  } else {
+    rocksdb::WriteOptions writeOptions;
+    ROCKS_STATUS_THROWS_NAPI(database->db->Write(writeOptions, batch));
+
+    napi_value global;
+    NAPI_STATUS_THROWS(napi_get_global(env, &global));
+
+    NAPI_STATUS_THROWS(napi_call_function(env, global, callback, 0, nullptr, nullptr));
+  }

   return 0;
 }
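For reference, what the two branches above reduce to at the RocksDB layer: with writeOptions.sync = true, RocksDB fsyncs the WAL before acknowledging the write, which is slow enough to justify the async worker path; without it, the write returns quickly and the callback can be invoked inline. A minimal standalone sketch of a batched write through the public RocksDB API (not part of this package; the path and keys are illustrative):

// Hedged sketch of rocksdb::DB::Write with a WriteBatch.
#include <cassert>
#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/write_batch.h>

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/batch_write_demo", &db);
  assert(s.ok());

  rocksdb::WriteBatch batch;
  s = batch.Put("key1", "value1");
  assert(s.ok());
  s = batch.Put("key2", "value2");
  assert(s.ok());

  rocksdb::WriteOptions writeOptions;
  writeOptions.sync = true;  // fsync the WAL before acknowledging the write
  s = db->Write(writeOptions, &batch);
  assert(s.ok());

  delete db;
  return 0;
}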
@@ -1563,10 +1589,7 @@ NAPI_METHOD(db_get_sorted_wal_files) {
   auto callback = argv[1];

   runAsync<rocksdb::VectorLogPtr>(
-      "leveldown.open", env, callback,
-      [=](auto& files) {
-        return database->db->GetSortedWalFiles(files);
-      },
+      "leveldown.open", env, callback, [=](auto& files) { return database->db->GetSortedWalFiles(files); },
       [=](auto& files, auto env, auto& argv) {
         argv.resize(2);

@@ -1620,13 +1643,8 @@ NAPI_METHOD(db_flush_wal) {
   auto callback = argv[2];

   runAsync<bool>(
-      "leveldown.
-      [=](auto& state) {
-        return database->db->FlushWAL(sync);
-      },
-      [=](auto& state, auto env, auto& argv) {
-        return napi_ok;
-      });
+      "leveldown.flushWal", env, callback, [=](auto& state) { return database->db->FlushWAL(sync); },
+      [=](auto& state, auto env, auto& argv) { return napi_ok; });

   return 0;
 }
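Both methods reshaped above are thin wrappers over public RocksDB calls. A minimal sketch of those calls, assuming a `db` opened as in the previous sketch:

// Hedged sketch of the RocksDB calls behind db_flush_wal and
// db_get_sorted_wal_files.
#include <cassert>
#include <rocksdb/db.h>
#include <rocksdb/transaction_log.h>

void FlushAndListWal(rocksdb::DB* db) {
  // FlushWAL(true) flushes buffered WAL writes and fsyncs the log file.
  rocksdb::Status s = db->FlushWAL(/*sync=*/true);
  assert(s.ok());

  // VectorLogPtr holds LogFile handles, sorted oldest-first.
  rocksdb::VectorLogPtr files;
  s = db->GetSortedWalFiles(files);
  assert(s.ok());
}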
package/deps/rocksdb/rocksdb/cache/cache.cc
CHANGED
@@ -33,6 +33,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
      {offsetof(struct LRUCacheOptions, high_pri_pool_ratio),
       OptionType::kDouble, OptionVerificationType::kNormal,
       OptionTypeFlags::kMutable}},
+    {"low_pri_pool_ratio",
+     {offsetof(struct LRUCacheOptions, low_pri_pool_ratio),
+      OptionType::kDouble, OptionVerificationType::kNormal,
+      OptionTypeFlags::kMutable}},
 };

 static std::unordered_map<std::string, OptionTypeInfo>
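low_pri_pool_ratio is the low-priority counterpart to high_pri_pool_ratio, registered above so it can be set through the options-string machinery. A minimal sketch of setting it programmatically, assuming the bundled RocksDB headers expose the new field (which this diff adds); the sizes and ratios are arbitrary:

// Hedged sketch: configuring an LRU block cache with the new low-priority pool.
#include <memory>
#include <rocksdb/cache.h>

int main() {
  rocksdb::LRUCacheOptions opts;
  opts.capacity = 64 << 20;        // 64 MiB
  opts.num_shard_bits = 6;
  opts.high_pri_pool_ratio = 0.5;  // e.g. for index/filter blocks
  opts.low_pri_pool_ratio = 0.2;   // the field this diff adds
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(opts);
  return cache != nullptr ? 0 : 1;
}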
package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc
CHANGED
@@ -304,7 +304,9 @@ class CacheBench {
         FLAGS_cache_size, FLAGS_value_bytes, FLAGS_num_shard_bits,
         false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy);
   } else if (FLAGS_cache_type == "lru_cache") {
-    LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
+    LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
+                         false /* strict_capacity_limit */,
+                         0.5 /* high_pri_pool_ratio */);
 #ifndef ROCKSDB_LITE
     if (!FLAGS_secondary_cache_uri.empty()) {
       Status s = SecondaryCache::CreateFromString(
@@ -806,7 +808,6 @@ class StressCacheKey {

     uint64_t max_file_count =
         uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
-    uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t{1024} * 1024U;
     uint32_t report_count = 0;
     uint32_t collisions_this_run = 0;
     size_t db_i = 0;
@@ -834,8 +835,7 @@
     }
     bool is_stable;
     BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
-                                       /* ignored */ 42,
-                                       &is_stable);
+                                       /* ignored */ 42, &ock, &is_stable);
     assert(is_stable);
     // Get a representative cache key, which later we analytically generalize
     // to a range.
@@ -845,13 +845,11 @@
       reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
     } else if (FLAGS_sck_footer_unique_id) {
       // Special case: keep only file number, not session counter
-
-      uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
-      reduced_key = (uint64_t{a} << 32) + b;
+      reduced_key = DecodeFixed64(ck.AsSlice().data()) >> shift_away;
     } else {
       // Try to keep file number and session counter (shift away other bits)
       uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
-      uint32_t b = DecodeFixed32(ck.AsSlice().data() +
+      uint32_t b = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_b;
       reduced_key = (uint64_t{a} << 32) + b;
     }
     if (reduced_key == 0) {
package/deps/rocksdb/rocksdb/cache/cache_key.cc
CHANGED
@@ -17,7 +17,7 @@ namespace ROCKSDB_NAMESPACE {

 // Value space plan for CacheKey:
 //
-//
+// file_num_etc64_ | offset_etc64_ | Only generated by
 // ---------------+---------------+------------------------------------------
 // 0              | 0             | Reserved for "empty" CacheKey()
 // 0              | > 0, < 1<<63  | CreateUniqueForCacheLifetime
@@ -44,7 +44,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
   return CacheKey(0, id);
 }

-//
+// How we generate CacheKeys and base OffsetableCacheKey, assuming that
 // db_session_ids are generated from a base_session_id and
 // session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId
 // in DBImpl::GenerateDbSessionId):
@@ -56,63 +56,108 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //   base_session_id (unstructured, from GenerateRawUniqueId)
 //   session_id_counter (structured)
 //     * usually much smaller than 2**24
-//
+//   orig_file_number (structured)
 //     * usually smaller than 2**24
 //   offset_in_file (structured, might skip lots of values)
 //     * usually smaller than 2**32
-// max_offset determines placement of file_number to prevent
-// overlapping with offset
 //
-//
-//
-//
-//
+// Overall approach (see https://github.com/pdillinger/unique_id for
+// background):
+//
+// First, we have three "structured" values, up to 64 bits each, that we
+// need to fit, without losses, into 128 bits. In practice, the values will
+// be small enough that they should fit. For example, applications generating
+// large SST files (large offsets) will naturally produce fewer files (small
+// file numbers). But we don't know ahead of time what bounds the values will
+// have.
+//
+// Second, we have unstructured inputs that enable distinct RocksDB processes
+// to pick a random point in space, likely very different from others. Xoring
+// the structured with the unstructured give us a cache key that is
+// structurally distinct between related keys (e.g. same file or same RocksDB
+// process) and distinct with high probability between unrelated keys.
+//
+// The problem of packing three structured values into the space for two is
+// complicated by the fact that we want to derive cache keys from SST unique
+// IDs, which have already combined structured and unstructured inputs in a
+// practically inseparable way. And we want a base cache key that works
+// with an offset of any size. So basically, we need to encode these three
+// structured values, each up to 64 bits, into 128 bits without knowing any
+// of their sizes. The DownwardInvolution() function gives us a mechanism to
+// accomplish this. (See its properties in math.h.) Specifically, for inputs
+// a, b, and c:
+//   lower64 = DownwardInvolution(a) ^ ReverseBits(b);
+//   upper64 = c ^ ReverseBits(a);
+// The 128-bit output is unique assuming there exist some i, j, and k
+// where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and
+// i + j + k <= 128. In other words, as long as there exist some bounds
+// that would allow us to pack the bits of a, b, and c into the output
+// if we know the bound, we can generate unique outputs without knowing
+// those bounds. To validate this claim, the inversion function (given
+// the bounds) has been implemented in CacheKeyDecoder in
+// db_block_cache_test.cc.
+//
+// With that in mind, the outputs in terms of the conceptual inputs look
+// like this, using bitwise-xor of the constituent pieces, low bits on left:
+//
+// |------------------------- file_num_etc64 -------------------------|
+// | +++++++++ base_session_id (lower 64 bits, involution) +++++++++  |
+// |-----------------------------------------------------------------|
+// | session_id_counter (involution) ..... |                          |
 // |-----------------------------------------------------------------|
-// |
+// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
+// |  * base_session_id (upper ~39 bits)                              |
+// |  * db_id (~122 bits entropy)                                     |
 // |-----------------------------------------------------------------|
-// |
-// | | overflow & meta |
+// |                             | ..... orig_file_number (reversed)  |
 // |-----------------------------------------------------------------|
 //
 //
 // |------------------------- offset_etc64 --------------------------|
-// |
-// | * base_session_id (upper ~39 bits) |
-// | * db_id (~122 bits entropy) |
+// | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++  |
 // |-----------------------------------------------------------------|
-// |
+// |                           | ..... session_id_counter (reversed)  |
 // |-----------------------------------------------------------------|
-// |
-// | | lower bytes |
+// | offset_in_file ............... |                                 |
 // |-----------------------------------------------------------------|
 //
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
-//
+// Some oddities or inconveniences of this layout are due to deriving
+// the "base" cache key (without offset) from the SST unique ID (see
+// GetSstInternalUniqueId). Specifically,
+//  * Lower 64 of base_session_id occurs in both output words (ok but
+//    weird)
+//  * The inclusion of db_id is bad for the conditions under which we
+//    can guarantee uniqueness, but could be useful in some cases with
+//    few small files per process, to make up for db session id only having
+//    ~103 bits of entropy.
+//
+// In fact, if DB ids were not involved, we would be guaranteed unique
+// cache keys for files generated in a single process until total bits for
+// biggest session_id_counter, orig_file_number, and offset_in_file
+// reach 128 bits.
+//
+// With the DB id limitation, we only have nice guaranteed unique cache
+// keys for files generated in a single process until biggest
+// session_id_counter and offset_in_file reach combined 64 bits. This
+// is quite good in practice because we can have millions of DB Opens
+// with terabyte size SST files, or billions of DB Opens with gigabyte
+// size SST files.
+//
+// One of the considerations in the translation between existing SST unique
+// IDs and base cache keys is supporting better SST unique IDs in a future
+// format_version. If we use a process-wide file counter instead of
+// session counter and file numbers, we only need to combine two 64-bit values
+// instead of three. But we don't want to track unique ID versions in the
+// manifest, so we want to keep the same translation layer between SST unique
+// IDs and base cache keys, even with updated SST unique IDs. If the new
+// unique IDs put the file counter where the orig_file_number was, and
+// use no structured field where session_id_counter was, then our translation
+// layer works fine for two structured fields as well as three (for
+// compatibility). The small computation for the translation (one
+// DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep)
+// is negligible for computing as part of SST file reader open.
+//
+// More on how https://github.com/pdillinger/unique_id applies here:
 // Every bit of output always includes "unstructured" uniqueness bits and
 // often combines with "structured" uniqueness bits. The "unstructured" bits
 // change infrequently: only when we cannot guarantee our state tracking for
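The packing trick described above can be sanity-checked in isolation. The sketch below is not RocksDB's code: ReverseBits and DownwardInvolution are local stand-ins written to satisfy the two properties the comment relies on (each is an involution, and DownwardInvolution never moves a bit to a higher position, so v < 2**i implies DownwardInvolution(v) < 2**i); RocksDB's real implementations live in util/math.h. With small bounds (a, b, c < 2**5), the exhaustive check below confirms that no two inputs produce the same 128-bit output.

// Scaled-down check of the "pack three bounded values into 128 bits" claim.
#include <cassert>
#include <cstdint>
#include <set>
#include <utility>

// Stand-in: reverse the 64 bits of v (clearly an involution).
uint64_t ReverseBits(uint64_t v) {
  uint64_t r = 0;
  for (int i = 0; i < 64; ++i, v >>= 1) r = (r << 1) | (v & 1);
  return r;
}

// Stand-in: xor the high half of each power-of-two-aligned block into its
// low half. This is an involution (f(f(v)) == v, spot-checkable on small
// widths), and no bit ever moves upward, so v < 2**i implies f(v) < 2**i.
uint64_t DownwardInvolution(uint64_t v) {
  v ^= v >> 32;
  v ^= (v & 0xffff0000ffff0000ULL) >> 16;
  v ^= (v & 0xff00ff00ff00ff00ULL) >> 8;
  v ^= (v & 0xf0f0f0f0f0f0f0f0ULL) >> 4;
  v ^= (v & 0xccccccccccccccccULL) >> 2;
  v ^= (v & 0xaaaaaaaaaaaaaaaaULL) >> 1;
  return v;
}

// The encoding from the comment above.
std::pair<uint64_t, uint64_t> Pack(uint64_t a, uint64_t b, uint64_t c) {
  return {DownwardInvolution(a) ^ ReverseBits(b), c ^ ReverseBits(a)};
}

int main() {
  // i = j = k = 5, so i + j + k <= 128 holds with lots of room; every
  // (a, b, c) triple must map to a distinct 128-bit output.
  std::set<std::pair<uint64_t, uint64_t>> seen;
  for (uint64_t a = 0; a < 32; ++a) {
    for (uint64_t b = 0; b < 32; ++b) {
      for (uint64_t c = 0; c < 32; ++c) {
        bool fresh = seen.insert(Pack(a, b, c)).second;
        assert(fresh);
        (void)fresh;
      }
    }
  }
  return 0;
}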
@@ -141,12 +186,11 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //  128 bits cache key size
 //  - 55 <- ideal size for byte offsets + file numbers
 //  -  2 <- bits for offsets and file numbers not exactly powers of two
-//  -  2 <- bits for file number encoding metadata
 //  +  2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey
 //  ----
-//
-// The probability of a collision in
-// 1 in 2**(
+//    73 <- bits remaining for distinguishing session IDs
+// The probability of a collision in 73 bits of session ID data is less than
+// 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
 // data from the last 180 days is in cache for potential collision, and that
 // cache keys under each session id exhaustively cover the remaining 57 bits
 // while in reality they'll only cover a small fraction of it.
@@ -160,7 +204,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // Now suppose we have many DBs per host, say 2**10, with same host-wide write
 // rate and process/session lifetime. File numbers will be ~10 bits smaller
 // and we will have 2**10 times as many session IDs because of simultaneous
-// lifetimes. So now collision chance is less than 1 in 2**(
+// lifetimes. So now collision chance is less than 1 in 2**(83 - (2 * 26)),
 // or roughly 1 in a billion.
 //
 // Suppose instead we generated random or hashed cache keys for each
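One hedged reading of the two bounds above, as birthday-style estimates (our interpretation: roughly n ~ 2^16, respectively n ~ 2^26, candidate session IDs in the collision window), checks out arithmetically:

\Pr[\text{collision}] \lesssim \frac{n^2}{2^{73}} = 2^{2 \cdot 16 - 73} = 2^{-41} \approx \frac{1}{2.2 \times 10^{12}} \quad \text{(roughly 1 in a trillion)}

\Pr[\text{collision}] \lesssim \frac{n^2}{2^{83}} = 2^{2 \cdot 26 - 83} = 2^{-31} \approx \frac{1}{2.1 \times 10^{9}} \quad \text{(roughly 1 in a billion)}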
@@ -176,17 +220,17 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // activity over many months, by making some pessimistic simplifying
 // assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
 // Here is some sample output with
-// `./cache_bench -stress_cache_key -sck_keep_bits=
+// `./cache_bench -stress_cache_key -sck_keep_bits=43`:
 //
 // Total cache or DBs size: 32TiB  Writing 925.926 MiB/s or 76.2939TiB/day
-// Multiply by
+// Multiply by 1.15292e+18 to correct for simulation losses (but still
 // assume whole file cached)
 //
 // These come from default settings of 2.5M files per day of 32 MB each, and
-// `-sck_keep_bits=
-// keeping
-// contiguous keys (pessimistic), our simulation is about 2**(128-
-// about
+// `-sck_keep_bits=43` means that to represent a single file, we are only
+// keeping 43 bits of the 128-bit (base) cache key. With file size of 2**25
+// contiguous keys (pessimistic), our simulation is about 2**(128-43-25) or
+// about 1 billion billion times more prone to collision than reality.
 //
 // More default assumptions, relatively pessimistic:
 // * 100 DBs in same process (doesn't matter much)
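As an arithmetic check of the correction factor quoted above:

2^{128 - 43 - 25} = 2^{60} = 1152921504606846976 \approx 1.15292 \times 10^{18}

This also matches the `(1.15292e+19 corrected)` figure in the next hunk: an estimated 10 days between collisions, multiplied by the 2^{60} factor.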
@@ -194,49 +238,55 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //   average every 100 files generated
 // * Restart process (all new session IDs unrelated to old) 24 times per day
 //
-// After enough data, we get a result at the end (-sck_keep_bits=
+// After enough data, we get a result at the end (-sck_keep_bits=43):
 //
-// (keep
-// (
+// (keep 43 bits) 18 collisions after 2 x 90 days, est 10 days between
+// (1.15292e+19 corrected)
 //
 // If we believe the (pessimistic) simulation and the mathematical
-// extrapolation, we would need to run a billion machines all for
+// extrapolation, we would need to run a billion machines all for 11 billion
 // days to expect a cache key collision. To help verify that our extrapolation
-// ("corrected") is robust, we can make our simulation more precise
-//
+// ("corrected") is robust, we can make our simulation more precise by
+// increasing the "keep" bits, which takes more running time to get enough
 // collision data:
 //
-// (keep
-// (1.
-// (keep
-// (1.
+// (keep 44 bits) 16 collisions after 5 x 90 days, est 28.125 days between
+// (1.6213e+19 corrected)
+// (keep 45 bits) 15 collisions after 7 x 90 days, est 42 days between
+// (1.21057e+19 corrected)
+// (keep 46 bits) 15 collisions after 17 x 90 days, est 102 days between
+// (1.46997e+19 corrected)
+// (keep 47 bits) 15 collisions after 49 x 90 days, est 294 days between
+// (2.11849e+19 corrected)
 //
-// The extrapolated prediction
-// very small losses of structured data (see class StressCacheKey in
-// cache_bench_tool.cc) leading to more accurate & more attractive prediction
-// with more bits kept.
+// The extrapolated prediction seems to be within noise (sampling error).
 //
 // With the `-sck_randomize` option, we can see that typical workloads like
 // above have lower collision probability than "random" cache keys (note:
-// offsets still non-randomized) by a modest amount (roughly
-// prone than random), which should make us reasonably comfortable
-// "degenerate" cases (e.g. repeatedly launch a process to generate
-// with SstFileWriter):
+// offsets still non-randomized) by a modest amount (roughly 2-3x less
+// collision prone than random), which should make us reasonably comfortable
+// even in "degenerate" cases (e.g. repeatedly launch a process to generate
+// one file with SstFileWriter):
+//
+// (rand 43 bits) 22 collisions after 1 x 90 days, est 4.09091 days between
+// (4.7165e+18 corrected)
+//
+// We can see that with more frequent process restarts,
+// -sck_restarts_per_day=5000, which means more all-new session IDs, we get
+// closer to the "random" cache key performance:
 //
-//
-// (4.21372e+18 corrected)
+// 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected)
 //
-//
-// we get
+// And with less frequent process restarts and re-opens,
+// -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision
+// probability:
 //
-//
-// (5.92931e+18 corrected)
+// 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected)
 //
 // Other tests have been run to validate other conditions behave as expected,
 // never behaving "worse than random" unless we start chopping off structured
 // data.
 //
-//
 // Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
 // that only arise when a new process is started, the chance of any cache key
 // collisions in a giant fleet of machines is negligible. Especially when
@@ -249,96 +299,66 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // quantify) block cache corruptions, including collisions, should be added.
 OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
                                        const std::string &db_session_id,
-                                       uint64_t file_number
-
-
-
-
-
-
-  // See also https://github.com/pdillinger/unique_id for background.
-  uint64_t session_upper = 0;  // Assignment to appease clang-analyze
-  uint64_t session_lower = 0;  // Assignment to appease clang-analyze
-  {
-    Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower);
-    if (!s.ok()) {
-      // A reasonable fallback in case malformed
-      Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper,
-               &session_lower);
-    }
-  }
-
-  // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy)
-  // for more global uniqueness entropy.
-  // (It is possible that many DBs descended from one common DB id are copied
-  // around and proliferate, in which case session id is critical, but it is
-  // more common for different DBs to have different DB ids.)
-  uint64_t db_hash = Hash64(db_id.data(), db_id.size(), session_upper);
-
-  // This establishes the db+session id part of the cache key.
-  //
-  // Exactly preserve (in common cases; see modifiers below) session lower to
-  // ensure that session ids generated during the same process lifetime are
-  // guaranteed unique.
-  //
-  // We put this first for CommonPrefixSlice(), so that a small-ish set of
-  // cache key prefixes to cover entries relevant to any DB.
-  session_etc64_ = session_lower;
-  // This provides extra entopy in case of different DB id or process
-  // generating a session id, but is also partly/variably obscured by
-  // file_number and offset (see below).
-  offset_etc64_ = db_hash;
-
-  // Into offset_etc64_ we are (eventually) going to pack & xor in an offset and
-  // a file_number, but we might need the file_number to overflow into
-  // session_etc64_. (There must only be one session_etc64_ value per
-  // file, and preferably shared among many files.)
-  //
-  // Figure out how many bytes of file_number we are going to be able to
-  // pack in with max_offset, though our encoding will only support packing
-  // in up to 3 bytes of file_number. (16M file numbers is enough for a new
-  // file number every second for half a year.)
-  int file_number_bytes_in_offset_etc =
-      (63 - FloorLog2(max_offset | 0x100000000U)) / 8;
-  int file_number_bits_in_offset_etc = file_number_bytes_in_offset_etc * 8;
+                                       uint64_t file_number) {
+  UniqueId64x2 internal_id;
+  Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
+                                    &internal_id, /*force=*/true);
+  assert(s.ok());
+  *this = FromInternalUniqueId(&internal_id);
+}

-
-
-
-  // Assert we couldn't have used a larger allowed number of bytes (shift
-  // would chop off bytes).
-  assert(file_number_bytes_in_offset_etc == 3 ||
-         (max_offset << (file_number_bits_in_offset_etc + 8) >>
-          (file_number_bits_in_offset_etc + 8)) != max_offset);
+OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) {
+  uint64_t session_lower = id.ptr[0];
+  uint64_t file_num_etc = id.ptr[1];

-
-
-
-  uint64_t offset_etc_modifier = ReverseBits(file_number & mask);
-  assert(offset_etc_modifier << file_number_bits_in_offset_etc == 0U);
+#ifndef NDEBUG
+  bool is_empty = session_lower == 0 && file_num_etc == 0;
+#endif

-  //
-
-
-
-  //
-
+  // Although DBImpl guarantees (in recent versions) that session_lower is not
+  // zero, that's not entirely sufficient to guarantee that file_num_etc64_ is
+  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
+  // However, if we are given an "empty" id as input, then we should produce
+  // "empty" as output.
+  // As a consequence, this function is only bijective assuming
+  // id[0] == 0 only if id[1] == 0.
+  if (session_lower == 0U) {
+    session_lower = file_num_etc;
+  }

-  //
-
-
+  // See comments above for how DownwardInvolution and ReverseBits
+  // make this function invertible under various assumptions.
+  OffsetableCacheKey rv;
+  rv.file_num_etc64_ =
+      DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc);
+  rv.offset_etc64_ = ReverseBits(session_lower);

-  //
-
-
+  // Because of these transformations and needing to allow arbitrary
+  // offset (thus, second 64 bits of cache key might be 0), we need to
+  // make some correction to ensure the first 64 bits is not 0.
+  // Fortunately, the transformation ensures the second 64 bits is not 0
+  // for non-empty base key, so we can swap in the case one is 0 without
+  // breaking bijectivity (assuming condition above).
+  assert(is_empty || rv.offset_etc64_ > 0);
+  if (rv.file_num_etc64_ == 0) {
+    std::swap(rv.file_num_etc64_, rv.offset_etc64_);
+  }
+  assert(is_empty || rv.file_num_etc64_ > 0);
+  return rv;
+}

-
-
-
-
-
+// Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if
+// offset_etc64 == 0)
+UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() {
+  uint64_t a = file_num_etc64_;
+  uint64_t b = offset_etc64_;
+  if (b == 0) {
+    std::swap(a, b);
   }
-
+  UniqueId64x2 rv;
+  rv[0] = ReverseBits(b);
+  rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0]));
+  return rv;
 }

 } // namespace ROCKSDB_NAMESPACE
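To see that the pair of functions above really is a round trip, here is a self-contained sketch mirroring the FromInternalUniqueId / ToInternalUniqueId logic, again with stand-in ReverseBits and DownwardInvolution rather than RocksDB's util/math.h code, and observing the stated precondition that id[0] == 0 only if id[1] == 0:

// Round-trip check of the base-cache-key encoding, including the swap that
// keeps the first 64-bit word nonzero for non-empty inputs.
#include <array>
#include <cassert>
#include <cstdint>
#include <utility>

using UniqueId64x2 = std::array<uint64_t, 2>;

// Same stand-ins as in the earlier sketch (both involutions).
uint64_t ReverseBits(uint64_t v) {
  uint64_t r = 0;
  for (int i = 0; i < 64; ++i, v >>= 1) r = (r << 1) | (v & 1);
  return r;
}
uint64_t DownwardInvolution(uint64_t v) {
  v ^= v >> 32;
  v ^= (v & 0xffff0000ffff0000ULL) >> 16;
  v ^= (v & 0xff00ff00ff00ff00ULL) >> 8;
  v ^= (v & 0xf0f0f0f0f0f0f0f0ULL) >> 4;
  v ^= (v & 0xccccccccccccccccULL) >> 2;
  v ^= (v & 0xaaaaaaaaaaaaaaaaULL) >> 1;
  return v;
}

struct BaseCacheKey {
  uint64_t file_num_etc64;
  uint64_t offset_etc64;
};

// Mirrors FromInternalUniqueId above.
BaseCacheKey FromId(const UniqueId64x2& id) {
  uint64_t session_lower = id[0];
  uint64_t file_num_etc = id[1];
  if (session_lower == 0) session_lower = file_num_etc;
  BaseCacheKey rv;
  rv.file_num_etc64 =
      DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc);
  rv.offset_etc64 = ReverseBits(session_lower);
  if (rv.file_num_etc64 == 0) std::swap(rv.file_num_etc64, rv.offset_etc64);
  return rv;
}

// Mirrors ToInternalUniqueId above: undo the swap, invert the involutions.
UniqueId64x2 ToId(const BaseCacheKey& key) {
  uint64_t a = key.file_num_etc64;
  uint64_t b = key.offset_etc64;
  if (b == 0) std::swap(a, b);
  UniqueId64x2 rv;
  rv[0] = ReverseBits(b);
  rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0]));
  return rv;
}

int main() {
  // Bijectivity holds when id[0] == 0 only if id[1] == 0, per the comments.
  UniqueId64x2 id = {0x0123456789abcdefULL, 42};
  UniqueId64x2 roundtrip = ToId(FromId(id));
  assert(roundtrip == id);
  return 0;
}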