@nxtedition/rocksdb 7.1.5 → 7.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/binding.cc +32 -14
  2. package/deps/rocksdb/rocksdb/cache/cache.cc +4 -0
  3. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +6 -8
  4. package/deps/rocksdb/rocksdb/cache/cache_key.cc +184 -164
  5. package/deps/rocksdb/rocksdb/cache/cache_key.h +38 -29
  6. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +4 -4
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +4 -2
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +11 -9
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +1 -1
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +28 -18
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +86 -17
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.h +48 -8
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +356 -153
  14. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +3 -7
  15. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +4 -5
  16. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +2 -3
  17. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +12 -4
  18. package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +69 -0
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +6 -1
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +4 -1
  21. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +222 -182
  22. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +239 -23
  23. package/deps/rocksdb/rocksdb/db/db_test2.cc +6 -2
  24. package/deps/rocksdb/rocksdb/db/event_helpers.cc +2 -1
  25. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +6 -0
  26. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +6 -0
  27. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +6 -0
  28. package/deps/rocksdb/rocksdb/db/kv_checksum.h +8 -4
  29. package/deps/rocksdb/rocksdb/db/memtable.cc +173 -33
  30. package/deps/rocksdb/rocksdb/db/memtable.h +10 -0
  31. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -1
  32. package/deps/rocksdb/rocksdb/db/version_set.cc +37 -18
  33. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -1
  34. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
  35. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +6 -0
  36. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +2 -0
  37. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +15 -0
  38. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +31 -6
  39. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +1 -1
  40. package/deps/rocksdb/rocksdb/options/cf_options.cc +4 -0
  41. package/deps/rocksdb/rocksdb/options/cf_options.h +4 -0
  42. package/deps/rocksdb/rocksdb/options/options_helper.cc +2 -0
  43. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -1
  44. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -6
  45. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +1 -0
  46. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +2 -4
  47. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -7
  48. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +2 -1
  49. package/deps/rocksdb/rocksdb/table/unique_id.cc +22 -24
  50. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +2 -1
  51. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +7 -0
  52. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +27 -3
  53. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +2 -1
  54. package/deps/rocksdb/rocksdb/util/async_file_reader.h +3 -3
  55. package/deps/rocksdb/rocksdb/util/coro_utils.h +2 -1
  56. package/deps/rocksdb/rocksdb/util/hash_test.cc +67 -0
  57. package/deps/rocksdb/rocksdb/util/math.h +41 -0
  58. package/deps/rocksdb/rocksdb/util/math128.h +6 -0
  59. package/deps/rocksdb/rocksdb/util/single_thread_executor.h +2 -1
  60. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +3 -6
  61. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +5 -0
  62. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_lock_manager.h +6 -0
  63. package/index.js +15 -6
  64. package/package.json +1 -1
  65. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  66. package/prebuilds/darwin-x64/node.napi.node +0 -0
  67. package/prebuilds/linux-x64/node.napi.node +0 -0
package/binding.cc CHANGED
@@ -1477,7 +1477,7 @@ NAPI_METHOD(batch_clear) {
 }
 
 NAPI_METHOD(batch_write) {
-  NAPI_ARGV(3);
+  NAPI_ARGV(4);
 
   Database* database;
   NAPI_STATUS_THROWS(napi_get_value_external(env, argv[0], reinterpret_cast<void**>(&database)));
@@ -1485,8 +1485,34 @@ NAPI_METHOD(batch_write) {
   rocksdb::WriteBatch* batch;
   NAPI_STATUS_THROWS(napi_get_value_external(env, argv[1], reinterpret_cast<void**>(&batch)));
 
-  rocksdb::WriteOptions writeOptions;
-  ROCKS_STATUS_THROWS_NAPI(database->db->Write(writeOptions, batch));
+  auto options = argv[2];
+  auto callback = argv[3];
+
+  std::optional<bool> sync;
+  NAPI_STATUS_THROWS(GetProperty(env, options, "sync", sync));
+
+  if (sync) {
+    napi_ref batchRef;
+    NAPI_STATUS_THROWS(napi_create_reference(env, argv[1], 1, &batchRef));
+
+    struct State {};
+    runAsync<State>(
+        "leveldown.batch.write", env, callback,
+        [=](auto& state) {
+          rocksdb::WriteOptions writeOptions;
+          writeOptions.sync = *sync;
+          return database->db->Write(writeOptions, batch);
+        },
+        [=](auto& state, auto env, auto& argv) { return napi_delete_reference(env, batchRef); });
+  } else {
+    rocksdb::WriteOptions writeOptions;
+    ROCKS_STATUS_THROWS_NAPI(database->db->Write(writeOptions, batch));
+
+    napi_value global;
+    NAPI_STATUS_THROWS(napi_get_global(env, &global));
+
+    NAPI_STATUS_THROWS(napi_call_function(env, global, callback, 0, nullptr, nullptr));
+  }
 
   return 0;
 }
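The new signature reads an optional `sync` flag from a JS options object through the package's `GetProperty` helper. As a hedged illustration only (the actual `GetProperty` overload in binding.cc is not shown in this hunk and may be templated differently), an equivalent helper written against plain N-API could look like the following; `GetBoolProperty` is an invented name.

// Illustrative sketch, not part of the diff: read an optional boolean
// property such as "sync" from a JS options object using core N-API calls.
#include <node_api.h>
#include <optional>

static napi_status GetBoolProperty(napi_env env, napi_value obj,
                                   const char* name, std::optional<bool>& out) {
  bool has = false;
  napi_status status = napi_has_named_property(env, obj, name, &has);
  if (status != napi_ok) return status;
  if (!has) {
    out.reset();  // property absent: leave the optional empty
    return napi_ok;
  }
  napi_value value;
  status = napi_get_named_property(env, obj, name, &value);
  if (status != napi_ok) return status;
  bool flag = false;
  status = napi_get_value_bool(env, value, &flag);
  if (status != napi_ok) return status;
  out = flag;  // property present: the binding then takes the async path
  return napi_ok;
}

Note that in the diff `if (sync)` tests whether the optional holds a value at all, so passing `sync: false` still routes the write through `runAsync` (with `WriteOptions::sync = false`), while omitting the option keeps the synchronous path that invokes the callback immediately.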
@@ -1563,10 +1589,7 @@ NAPI_METHOD(db_get_sorted_wal_files) {
   auto callback = argv[1];
 
   runAsync<rocksdb::VectorLogPtr>(
-      "leveldown.open", env, callback,
-      [=](auto& files) {
-        return database->db->GetSortedWalFiles(files);
-      },
+      "leveldown.open", env, callback, [=](auto& files) { return database->db->GetSortedWalFiles(files); },
       [=](auto& files, auto env, auto& argv) {
         argv.resize(2);
 
@@ -1620,13 +1643,8 @@ NAPI_METHOD(db_flush_wal) {
   auto callback = argv[2];
 
   runAsync<bool>(
-      "leveldown.open", env, callback,
-      [=](auto& state) {
-        return database->db->FlushWAL(sync);
-      },
-      [=](auto& state, auto env, auto& argv) {
-        return napi_ok;
-      });
+      "leveldown.flushWal", env, callback, [=](auto& state) { return database->db->FlushWAL(sync); },
+      [=](auto& state, auto env, auto& argv) { return napi_ok; });
 
   return 0;
 }
package/deps/rocksdb/rocksdb/cache/cache.cc CHANGED
@@ -33,6 +33,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
     {offsetof(struct LRUCacheOptions, high_pri_pool_ratio),
      OptionType::kDouble, OptionVerificationType::kNormal,
      OptionTypeFlags::kMutable}},
+    {"low_pri_pool_ratio",
+     {offsetof(struct LRUCacheOptions, low_pri_pool_ratio),
+      OptionType::kDouble, OptionVerificationType::kNormal,
+      OptionTypeFlags::kMutable}},
 };
 
 static std::unordered_map<std::string, OptionTypeInfo>
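The new entry exposes `low_pri_pool_ratio` through the options framework alongside `high_pri_pool_ratio`. A minimal sketch of setting it directly on `LRUCacheOptions` follows, assuming the struct field added in this diff and the `NewLRUCache(const LRUCacheOptions&)` overload declared in include/rocksdb/cache.h (also touched by this diff); the chosen values are arbitrary.

// Sketch only: configure the new low-priority pool on an LRU block cache.
#include <memory>
#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeCacheWithLowPriPool() {
  rocksdb::LRUCacheOptions opts;
  opts.capacity = 64 << 20;        // 64 MiB total capacity
  opts.num_shard_bits = 6;
  opts.strict_capacity_limit = false;
  opts.high_pri_pool_ratio = 0.5;  // share reserved for high-priority entries
  opts.low_pri_pool_ratio = 0.1;   // new option: share for low-priority entries
  return rocksdb::NewLRUCache(opts);
}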
package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc CHANGED
@@ -304,7 +304,9 @@ class CacheBench {
           FLAGS_cache_size, FLAGS_value_bytes, FLAGS_num_shard_bits,
           false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy);
     } else if (FLAGS_cache_type == "lru_cache") {
-      LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false, 0.5);
+      LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
+                           false /* strict_capacity_limit */,
+                           0.5 /* high_pri_pool_ratio */);
 #ifndef ROCKSDB_LITE
       if (!FLAGS_secondary_cache_uri.empty()) {
         Status s = SecondaryCache::CreateFromString(
@@ -806,7 +808,6 @@ class StressCacheKey {
 
     uint64_t max_file_count =
         uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
-    uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t{1024} * 1024U;
     uint32_t report_count = 0;
     uint32_t collisions_this_run = 0;
     size_t db_i = 0;
@@ -834,8 +835,7 @@ class StressCacheKey {
     }
     bool is_stable;
     BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
-                                       /* ignored */ 42, file_size, &ock,
-                                       &is_stable);
+                                       /* ignored */ 42, &ock, &is_stable);
     assert(is_stable);
     // Get a representative cache key, which later we analytically generalize
     // to a range.
@@ -845,13 +845,11 @@
       reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
     } else if (FLAGS_sck_footer_unique_id) {
       // Special case: keep only file number, not session counter
-      uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a;
-      uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
-      reduced_key = (uint64_t{a} << 32) + b;
+      reduced_key = DecodeFixed64(ck.AsSlice().data()) >> shift_away;
     } else {
       // Try to keep file number and session counter (shift away other bits)
       uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
-      uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
+      uint32_t b = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_b;
       reduced_key = (uint64_t{a} << 32) + b;
     }
     if (reduced_key == 0) {
package/deps/rocksdb/rocksdb/cache/cache_key.cc CHANGED
@@ -17,7 +17,7 @@ namespace ROCKSDB_NAMESPACE {
 
 // Value space plan for CacheKey:
 //
-// session_etc64_ | offset_etc64_ | Only generated by
+// file_num_etc64_ | offset_etc64_ | Only generated by
 // ---------------+---------------+------------------------------------------
 // 0              | 0             | Reserved for "empty" CacheKey()
 // 0              | > 0, < 1<<63  | CreateUniqueForCacheLifetime
@@ -44,7 +44,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
   return CacheKey(0, id);
 }
 
-// Value plan for CacheKeys from OffsetableCacheKey, assuming that
+// How we generate CacheKeys and base OffsetableCacheKey, assuming that
 // db_session_ids are generated from a base_session_id and
 // session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId
 // in DBImpl::GenerateDbSessionId):
@@ -56,63 +56,108 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //   base_session_id (unstructured, from GenerateRawUniqueId)
 //   session_id_counter (structured)
 //     * usually much smaller than 2**24
-//   file_number (structured)
+//   orig_file_number (structured)
 //     * usually smaller than 2**24
 //   offset_in_file (structured, might skip lots of values)
 //     * usually smaller than 2**32
-//   max_offset determines placement of file_number to prevent
-//   overlapping with offset
 //
-// Outputs come from bitwise-xor of the constituent pieces, low bits on left:
-//
-// |------------------------- session_etc64 -------------------------|
-// | +++++++++++++++ base_session_id (lower 64 bits) +++++++++++++++ |
+// Overall approach (see https://github.com/pdillinger/unique_id for
+// background):
+//
+// First, we have three "structured" values, up to 64 bits each, that we
+// need to fit, without losses, into 128 bits. In practice, the values will
+// be small enough that they should fit. For example, applications generating
+// large SST files (large offsets) will naturally produce fewer files (small
+// file numbers). But we don't know ahead of time what bounds the values will
+// have.
+//
+// Second, we have unstructured inputs that enable distinct RocksDB processes
+// to pick a random point in space, likely very different from others. Xoring
+// the structured with the unstructured give us a cache key that is
+// structurally distinct between related keys (e.g. same file or same RocksDB
+// process) and distinct with high probability between unrelated keys.
+//
+// The problem of packing three structured values into the space for two is
+// complicated by the fact that we want to derive cache keys from SST unique
+// IDs, which have already combined structured and unstructured inputs in a
+// practically inseparable way. And we want a base cache key that works
+// with an offset of any size. So basically, we need to encode these three
+// structured values, each up to 64 bits, into 128 bits without knowing any
+// of their sizes. The DownwardInvolution() function gives us a mechanism to
+// accomplish this. (See its properties in math.h.) Specifically, for inputs
+// a, b, and c:
+//   lower64 = DownwardInvolution(a) ^ ReverseBits(b);
+//   upper64 = c ^ ReverseBits(a);
+// The 128-bit output is unique assuming there exist some i, j, and k
+// where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and
+// i + j + k <= 128. In other words, as long as there exist some bounds
+// that would allow us to pack the bits of a, b, and c into the output
+// if we know the bound, we can generate unique outputs without knowing
+// those bounds. To validate this claim, the inversion function (given
+// the bounds) has been implemented in CacheKeyDecoder in
+// db_block_cache_test.cc.
+//
+// With that in mind, the outputs in terms of the conceptual inputs look
+// like this, using bitwise-xor of the constituent pieces, low bits on left:
+//
+// |------------------------- file_num_etc64 -------------------------|
+// | +++++++++ base_session_id (lower 64 bits, involution) +++++++++ |
+// |-----------------------------------------------------------------|
+// | session_id_counter (involution) ..... |                         |
 // |-----------------------------------------------------------------|
-// | session_id_counter ...|                                          |
+// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
+// |  * base_session_id (upper ~39 bits)                             |
+// |  * db_id (~122 bits entropy)                                    |
 // |-----------------------------------------------------------------|
-// |                                  | ... file_number               |
-// |                                  | overflow & meta               |
+// |                            | ..... orig_file_number (reversed)  |
 // |-----------------------------------------------------------------|
 //
 //
 // |------------------------- offset_etc64 --------------------------|
-// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
-// |  * base_session_id (upper ~39 bits)                             |
-// |  * db_id (~122 bits entropy)                                    |
+// | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++ |
 // |-----------------------------------------------------------------|
-// | offset_in_file ............... |                                 |
+// |                           | ..... session_id_counter (reversed) |
 // |-----------------------------------------------------------------|
-// |                                  | file_number, 0-3              |
-// |                                  | lower bytes                   |
+// | offset_in_file ............... |                                |
 // |-----------------------------------------------------------------|
 //
-// Based on max_offset, a maximal number of bytes 0..3 is chosen for
-// including from lower bits of file_number in offset_etc64. The choice
-// is encoded in two bits of metadata going into session_etc64, though
-// the common case of 3 bytes is encoded as 0 so that session_etc64
-// is unmodified by file_number concerns in the common case.
-//
-// There is nothing preventing "file number overflow & meta" from meeting
-// and overlapping with session_id_counter, but reaching such a case requires
-// an intractable combination of large file offsets (thus at least some large
-// files), large file numbers (thus large number of files generated), and
-// large number of session IDs generated in a single process. A trillion each
-// (2**40) of session ids, offsets, and file numbers comes to 120 bits.
-// With two bits of metadata and byte granularity, this is on the verge of
-// overlap, but even in the overlap case, it doesn't seem likely that
-// a file from billions of files or session ids ago will still be live
-// or cached.
-//
-// In fact, if our SST files are all < 4TB (see
-// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated
-// in a single process are guaranteed to have unique cache keys, unless/until
-// number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in
-// a single process and 64 trillion files generated. Even at that point, to
-// see a collision we would need a miraculous re-synchronization of session
-// id and file number, along with a live file or stale cache entry from
-// trillions of files ago.
-//
-// How https://github.com/pdillinger/unique_id applies here:
+// Some oddities or inconveniences of this layout are due to deriving
+// the "base" cache key (without offset) from the SST unique ID (see
+// GetSstInternalUniqueId). Specifically,
+// * Lower 64 of base_session_id occurs in both output words (ok but
+//   weird)
+// * The inclusion of db_id is bad for the conditions under which we
+//   can guarantee uniqueness, but could be useful in some cases with
+//   few small files per process, to make up for db session id only having
+//   ~103 bits of entropy.
+//
+// In fact, if DB ids were not involved, we would be guaranteed unique
+// cache keys for files generated in a single process until total bits for
+// biggest session_id_counter, orig_file_number, and offset_in_file
+// reach 128 bits.
+//
+// With the DB id limitation, we only have nice guaranteed unique cache
+// keys for files generated in a single process until biggest
+// session_id_counter and offset_in_file reach combined 64 bits. This
+// is quite good in practice because we can have millions of DB Opens
+// with terabyte size SST files, or billions of DB Opens with gigabyte
+// size SST files.
+//
+// One of the considerations in the translation between existing SST unique
+// IDs and base cache keys is supporting better SST unique IDs in a future
+// format_version. If we use a process-wide file counter instead of
+// session counter and file numbers, we only need to combine two 64-bit values
+// instead of three. But we don't want to track unique ID versions in the
+// manifest, so we want to keep the same translation layer between SST unique
+// IDs and base cache keys, even with updated SST unique IDs. If the new
+// unique IDs put the file counter where the orig_file_number was, and
+// use no structured field where session_id_counter was, then our translation
+// layer works fine for two structured fields as well as three (for
+// compatibility). The small computation for the translation (one
+// DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep)
+// is negligible for computing as part of SST file reader open.
+//
+// More on how https://github.com/pdillinger/unique_id applies here:
 // Every bit of output always includes "unstructured" uniqueness bits and
 // often combines with "structured" uniqueness bits. The "unstructured" bits
 // change infrequently: only when we cannot guarantee our state tracking for
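The uniqueness argument above relies on xor-combining values whose significant bits land in disjoint ranges once one value is bit-reversed. The self-contained sketch below illustrates the two-value special case using only ReverseBits; the real code additionally uses DownwardInvolution from util/math.h (also changed in this diff) to fit a third value. PackTwo/UnpackTwo are invented names for illustration.

// Simplified illustration (not RocksDB code): pack a < 2**i and b < 2**j,
// with i + j <= 64, into one 64-bit word, and recover them given the bounds.
// a keeps the low bits; ReverseBits(b) occupies the high bits; xor concatenates.
#include <cassert>
#include <cstdint>
#include <utility>

static uint64_t ReverseBits(uint64_t v) {
  v = ((v >> 1) & 0x5555555555555555ULL) | ((v & 0x5555555555555555ULL) << 1);
  v = ((v >> 2) & 0x3333333333333333ULL) | ((v & 0x3333333333333333ULL) << 2);
  v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((v & 0x0F0F0F0F0F0F0F0FULL) << 4);
  v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8);
  v = ((v >> 16) & 0x0000FFFF0000FFFFULL) | ((v & 0x0000FFFF0000FFFFULL) << 16);
  return (v >> 32) | (v << 32);
}

static uint64_t PackTwo(uint64_t a, uint64_t b) { return a ^ ReverseBits(b); }

// Decoding needs the bounds, given here as bit counts i and j with i + j <= 64.
static std::pair<uint64_t, uint64_t> UnpackTwo(uint64_t x, int i, int j) {
  assert(i + j <= 64);
  uint64_t low_mask = (i >= 64) ? ~uint64_t{0} : ((uint64_t{1} << i) - 1);
  uint64_t a = x & low_mask;               // low i bits are untouched by b
  uint64_t b = ReverseBits(x ^ a);         // remaining high bits encode b
  return {a, b};
}

int main() {
  uint64_t a = 0x123456;      // e.g. a file number, < 2**24
  uint64_t b = 0x9abcdef012;  // e.g. an offset, < 2**40
  auto [a2, b2] = UnpackTwo(PackTwo(a, b), 24, 40);
  assert(a2 == a && b2 == b);
  return 0;
}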
@@ -141,12 +186,11 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //   128 bits cache key size
 //   - 55 <- ideal size for byte offsets + file numbers
 //   -  2 <- bits for offsets and file numbers not exactly powers of two
-//   -  2 <- bits for file number encoding metadata
 //   +  2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey
 //   ----
-//     71 <- bits remaining for distinguishing session IDs
-// The probability of a collision in 71 bits of session ID data is less than
-// 1 in 2**(71 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
+//     73 <- bits remaining for distinguishing session IDs
+// The probability of a collision in 73 bits of session ID data is less than
+// 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
 // data from the last 180 days is in cache for potential collision, and that
 // cache keys under each session id exhaustively cover the remaining 57 bits
 // while in reality they'll only cover a small fraction of it.
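For reference, the updated budget works out as 128 - 55 - 2 + 2 = 73 bits left for session IDs; the two extra bits relative to the previous 71 come from dropping the "bits for file number encoding metadata" term. The quoted bound is then 2**(73 - (2 * 16)) = 2**41, about 2.2 * 10**12, matching the "roughly 1 in a trillion" wording in the comment.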
@@ -160,7 +204,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // Now suppose we have many DBs per host, say 2**10, with same host-wide write
 // rate and process/session lifetime. File numbers will be ~10 bits smaller
 // and we will have 2**10 times as many session IDs because of simultaneous
-// lifetimes. So now collision chance is less than 1 in 2**(81 - (2 * 26)),
+// lifetimes. So now collision chance is less than 1 in 2**(83 - (2 * 26)),
 // or roughly 1 in a billion.
 //
 // Suppose instead we generated random or hashed cache keys for each
@@ -176,17 +220,17 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // activity over many months, by making some pessimistic simplifying
 // assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
 // Here is some sample output with
-// `./cache_bench -stress_cache_key -sck_keep_bits=40`:
+// `./cache_bench -stress_cache_key -sck_keep_bits=43`:
 //
 // Total cache or DBs size: 32TiB  Writing 925.926 MiB/s or 76.2939TiB/day
-// Multiply by 9.22337e+18 to correct for simulation losses (but still
+// Multiply by 1.15292e+18 to correct for simulation losses (but still
 // assume whole file cached)
 //
 // These come from default settings of 2.5M files per day of 32 MB each, and
-// `-sck_keep_bits=40` means that to represent a single file, we are only
-// keeping 40 bits of the 128-bit (base) cache key. With file size of 2**25
-// contiguous keys (pessimistic), our simulation is about 2**(128-40-25) or
-// about 9 billion billion times more prone to collision than reality.
+// `-sck_keep_bits=43` means that to represent a single file, we are only
+// keeping 43 bits of the 128-bit (base) cache key. With file size of 2**25
+// contiguous keys (pessimistic), our simulation is about 2**(128-43-25) or
+// about 1 billion billion times more prone to collision than reality.
 //
 // More default assumptions, relatively pessimistic:
 //   * 100 DBs in same process (doesn't matter much)
@@ -194,49 +238,55 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //     average every 100 files generated
 //   * Restart process (all new session IDs unrelated to old) 24 times per day
 //
-// After enough data, we get a result at the end (-sck_keep_bits=40):
+// After enough data, we get a result at the end (-sck_keep_bits=43):
 //
-// (keep 40 bits)  17 collisions after 2 x 90 days, est 10.5882 days between
-// (9.76592e+19 corrected)
+// (keep 43 bits)  18 collisions after 2 x 90 days, est 10 days between
+// (1.15292e+19 corrected)
 //
 // If we believe the (pessimistic) simulation and the mathematical
-// extrapolation, we would need to run a billion machines all for 97 billion
+// extrapolation, we would need to run a billion machines all for 11 billion
 // days to expect a cache key collision. To help verify that our extrapolation
-// ("corrected") is robust, we can make our simulation more precise with
-// `-sck_keep_bits=41` and `42`, which takes more running time to get enough
+// ("corrected") is robust, we can make our simulation more precise by
+// increasing the "keep" bits, which takes more running time to get enough
 // collision data:
 //
-// (keep 41 bits)  16 collisions after 4 x 90 days, est 22.5 days between
-// (1.03763e+20 corrected)
-// (keep 42 bits)  19 collisions after 10 x 90 days, est 47.3684 days between
-// (1.09224e+20 corrected)
+// (keep 44 bits)  16 collisions after 5 x 90 days, est 28.125 days between
+// (1.6213e+19 corrected)
+// (keep 45 bits)  15 collisions after 7 x 90 days, est 42 days between
+// (1.21057e+19 corrected)
+// (keep 46 bits)  15 collisions after 17 x 90 days, est 102 days between
+// (1.46997e+19 corrected)
+// (keep 47 bits)  15 collisions after 49 x 90 days, est 294 days between
+// (2.11849e+19 corrected)
 //
-// The extrapolated prediction is very close. If anything, we might have some
-// very small losses of structured data (see class StressCacheKey in
-// cache_bench_tool.cc) leading to more accurate & more attractive prediction
-// with more bits kept.
+// The extrapolated prediction seems to be within noise (sampling error).
 //
 // With the `-sck_randomize` option, we can see that typical workloads like
 // above have lower collision probability than "random" cache keys (note:
-// offsets still non-randomized) by a modest amount (roughly 20x less collision
-// prone than random), which should make us reasonably comfortable even in
-// "degenerate" cases (e.g. repeatedly launch a process to generate 1 file
-// with SstFileWriter):
+// offsets still non-randomized) by a modest amount (roughly 2-3x less
+// collision prone than random), which should make us reasonably comfortable
+// even in "degenerate" cases (e.g. repeatedly launch a process to generate
+// one file with SstFileWriter):
+//
+// (rand 43 bits)  22 collisions after 1 x 90 days, est 4.09091 days between
+// (4.7165e+18 corrected)
+//
+// We can see that with more frequent process restarts,
+// -sck_restarts_per_day=5000, which means more all-new session IDs, we get
+// closer to the "random" cache key performance:
 //
-// (rand 40 bits)  197 collisions after 1 x 90 days, est 0.456853 days between
-// (4.21372e+18 corrected)
+// 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected)
 //
-// We can see that with more frequent process restarts (all new session IDs),
-// we get closer to the "random" cache key performance:
+// And with less frequent process restarts and re-opens,
+// -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision
+// probability:
 //
-// (-sck_restarts_per_day=5000): 140 collisions after 1 x 90 days, ...
-// (5.92931e+18 corrected)
+// 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected)
 //
 // Other tests have been run to validate other conditions behave as expected,
 // never behaving "worse than random" unless we start chopping off structured
 // data.
 //
-//
 // Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
 // that only arise when a new process is started, the chance of any cache key
 // collisions in a giant fleet of machines is negligible. Especially when
@@ -249,96 +299,66 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // quantify) block cache corruptions, including collisions, should be added.
 OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
                                        const std::string &db_session_id,
-                                       uint64_t file_number,
-                                       uint64_t max_offset) {
-#ifndef NDEBUG
-  max_offset_ = max_offset;
-#endif
-  // Closely related to GetSstInternalUniqueId, but only need 128 bits and
-  // need to include an offset within the file.
-  // See also https://github.com/pdillinger/unique_id for background.
-  uint64_t session_upper = 0;  // Assignment to appease clang-analyze
-  uint64_t session_lower = 0;  // Assignment to appease clang-analyze
-  {
-    Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower);
-    if (!s.ok()) {
-      // A reasonable fallback in case malformed
-      Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper,
-               &session_lower);
-    }
-  }
-
-  // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy)
-  // for more global uniqueness entropy.
-  // (It is possible that many DBs descended from one common DB id are copied
-  // around and proliferate, in which case session id is critical, but it is
-  // more common for different DBs to have different DB ids.)
-  uint64_t db_hash = Hash64(db_id.data(), db_id.size(), session_upper);
-
-  // This establishes the db+session id part of the cache key.
-  //
-  // Exactly preserve (in common cases; see modifiers below) session lower to
-  // ensure that session ids generated during the same process lifetime are
-  // guaranteed unique.
-  //
-  // We put this first for CommonPrefixSlice(), so that a small-ish set of
-  // cache key prefixes to cover entries relevant to any DB.
-  session_etc64_ = session_lower;
-  // This provides extra entopy in case of different DB id or process
-  // generating a session id, but is also partly/variably obscured by
-  // file_number and offset (see below).
-  offset_etc64_ = db_hash;
-
-  // Into offset_etc64_ we are (eventually) going to pack & xor in an offset and
-  // a file_number, but we might need the file_number to overflow into
-  // session_etc64_. (There must only be one session_etc64_ value per
-  // file, and preferably shared among many files.)
-  //
-  // Figure out how many bytes of file_number we are going to be able to
-  // pack in with max_offset, though our encoding will only support packing
-  // in up to 3 bytes of file_number. (16M file numbers is enough for a new
-  // file number every second for half a year.)
-  int file_number_bytes_in_offset_etc =
-      (63 - FloorLog2(max_offset | 0x100000000U)) / 8;
-  int file_number_bits_in_offset_etc = file_number_bytes_in_offset_etc * 8;
+                                       uint64_t file_number) {
+  UniqueId64x2 internal_id;
+  Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
+                                    &internal_id, /*force=*/true);
+  assert(s.ok());
+  *this = FromInternalUniqueId(&internal_id);
+}
 
-  // Assert two bits of metadata
-  assert(file_number_bytes_in_offset_etc >= 0 &&
-         file_number_bytes_in_offset_etc <= 3);
-  // Assert we couldn't have used a larger allowed number of bytes (shift
-  // would chop off bytes).
-  assert(file_number_bytes_in_offset_etc == 3 ||
-         (max_offset << (file_number_bits_in_offset_etc + 8) >>
-          (file_number_bits_in_offset_etc + 8)) != max_offset);
+OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) {
+  uint64_t session_lower = id.ptr[0];
+  uint64_t file_num_etc = id.ptr[1];
 
-  uint64_t mask = (uint64_t{1} << (file_number_bits_in_offset_etc)) - 1;
-  // Pack into high bits of etc so that offset can go in low bits of etc
-  // TODO: could be EndianSwapValue?
-  uint64_t offset_etc_modifier = ReverseBits(file_number & mask);
-  assert(offset_etc_modifier << file_number_bits_in_offset_etc == 0U);
+#ifndef NDEBUG
+  bool is_empty = session_lower == 0 && file_num_etc == 0;
+#endif
 
-  // Overflow and 3 - byte count (likely both zero) go into session_id part
-  uint64_t session_etc_modifier =
-      (file_number >> file_number_bits_in_offset_etc << 2) |
-      static_cast<uint64_t>(3 - file_number_bytes_in_offset_etc);
-  // Packed into high bits to minimize interference with session id counter.
-  session_etc_modifier = ReverseBits(session_etc_modifier);
+  // Although DBImpl guarantees (in recent versions) that session_lower is not
+  // zero, that's not entirely sufficient to guarantee that file_num_etc64_ is
+  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
+  // However, if we are given an "empty" id as input, then we should produce
+  // "empty" as output.
+  // As a consequence, this function is only bijective assuming
+  // id[0] == 0 only if id[1] == 0.
+  if (session_lower == 0U) {
+    session_lower = file_num_etc;
+  }
 
-  // Assert session_id part is only modified in extreme cases
-  assert(session_etc_modifier == 0 || file_number > /*3 bytes*/ 0xffffffU ||
-         max_offset > /*5 bytes*/ 0xffffffffffU);
+  // See comments above for how DownwardInvolution and ReverseBits
+  // make this function invertible under various assumptions.
+  OffsetableCacheKey rv;
+  rv.file_num_etc64_ =
+      DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc);
+  rv.offset_etc64_ = ReverseBits(session_lower);
 
-  // Xor in the modifiers
-  session_etc64_ ^= session_etc_modifier;
-  offset_etc64_ ^= offset_etc_modifier;
+  // Because of these transformations and needing to allow arbitrary
+  // offset (thus, second 64 bits of cache key might be 0), we need to
+  // make some correction to ensure the first 64 bits is not 0.
+  // Fortunately, the transformation ensures the second 64 bits is not 0
+  // for non-empty base key, so we can swap in the case one is 0 without
+  // breaking bijectivity (assuming condition above).
+  assert(is_empty || rv.offset_etc64_ > 0);
+  if (rv.file_num_etc64_ == 0) {
+    std::swap(rv.file_num_etc64_, rv.offset_etc64_);
+  }
+  assert(is_empty || rv.file_num_etc64_ > 0);
+  return rv;
+}
 
-  // Although DBImpl guarantees (in recent versions) that session_lower is not
-  // zero, that's not entirely sufficient to guarantee that session_etc64_ is
-  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
-  if (session_etc64_ == 0U) {
-    session_etc64_ = session_upper | 1U;
+// Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if
+// offset_etc64 == 0)
+UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() {
+  uint64_t a = file_num_etc64_;
+  uint64_t b = offset_etc64_;
+  if (b == 0) {
+    std::swap(a, b);
   }
-  assert(session_etc64_ != 0);
+  UniqueId64x2 rv;
+  rv[0] = ReverseBits(b);
+  rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0]));
+  return rv;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
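FromInternalUniqueId and ToInternalUniqueId are inverses as long as the second word is nonzero for every non-empty input, which is what makes the swap-when-zero correction safe. The standalone sketch below exercises only those swap mechanics; it substitutes the identity function for DownwardInvolution (so it does not demonstrate the bounded-packing guarantee) and all names are invented for illustration.

// Toy round-trip check for the zero-avoidance swap (not RocksDB code).
// D() stands in for DownwardInvolution; using the identity keeps the
// swap/inversion mechanics intact but drops the packing properties.
#include <cassert>
#include <cstdint>
#include <utility>

static uint64_t ReverseBits(uint64_t v) {
  v = ((v >> 1) & 0x5555555555555555ULL) | ((v & 0x5555555555555555ULL) << 1);
  v = ((v >> 2) & 0x3333333333333333ULL) | ((v & 0x3333333333333333ULL) << 2);
  v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((v & 0x0F0F0F0F0F0F0F0FULL) << 4);
  v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8);
  v = ((v >> 16) & 0x0000FFFF0000FFFFULL) | ((v & 0x0000FFFF0000FFFFULL) << 16);
  return (v >> 32) | (v << 32);
}

static uint64_t D(uint64_t v) { return v; }  // placeholder for DownwardInvolution

struct Key { uint64_t file_num_etc64, offset_etc64; };

static Key Encode(uint64_t session_lower, uint64_t file_num_etc) {
  if (session_lower == 0) session_lower = file_num_etc;  // as in the diff
  Key k{D(session_lower) ^ ReverseBits(file_num_etc), ReverseBits(session_lower)};
  if (k.file_num_etc64 == 0) std::swap(k.file_num_etc64, k.offset_etc64);
  return k;
}

static std::pair<uint64_t, uint64_t> Decode(Key k) {
  uint64_t a = k.file_num_etc64, b = k.offset_etc64;
  if (b == 0) std::swap(a, b);  // undo the zero-avoidance swap
  uint64_t session_lower = ReverseBits(b);
  uint64_t file_num_etc = ReverseBits(a ^ D(session_lower));
  return {session_lower, file_num_etc};
}

int main() {
  // A case that triggers the swap: first word comes out as zero.
  uint64_t s = 0x1234, f = ReverseBits(s);
  auto [s2, f2] = Decode(Encode(s, f));
  assert(s2 == s && f2 == f);
  // An ordinary case with no swap.
  auto [s3, f3] = Decode(Encode(0xabcdef, 42));
  assert(s3 == 0xabcdef && f3 == 42);
  return 0;
}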