@nxtedition/rocksdb 7.1.5 → 7.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/binding.cc +32 -14
  2. package/deps/rocksdb/rocksdb/cache/cache.cc +4 -0
  3. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +6 -8
  4. package/deps/rocksdb/rocksdb/cache/cache_key.cc +184 -164
  5. package/deps/rocksdb/rocksdb/cache/cache_key.h +38 -29
  6. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +4 -4
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +4 -2
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +11 -9
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +1 -1
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +28 -18
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +86 -17
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.h +48 -8
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +356 -153
  14. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +3 -7
  15. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +4 -5
  16. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +2 -3
  17. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +12 -4
  18. package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +69 -0
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +6 -1
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +4 -1
  21. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +222 -182
  22. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +239 -23
  23. package/deps/rocksdb/rocksdb/db/db_test2.cc +6 -2
  24. package/deps/rocksdb/rocksdb/db/event_helpers.cc +2 -1
  25. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +6 -0
  26. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +6 -0
  27. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +6 -0
  28. package/deps/rocksdb/rocksdb/db/kv_checksum.h +8 -4
  29. package/deps/rocksdb/rocksdb/db/memtable.cc +173 -33
  30. package/deps/rocksdb/rocksdb/db/memtable.h +10 -0
  31. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -1
  32. package/deps/rocksdb/rocksdb/db/version_set.cc +37 -18
  33. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -1
  34. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
  35. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +6 -0
  36. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +2 -0
  37. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +15 -0
  38. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +31 -6
  39. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +1 -1
  40. package/deps/rocksdb/rocksdb/options/cf_options.cc +4 -0
  41. package/deps/rocksdb/rocksdb/options/cf_options.h +4 -0
  42. package/deps/rocksdb/rocksdb/options/options_helper.cc +2 -0
  43. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +2 -1
  44. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -6
  45. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +1 -0
  46. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +2 -4
  47. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -7
  48. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +2 -1
  49. package/deps/rocksdb/rocksdb/table/unique_id.cc +22 -24
  50. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +2 -1
  51. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +7 -0
  52. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +27 -3
  53. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +2 -1
  54. package/deps/rocksdb/rocksdb/util/async_file_reader.h +3 -3
  55. package/deps/rocksdb/rocksdb/util/coro_utils.h +2 -1
  56. package/deps/rocksdb/rocksdb/util/hash_test.cc +67 -0
  57. package/deps/rocksdb/rocksdb/util/math.h +41 -0
  58. package/deps/rocksdb/rocksdb/util/math128.h +6 -0
  59. package/deps/rocksdb/rocksdb/util/single_thread_executor.h +2 -1
  60. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +3 -6
  61. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +5 -0
  62. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_lock_manager.h +6 -0
  63. package/index.js +15 -6
  64. package/package.json +1 -1
  65. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  66. package/prebuilds/darwin-x64/node.napi.node +0 -0
  67. package/prebuilds/linux-x64/node.napi.node +0 -0
package/binding.cc CHANGED
@@ -1477,7 +1477,7 @@ NAPI_METHOD(batch_clear) {
 }
 
 NAPI_METHOD(batch_write) {
-  NAPI_ARGV(3);
+  NAPI_ARGV(4);
 
   Database* database;
   NAPI_STATUS_THROWS(napi_get_value_external(env, argv[0], reinterpret_cast<void**>(&database)));
@@ -1485,8 +1485,34 @@ NAPI_METHOD(batch_write) {
   rocksdb::WriteBatch* batch;
   NAPI_STATUS_THROWS(napi_get_value_external(env, argv[1], reinterpret_cast<void**>(&batch)));
 
-  rocksdb::WriteOptions writeOptions;
-  ROCKS_STATUS_THROWS_NAPI(database->db->Write(writeOptions, batch));
+  auto options = argv[2];
+  auto callback = argv[3];
+
+  std::optional<bool> sync;
+  NAPI_STATUS_THROWS(GetProperty(env, options, "sync", sync));
+
+  if (sync) {
+    napi_ref batchRef;
+    NAPI_STATUS_THROWS(napi_create_reference(env, argv[1], 1, &batchRef));
+
+    struct State {};
+    runAsync<State>(
+        "leveldown.batch.write", env, callback,
+        [=](auto& state) {
+          rocksdb::WriteOptions writeOptions;
+          writeOptions.sync = *sync;
+          return database->db->Write(writeOptions, batch);
+        },
+        [=](auto& state, auto env, auto& argv) { return napi_delete_reference(env, batchRef); });
+  } else {
+    rocksdb::WriteOptions writeOptions;
+    ROCKS_STATUS_THROWS_NAPI(database->db->Write(writeOptions, batch));
+
+    napi_value global;
+    NAPI_STATUS_THROWS(napi_get_global(env, &global));
+
+    NAPI_STATUS_THROWS(napi_call_function(env, global, callback, 0, nullptr, nullptr));
+  }
 
   return 0;
 }
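The new signature reads an optional `sync` flag from a JS options object through the package's `GetProperty` helper. As a hedged illustration only (the actual `GetProperty` overload in binding.cc is not shown in this hunk and may be templated differently), an equivalent helper written against plain N-API could look like the following; `GetBoolProperty` is an invented name.

// Illustrative sketch, not part of the diff: read an optional boolean
// property such as "sync" from a JS options object using core N-API calls.
#include <node_api.h>
#include <optional>

static napi_status GetBoolProperty(napi_env env, napi_value obj,
                                   const char* name, std::optional<bool>& out) {
  bool has = false;
  napi_status status = napi_has_named_property(env, obj, name, &has);
  if (status != napi_ok) return status;
  if (!has) {
    out.reset();  // property absent: leave the optional empty
    return napi_ok;
  }
  napi_value value;
  status = napi_get_named_property(env, obj, name, &value);
  if (status != napi_ok) return status;
  bool flag = false;
  status = napi_get_value_bool(env, value, &flag);
  if (status != napi_ok) return status;
  out = flag;  // property present: the binding then takes the async path
  return napi_ok;
}

Note that in the diff `if (sync)` tests whether the optional holds a value at all, so passing `sync: false` still routes the write through `runAsync` (with `WriteOptions::sync = false`), while omitting the option keeps the synchronous path that invokes the callback immediately.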
@@ -1563,10 +1589,7 @@ NAPI_METHOD(db_get_sorted_wal_files) {
   auto callback = argv[1];
 
   runAsync<rocksdb::VectorLogPtr>(
-      "leveldown.open", env, callback,
-      [=](auto& files) {
-        return database->db->GetSortedWalFiles(files);
-      },
+      "leveldown.open", env, callback, [=](auto& files) { return database->db->GetSortedWalFiles(files); },
       [=](auto& files, auto env, auto& argv) {
         argv.resize(2);
 
@@ -1620,13 +1643,8 @@ NAPI_METHOD(db_flush_wal) {
   auto callback = argv[2];
 
   runAsync<bool>(
-      "leveldown.open", env, callback,
-      [=](auto& state) {
-        return database->db->FlushWAL(sync);
-      },
-      [=](auto& state, auto env, auto& argv) {
-        return napi_ok;
-      });
+      "leveldown.flushWal", env, callback, [=](auto& state) { return database->db->FlushWAL(sync); },
+      [=](auto& state, auto env, auto& argv) { return napi_ok; });
 
   return 0;
 }
package/deps/rocksdb/rocksdb/cache/cache.cc CHANGED
@@ -33,6 +33,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
     {offsetof(struct LRUCacheOptions, high_pri_pool_ratio),
      OptionType::kDouble, OptionVerificationType::kNormal,
      OptionTypeFlags::kMutable}},
+    {"low_pri_pool_ratio",
+     {offsetof(struct LRUCacheOptions, low_pri_pool_ratio),
+      OptionType::kDouble, OptionVerificationType::kNormal,
+      OptionTypeFlags::kMutable}},
 };
 
 static std::unordered_map<std::string, OptionTypeInfo>
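The new entry exposes `low_pri_pool_ratio` through the options framework alongside `high_pri_pool_ratio`. A minimal sketch of setting it directly on `LRUCacheOptions` follows, assuming the struct field added in this diff and the `NewLRUCache(const LRUCacheOptions&)` overload declared in include/rocksdb/cache.h (also touched by this diff); the chosen values are arbitrary.

// Sketch only: configure the new low-priority pool on an LRU block cache.
#include <memory>
#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeCacheWithLowPriPool() {
  rocksdb::LRUCacheOptions opts;
  opts.capacity = 64 << 20;        // 64 MiB total capacity
  opts.num_shard_bits = 6;
  opts.strict_capacity_limit = false;
  opts.high_pri_pool_ratio = 0.5;  // share reserved for high-priority entries
  opts.low_pri_pool_ratio = 0.1;   // new option: share for low-priority entries
  return rocksdb::NewLRUCache(opts);
}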
package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc CHANGED
@@ -304,7 +304,9 @@ class CacheBench {
           FLAGS_cache_size, FLAGS_value_bytes, FLAGS_num_shard_bits,
           false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy);
     } else if (FLAGS_cache_type == "lru_cache") {
-      LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false, 0.5);
+      LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
+                           false /* strict_capacity_limit */,
+                           0.5 /* high_pri_pool_ratio */);
 #ifndef ROCKSDB_LITE
       if (!FLAGS_secondary_cache_uri.empty()) {
         Status s = SecondaryCache::CreateFromString(
@@ -806,7 +808,6 @@ class StressCacheKey {
 
     uint64_t max_file_count =
         uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
-    uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t{1024} * 1024U;
     uint32_t report_count = 0;
     uint32_t collisions_this_run = 0;
     size_t db_i = 0;
@@ -834,8 +835,7 @@ class StressCacheKey {
     }
     bool is_stable;
     BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
-                                       /* ignored */ 42, file_size, &ock,
-                                       &is_stable);
+                                       /* ignored */ 42, &ock, &is_stable);
     assert(is_stable);
     // Get a representative cache key, which later we analytically generalize
     // to a range.
@@ -845,13 +845,11 @@
       reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
     } else if (FLAGS_sck_footer_unique_id) {
       // Special case: keep only file number, not session counter
-      uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a;
-      uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
-      reduced_key = (uint64_t{a} << 32) + b;
+      reduced_key = DecodeFixed64(ck.AsSlice().data()) >> shift_away;
     } else {
       // Try to keep file number and session counter (shift away other bits)
       uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
-      uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
+      uint32_t b = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_b;
       reduced_key = (uint64_t{a} << 32) + b;
     }
     if (reduced_key == 0) {
package/deps/rocksdb/rocksdb/cache/cache_key.cc CHANGED
@@ -17,7 +17,7 @@ namespace ROCKSDB_NAMESPACE {
 
 // Value space plan for CacheKey:
 //
-// session_etc64_ | offset_etc64_ | Only generated by
+// file_num_etc64_ | offset_etc64_ | Only generated by
 // ---------------+---------------+------------------------------------------
 // 0              | 0             | Reserved for "empty" CacheKey()
 // 0              | > 0, < 1<<63  | CreateUniqueForCacheLifetime
@@ -44,7 +44,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
   return CacheKey(0, id);
 }
 
-// Value plan for CacheKeys from OffsetableCacheKey, assuming that
+// How we generate CacheKeys and base OffsetableCacheKey, assuming that
 // db_session_ids are generated from a base_session_id and
 // session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId
 // in DBImpl::GenerateDbSessionId):
@@ -56,63 +56,108 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //   base_session_id (unstructured, from GenerateRawUniqueId)
 //   session_id_counter (structured)
 //     * usually much smaller than 2**24
-//   file_number (structured)
+//   orig_file_number (structured)
 //     * usually smaller than 2**24
 //   offset_in_file (structured, might skip lots of values)
 //     * usually smaller than 2**32
-//   max_offset determines placement of file_number to prevent
-//   overlapping with offset
 //
-// Outputs come from bitwise-xor of the constituent pieces, low bits on left:
-//
-// |------------------------- session_etc64 -------------------------|
-// | +++++++++++++++ base_session_id (lower 64 bits) +++++++++++++++ |
+// Overall approach (see https://github.com/pdillinger/unique_id for
+// background):
+//
+// First, we have three "structured" values, up to 64 bits each, that we
+// need to fit, without losses, into 128 bits. In practice, the values will
+// be small enough that they should fit. For example, applications generating
+// large SST files (large offsets) will naturally produce fewer files (small
+// file numbers). But we don't know ahead of time what bounds the values will
+// have.
+//
+// Second, we have unstructured inputs that enable distinct RocksDB processes
+// to pick a random point in space, likely very different from others. Xoring
+// the structured with the unstructured give us a cache key that is
+// structurally distinct between related keys (e.g. same file or same RocksDB
+// process) and distinct with high probability between unrelated keys.
+//
+// The problem of packing three structured values into the space for two is
+// complicated by the fact that we want to derive cache keys from SST unique
+// IDs, which have already combined structured and unstructured inputs in a
+// practically inseparable way. And we want a base cache key that works
+// with an offset of any size. So basically, we need to encode these three
+// structured values, each up to 64 bits, into 128 bits without knowing any
+// of their sizes. The DownwardInvolution() function gives us a mechanism to
+// accomplish this. (See its properties in math.h.) Specifically, for inputs
+// a, b, and c:
+//   lower64 = DownwardInvolution(a) ^ ReverseBits(b);
+//   upper64 = c ^ ReverseBits(a);
+// The 128-bit output is unique assuming there exist some i, j, and k
+// where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and
+// i + j + k <= 128. In other words, as long as there exist some bounds
+// that would allow us to pack the bits of a, b, and c into the output
+// if we know the bound, we can generate unique outputs without knowing
+// those bounds. To validate this claim, the inversion function (given
+// the bounds) has been implemented in CacheKeyDecoder in
+// db_block_cache_test.cc.
+//
+// With that in mind, the outputs in terms of the conceptual inputs look
+// like this, using bitwise-xor of the constituent pieces, low bits on left:
+//
+// |------------------------- file_num_etc64 -------------------------|
+// | +++++++++ base_session_id (lower 64 bits, involution) +++++++++ |
+// |-----------------------------------------------------------------|
+// | session_id_counter (involution) ..... |                         |
 // |-----------------------------------------------------------------|
-// | session_id_counter ...|                                          |
+// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
+// |  * base_session_id (upper ~39 bits)                             |
+// |  * db_id (~122 bits entropy)                                    |
 // |-----------------------------------------------------------------|
-// |                                  | ... file_number               |
-// |                                  | overflow & meta               |
+// |                            | ..... orig_file_number (reversed)  |
 // |-----------------------------------------------------------------|
 //
 //
 // |------------------------- offset_etc64 --------------------------|
-// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
-// |  * base_session_id (upper ~39 bits)                             |
-// |  * db_id (~122 bits entropy)                                    |
+// | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++ |
 // |-----------------------------------------------------------------|
-// | offset_in_file ............... |                                 |
+// |                           | ..... session_id_counter (reversed) |
 // |-----------------------------------------------------------------|
-// |                                  | file_number, 0-3              |
-// |                                  | lower bytes                   |
+// | offset_in_file ............... |                                |
 // |-----------------------------------------------------------------|
 //
-// Based on max_offset, a maximal number of bytes 0..3 is chosen for
-// including from lower bits of file_number in offset_etc64. The choice
-// is encoded in two bits of metadata going into session_etc64, though
-// the common case of 3 bytes is encoded as 0 so that session_etc64
-// is unmodified by file_number concerns in the common case.
-//
-// There is nothing preventing "file number overflow & meta" from meeting
-// and overlapping with session_id_counter, but reaching such a case requires
-// an intractable combination of large file offsets (thus at least some large
-// files), large file numbers (thus large number of files generated), and
-// large number of session IDs generated in a single process. A trillion each
-// (2**40) of session ids, offsets, and file numbers comes to 120 bits.
-// With two bits of metadata and byte granularity, this is on the verge of
-// overlap, but even in the overlap case, it doesn't seem likely that
-// a file from billions of files or session ids ago will still be live
-// or cached.
-//
-// In fact, if our SST files are all < 4TB (see
-// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated
-// in a single process are guaranteed to have unique cache keys, unless/until
-// number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in
-// a single process and 64 trillion files generated. Even at that point, to
-// see a collision we would need a miraculous re-synchronization of session
-// id and file number, along with a live file or stale cache entry from
-// trillions of files ago.
-//
-// How https://github.com/pdillinger/unique_id applies here:
+// Some oddities or inconveniences of this layout are due to deriving
+// the "base" cache key (without offset) from the SST unique ID (see
+// GetSstInternalUniqueId). Specifically,
+// * Lower 64 of base_session_id occurs in both output words (ok but
+//   weird)
+// * The inclusion of db_id is bad for the conditions under which we
+//   can guarantee uniqueness, but could be useful in some cases with
+//   few small files per process, to make up for db session id only having
+//   ~103 bits of entropy.
+//
+// In fact, if DB ids were not involved, we would be guaranteed unique
+// cache keys for files generated in a single process until total bits for
+// biggest session_id_counter, orig_file_number, and offset_in_file
+// reach 128 bits.
+//
+// With the DB id limitation, we only have nice guaranteed unique cache
+// keys for files generated in a single process until biggest
+// session_id_counter and offset_in_file reach combined 64 bits. This
+// is quite good in practice because we can have millions of DB Opens
+// with terabyte size SST files, or billions of DB Opens with gigabyte
+// size SST files.
+//
+// One of the considerations in the translation between existing SST unique
+// IDs and base cache keys is supporting better SST unique IDs in a future
+// format_version. If we use a process-wide file counter instead of
+// session counter and file numbers, we only need to combine two 64-bit values
+// instead of three. But we don't want to track unique ID versions in the
+// manifest, so we want to keep the same translation layer between SST unique
+// IDs and base cache keys, even with updated SST unique IDs. If the new
+// unique IDs put the file counter where the orig_file_number was, and
+// use no structured field where session_id_counter was, then our translation
+// layer works fine for two structured fields as well as three (for
+// compatibility). The small computation for the translation (one
+// DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep)
+// is negligible for computing as part of SST file reader open.
+//
+// More on how https://github.com/pdillinger/unique_id applies here:
 // Every bit of output always includes "unstructured" uniqueness bits and
 // often combines with "structured" uniqueness bits. The "unstructured" bits
 // change infrequently: only when we cannot guarantee our state tracking for
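The uniqueness argument above relies on xor-combining values whose significant bits land in disjoint ranges once one value is bit-reversed. The self-contained sketch below illustrates the two-value special case using only ReverseBits; the real code additionally uses DownwardInvolution from util/math.h (also changed in this diff) to fit a third value. PackTwo/UnpackTwo are invented names for illustration.

// Simplified illustration (not RocksDB code): pack a < 2**i and b < 2**j,
// with i + j <= 64, into one 64-bit word, and recover them given the bounds.
// a keeps the low bits; ReverseBits(b) occupies the high bits; xor concatenates.
#include <cassert>
#include <cstdint>
#include <utility>

static uint64_t ReverseBits(uint64_t v) {
  v = ((v >> 1) & 0x5555555555555555ULL) | ((v & 0x5555555555555555ULL) << 1);
  v = ((v >> 2) & 0x3333333333333333ULL) | ((v & 0x3333333333333333ULL) << 2);
  v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((v & 0x0F0F0F0F0F0F0F0FULL) << 4);
  v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8);
  v = ((v >> 16) & 0x0000FFFF0000FFFFULL) | ((v & 0x0000FFFF0000FFFFULL) << 16);
  return (v >> 32) | (v << 32);
}

static uint64_t PackTwo(uint64_t a, uint64_t b) { return a ^ ReverseBits(b); }

// Decoding needs the bounds, given here as bit counts i and j with i + j <= 64.
static std::pair<uint64_t, uint64_t> UnpackTwo(uint64_t x, int i, int j) {
  assert(i + j <= 64);
  uint64_t low_mask = (i >= 64) ? ~uint64_t{0} : ((uint64_t{1} << i) - 1);
  uint64_t a = x & low_mask;               // low i bits are untouched by b
  uint64_t b = ReverseBits(x ^ a);         // remaining high bits encode b
  return {a, b};
}

int main() {
  uint64_t a = 0x123456;      // e.g. a file number, < 2**24
  uint64_t b = 0x9abcdef012;  // e.g. an offset, < 2**40
  auto [a2, b2] = UnpackTwo(PackTwo(a, b), 24, 40);
  assert(a2 == a && b2 == b);
  return 0;
}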
@@ -141,12 +186,11 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //   128 bits cache key size
 //   - 55 <- ideal size for byte offsets + file numbers
 //   -  2 <- bits for offsets and file numbers not exactly powers of two
-//   -  2 <- bits for file number encoding metadata
 //   +  2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey
 //   ----
-//     71 <- bits remaining for distinguishing session IDs
-// The probability of a collision in 71 bits of session ID data is less than
-// 1 in 2**(71 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
+//     73 <- bits remaining for distinguishing session IDs
+// The probability of a collision in 73 bits of session ID data is less than
+// 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
 // data from the last 180 days is in cache for potential collision, and that
 // cache keys under each session id exhaustively cover the remaining 57 bits
 // while in reality they'll only cover a small fraction of it.
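For reference, the updated budget works out as 128 - 55 - 2 + 2 = 73 bits left for session IDs; the two extra bits relative to the previous 71 come from dropping the "bits for file number encoding metadata" term. The quoted bound is then 2**(73 - (2 * 16)) = 2**41, about 2.2 * 10**12, matching the "roughly 1 in a trillion" wording in the comment.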
@@ -160,7 +204,7 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // Now suppose we have many DBs per host, say 2**10, with same host-wide write
 // rate and process/session lifetime. File numbers will be ~10 bits smaller
 // and we will have 2**10 times as many session IDs because of simultaneous
-// lifetimes. So now collision chance is less than 1 in 2**(81 - (2 * 26)),
+// lifetimes. So now collision chance is less than 1 in 2**(83 - (2 * 26)),
 // or roughly 1 in a billion.
 //
 // Suppose instead we generated random or hashed cache keys for each
@@ -176,17 +220,17 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // activity over many months, by making some pessimistic simplifying
 // assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
 // Here is some sample output with
-// `./cache_bench -stress_cache_key -sck_keep_bits=40`:
+// `./cache_bench -stress_cache_key -sck_keep_bits=43`:
 //
 // Total cache or DBs size: 32TiB  Writing 925.926 MiB/s or 76.2939TiB/day
-// Multiply by 9.22337e+18 to correct for simulation losses (but still
+// Multiply by 1.15292e+18 to correct for simulation losses (but still
 // assume whole file cached)
 //
 // These come from default settings of 2.5M files per day of 32 MB each, and
-// `-sck_keep_bits=40` means that to represent a single file, we are only
-// keeping 40 bits of the 128-bit (base) cache key. With file size of 2**25
-// contiguous keys (pessimistic), our simulation is about 2**(128-40-25) or
-// about 9 billion billion times more prone to collision than reality.
+// `-sck_keep_bits=43` means that to represent a single file, we are only
+// keeping 43 bits of the 128-bit (base) cache key. With file size of 2**25
+// contiguous keys (pessimistic), our simulation is about 2**(128-43-25) or
+// about 1 billion billion times more prone to collision than reality.
 //
 // More default assumptions, relatively pessimistic:
 //   * 100 DBs in same process (doesn't matter much)
@@ -194,49 +238,55 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //     average every 100 files generated
 //   * Restart process (all new session IDs unrelated to old) 24 times per day
 //
-// After enough data, we get a result at the end (-sck_keep_bits=40):
+// After enough data, we get a result at the end (-sck_keep_bits=43):
 //
-// (keep 40 bits)  17 collisions after 2 x 90 days, est 10.5882 days between
-// (9.76592e+19 corrected)
+// (keep 43 bits)  18 collisions after 2 x 90 days, est 10 days between
+// (1.15292e+19 corrected)
 //
 // If we believe the (pessimistic) simulation and the mathematical
-// extrapolation, we would need to run a billion machines all for 97 billion
+// extrapolation, we would need to run a billion machines all for 11 billion
 // days to expect a cache key collision. To help verify that our extrapolation
-// ("corrected") is robust, we can make our simulation more precise with
-// `-sck_keep_bits=41` and `42`, which takes more running time to get enough
+// ("corrected") is robust, we can make our simulation more precise by
+// increasing the "keep" bits, which takes more running time to get enough
 // collision data:
 //
-// (keep 41 bits)  16 collisions after 4 x 90 days, est 22.5 days between
-// (1.03763e+20 corrected)
-// (keep 42 bits)  19 collisions after 10 x 90 days, est 47.3684 days between
-// (1.09224e+20 corrected)
+// (keep 44 bits)  16 collisions after 5 x 90 days, est 28.125 days between
+// (1.6213e+19 corrected)
+// (keep 45 bits)  15 collisions after 7 x 90 days, est 42 days between
+// (1.21057e+19 corrected)
+// (keep 46 bits)  15 collisions after 17 x 90 days, est 102 days between
+// (1.46997e+19 corrected)
+// (keep 47 bits)  15 collisions after 49 x 90 days, est 294 days between
+// (2.11849e+19 corrected)
 //
-// The extrapolated prediction is very close. If anything, we might have some
-// very small losses of structured data (see class StressCacheKey in
-// cache_bench_tool.cc) leading to more accurate & more attractive prediction
-// with more bits kept.
+// The extrapolated prediction seems to be within noise (sampling error).
 //
 // With the `-sck_randomize` option, we can see that typical workloads like
 // above have lower collision probability than "random" cache keys (note:
-// offsets still non-randomized) by a modest amount (roughly 20x less collision
-// prone than random), which should make us reasonably comfortable even in
-// "degenerate" cases (e.g. repeatedly launch a process to generate 1 file
-// with SstFileWriter):
+// offsets still non-randomized) by a modest amount (roughly 2-3x less
+// collision prone than random), which should make us reasonably comfortable
+// even in "degenerate" cases (e.g. repeatedly launch a process to generate
+// one file with SstFileWriter):
+//
+// (rand 43 bits)  22 collisions after 1 x 90 days, est 4.09091 days between
+// (4.7165e+18 corrected)
+//
+// We can see that with more frequent process restarts,
+// -sck_restarts_per_day=5000, which means more all-new session IDs, we get
+// closer to the "random" cache key performance:
 //
-// (rand 40 bits)  197 collisions after 1 x 90 days, est 0.456853 days between
-// (4.21372e+18 corrected)
+// 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected)
 //
-// We can see that with more frequent process restarts (all new session IDs),
-// we get closer to the "random" cache key performance:
+// And with less frequent process restarts and re-opens,
+// -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision
+// probability:
 //
-// (-sck_restarts_per_day=5000): 140 collisions after 1 x 90 days, ...
-// (5.92931e+18 corrected)
+// 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected)
 //
 // Other tests have been run to validate other conditions behave as expected,
 // never behaving "worse than random" unless we start chopping off structured
 // data.
 //
-//
 // Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
 // that only arise when a new process is started, the chance of any cache key
 // collisions in a giant fleet of machines is negligible. Especially when
@@ -249,96 +299,66 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // quantify) block cache corruptions, including collisions, should be added.
 OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
                                        const std::string &db_session_id,
-                                       uint64_t file_number,
-                                       uint64_t max_offset) {
-#ifndef NDEBUG
-  max_offset_ = max_offset;
-#endif
-  // Closely related to GetSstInternalUniqueId, but only need 128 bits and
-  // need to include an offset within the file.
-  // See also https://github.com/pdillinger/unique_id for background.
-  uint64_t session_upper = 0;  // Assignment to appease clang-analyze
-  uint64_t session_lower = 0;  // Assignment to appease clang-analyze
-  {
-    Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower);
-    if (!s.ok()) {
-      // A reasonable fallback in case malformed
-      Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper,
-               &session_lower);
-    }
-  }
-
-  // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy)
-  // for more global uniqueness entropy.
-  // (It is possible that many DBs descended from one common DB id are copied
-  // around and proliferate, in which case session id is critical, but it is
-  // more common for different DBs to have different DB ids.)
-  uint64_t db_hash = Hash64(db_id.data(), db_id.size(), session_upper);
-
-  // This establishes the db+session id part of the cache key.
-  //
-  // Exactly preserve (in common cases; see modifiers below) session lower to
-  // ensure that session ids generated during the same process lifetime are
-  // guaranteed unique.
-  //
-  // We put this first for CommonPrefixSlice(), so that a small-ish set of
-  // cache key prefixes to cover entries relevant to any DB.
-  session_etc64_ = session_lower;
-  // This provides extra entopy in case of different DB id or process
-  // generating a session id, but is also partly/variably obscured by
-  // file_number and offset (see below).
-  offset_etc64_ = db_hash;
-
-  // Into offset_etc64_ we are (eventually) going to pack & xor in an offset and
-  // a file_number, but we might need the file_number to overflow into
-  // session_etc64_. (There must only be one session_etc64_ value per
-  // file, and preferably shared among many files.)
-  //
-  // Figure out how many bytes of file_number we are going to be able to
-  // pack in with max_offset, though our encoding will only support packing
-  // in up to 3 bytes of file_number. (16M file numbers is enough for a new
-  // file number every second for half a year.)
-  int file_number_bytes_in_offset_etc =
-      (63 - FloorLog2(max_offset | 0x100000000U)) / 8;
-  int file_number_bits_in_offset_etc = file_number_bytes_in_offset_etc * 8;
+                                       uint64_t file_number) {
+  UniqueId64x2 internal_id;
+  Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
+                                    &internal_id, /*force=*/true);
+  assert(s.ok());
+  *this = FromInternalUniqueId(&internal_id);
+}
 
-  // Assert two bits of metadata
-  assert(file_number_bytes_in_offset_etc >= 0 &&
-         file_number_bytes_in_offset_etc <= 3);
-  // Assert we couldn't have used a larger allowed number of bytes (shift
-  // would chop off bytes).
-  assert(file_number_bytes_in_offset_etc == 3 ||
-         (max_offset << (file_number_bits_in_offset_etc + 8) >>
-          (file_number_bits_in_offset_etc + 8)) != max_offset);
+OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) {
+  uint64_t session_lower = id.ptr[0];
+  uint64_t file_num_etc = id.ptr[1];
 
-  uint64_t mask = (uint64_t{1} << (file_number_bits_in_offset_etc)) - 1;
-  // Pack into high bits of etc so that offset can go in low bits of etc
-  // TODO: could be EndianSwapValue?
-  uint64_t offset_etc_modifier = ReverseBits(file_number & mask);
-  assert(offset_etc_modifier << file_number_bits_in_offset_etc == 0U);
+#ifndef NDEBUG
+  bool is_empty = session_lower == 0 && file_num_etc == 0;
+#endif
 
-  // Overflow and 3 - byte count (likely both zero) go into session_id part
-  uint64_t session_etc_modifier =
-      (file_number >> file_number_bits_in_offset_etc << 2) |
-      static_cast<uint64_t>(3 - file_number_bytes_in_offset_etc);
-  // Packed into high bits to minimize interference with session id counter.
-  session_etc_modifier = ReverseBits(session_etc_modifier);
+  // Although DBImpl guarantees (in recent versions) that session_lower is not
+  // zero, that's not entirely sufficient to guarantee that file_num_etc64_ is
+  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
+  // However, if we are given an "empty" id as input, then we should produce
+  // "empty" as output.
+  // As a consequence, this function is only bijective assuming
+  // id[0] == 0 only if id[1] == 0.
+  if (session_lower == 0U) {
+    session_lower = file_num_etc;
+  }
 
-  // Assert session_id part is only modified in extreme cases
-  assert(session_etc_modifier == 0 || file_number > /*3 bytes*/ 0xffffffU ||
-         max_offset > /*5 bytes*/ 0xffffffffffU);
+  // See comments above for how DownwardInvolution and ReverseBits
+  // make this function invertible under various assumptions.
+  OffsetableCacheKey rv;
+  rv.file_num_etc64_ =
+      DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc);
+  rv.offset_etc64_ = ReverseBits(session_lower);
 
-  // Xor in the modifiers
-  session_etc64_ ^= session_etc_modifier;
-  offset_etc64_ ^= offset_etc_modifier;
+  // Because of these transformations and needing to allow arbitrary
+  // offset (thus, second 64 bits of cache key might be 0), we need to
+  // make some correction to ensure the first 64 bits is not 0.
+  // Fortunately, the transformation ensures the second 64 bits is not 0
+  // for non-empty base key, so we can swap in the case one is 0 without
+  // breaking bijectivity (assuming condition above).
+  assert(is_empty || rv.offset_etc64_ > 0);
+  if (rv.file_num_etc64_ == 0) {
+    std::swap(rv.file_num_etc64_, rv.offset_etc64_);
+  }
+  assert(is_empty || rv.file_num_etc64_ > 0);
+  return rv;
+}
 
-  // Although DBImpl guarantees (in recent versions) that session_lower is not
-  // zero, that's not entirely sufficient to guarantee that session_etc64_ is
-  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
-  if (session_etc64_ == 0U) {
-    session_etc64_ = session_upper | 1U;
+// Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if
+// offset_etc64 == 0)
+UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() {
+  uint64_t a = file_num_etc64_;
+  uint64_t b = offset_etc64_;
+  if (b == 0) {
+    std::swap(a, b);
   }
-  assert(session_etc64_ != 0);
+  UniqueId64x2 rv;
+  rv[0] = ReverseBits(b);
+  rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0]));
+  return rv;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
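FromInternalUniqueId and ToInternalUniqueId are inverses as long as the second word is nonzero for every non-empty input, which is what makes the swap-when-zero correction safe. The standalone sketch below exercises only those swap mechanics; it substitutes the identity function for DownwardInvolution (so it does not demonstrate the bounded-packing guarantee) and all names are invented for illustration.

// Toy round-trip check for the zero-avoidance swap (not RocksDB code).
// D() stands in for DownwardInvolution; using the identity keeps the
// swap/inversion mechanics intact but drops the packing properties.
#include <cassert>
#include <cstdint>
#include <utility>

static uint64_t ReverseBits(uint64_t v) {
  v = ((v >> 1) & 0x5555555555555555ULL) | ((v & 0x5555555555555555ULL) << 1);
  v = ((v >> 2) & 0x3333333333333333ULL) | ((v & 0x3333333333333333ULL) << 2);
  v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((v & 0x0F0F0F0F0F0F0F0FULL) << 4);
  v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8);
  v = ((v >> 16) & 0x0000FFFF0000FFFFULL) | ((v & 0x0000FFFF0000FFFFULL) << 16);
  return (v >> 32) | (v << 32);
}

static uint64_t D(uint64_t v) { return v; }  // placeholder for DownwardInvolution

struct Key { uint64_t file_num_etc64, offset_etc64; };

static Key Encode(uint64_t session_lower, uint64_t file_num_etc) {
  if (session_lower == 0) session_lower = file_num_etc;  // as in the diff
  Key k{D(session_lower) ^ ReverseBits(file_num_etc), ReverseBits(session_lower)};
  if (k.file_num_etc64 == 0) std::swap(k.file_num_etc64, k.offset_etc64);
  return k;
}

static std::pair<uint64_t, uint64_t> Decode(Key k) {
  uint64_t a = k.file_num_etc64, b = k.offset_etc64;
  if (b == 0) std::swap(a, b);  // undo the zero-avoidance swap
  uint64_t session_lower = ReverseBits(b);
  uint64_t file_num_etc = ReverseBits(a ^ D(session_lower));
  return {session_lower, file_num_etc};
}

int main() {
  // A case that triggers the swap: first word comes out as zero.
  uint64_t s = 0x1234, f = ReverseBits(s);
  auto [s2, f2] = Decode(Encode(s, f));
  assert(s2 == s && f2 == f);
  // An ordinary case with no swap.
  auto [s3, f3] = Decode(Encode(0xabcdef, 42));
  assert(s3 == 0xabcdef && f3 == 42);
  return 0;
}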