@nxtedition/rocksdb 7.1.11 → 7.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/binding.cc +50 -46
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
  3. package/deps/rocksdb/rocksdb/TARGETS +2 -0
  4. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +60 -17
  5. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +4 -4
  6. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +81 -37
  7. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +6 -0
  8. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -6
  9. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +10 -8
  10. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +14 -9
  11. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +3 -3
  12. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +69 -0
  13. package/deps/rocksdb/rocksdb/db/flush_job.cc +6 -6
  14. package/deps/rocksdb/rocksdb/db/memtable.cc +19 -7
  15. package/deps/rocksdb/rocksdb/db/memtable.h +8 -16
  16. package/deps/rocksdb/rocksdb/db/memtable_list.cc +27 -16
  17. package/deps/rocksdb/rocksdb/db/memtable_list.h +18 -11
  18. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +70 -55
  19. package/deps/rocksdb/rocksdb/db/table_cache.cc +9 -11
  20. package/deps/rocksdb/rocksdb/db/table_cache.h +2 -1
  21. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +3 -3
  22. package/deps/rocksdb/rocksdb/db/version_set.cc +530 -257
  23. package/deps/rocksdb/rocksdb/db/version_set.h +32 -2
  24. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
  25. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +64 -12
  26. package/deps/rocksdb/rocksdb/db/wide/wide_columns.cc +18 -0
  27. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +8 -0
  28. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +13 -1
  29. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -0
  30. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +83 -0
  31. package/deps/rocksdb/rocksdb/options/options.cc +4 -2
  32. package/deps/rocksdb/rocksdb/src.mk +1 -0
  33. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +3 -10
  34. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +5 -4
  35. package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +10 -28
  36. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +4 -4
  37. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +11 -9
  38. package/deps/rocksdb/rocksdb/table/get_context.cc +34 -22
  39. package/deps/rocksdb/rocksdb/table/get_context.h +6 -3
  40. package/deps/rocksdb/rocksdb/table/multiget_context.h +69 -5
  41. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -2
  42. package/deps/rocksdb/rocksdb/table/table_test.cc +8 -8
  43. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +23 -0
  44. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +27 -7
  45. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +8 -4
  46. package/deps/rocksdb/rocksdb.gyp +1 -0
  47. package/index.js +20 -11
  48. package/package.json +1 -1
  49. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  50. package/prebuilds/darwin-x64/node.napi.node +0 -0
  51. package/prebuilds/linux-x64/node.napi.node +0 -0
  52. package/util.h +14 -12
package/binding.cc CHANGED
@@ -688,14 +688,15 @@ NAPI_METHOD(db_open) {
688
688
 
689
689
  dbOptions.avoid_unnecessary_blocking_io = true;
690
690
  dbOptions.write_dbid_to_manifest = true;
691
- dbOptions.use_adaptive_mutex = true; // We don't have soo many threads in the libuv thread pool...
692
- dbOptions.enable_pipelined_write = false; // We only write in the main thread...
691
+ dbOptions.use_adaptive_mutex = true; // We don't have soo many threads in the libuv thread pool...
692
+ dbOptions.enable_pipelined_write = true; // We only write in the main thread...
693
693
  dbOptions.create_missing_column_families = true;
694
694
  dbOptions.fail_if_options_file_error = true;
695
695
 
696
696
  NAPI_STATUS_THROWS(GetProperty(env, options, "createIfMissing", dbOptions.create_if_missing));
697
697
  NAPI_STATUS_THROWS(GetProperty(env, options, "errorIfExists", dbOptions.error_if_exists));
698
698
  NAPI_STATUS_THROWS(GetProperty(env, options, "unorderedWrite", dbOptions.unordered_write));
699
+ NAPI_STATUS_THROWS(GetProperty(env, options, "pipelinedWrite", dbOptions.enable_pipelined_write));
699
700
 
700
701
  // TODO (feat): dbOptions.listeners
701
702
 
@@ -1331,14 +1332,19 @@ NAPI_METHOD(iterator_nextv) {
1331
1332
  }
1332
1333
 
1333
1334
  NAPI_METHOD(batch_do) {
1334
- NAPI_ARGV(3);
1335
+ NAPI_ARGV(4);
1335
1336
 
1336
1337
  Database* database;
1337
1338
  NAPI_STATUS_THROWS(napi_get_value_external(env, argv[0], reinterpret_cast<void**>(&database)));
1338
1339
 
1339
- rocksdb::WriteBatch batch;
1340
+ const auto elements = argv[1];
1341
+ const auto options = argv[2];
1342
+ const auto callback = argv[3];
1343
+
1344
+ bool sync = false;
1345
+ NAPI_STATUS_THROWS(GetProperty(env, options, "sync", sync));
1340
1346
 
1341
- auto elements = argv[1];
1347
+ auto batch = std::make_unique<rocksdb::WriteBatch>();
1342
1348
 
1343
1349
  uint32_t length;
1344
1350
  NAPI_STATUS_THROWS(napi_get_array_length(env, elements, &length));
@@ -1358,25 +1364,39 @@ NAPI_METHOD(batch_do) {
1358
1364
 
1359
1365
  if (type == "del") {
1360
1366
  NAPI_STATUS_THROWS(GetProperty(env, element, "key", key, true));
1361
- ROCKS_STATUS_THROWS_NAPI(batch.Delete(column, key));
1367
+ ROCKS_STATUS_THROWS_NAPI(batch->Delete(column, key));
1362
1368
  } else if (type == "put") {
1363
1369
  NAPI_STATUS_THROWS(GetProperty(env, element, "key", key, true));
1364
1370
  NAPI_STATUS_THROWS(GetProperty(env, element, "value", value, true));
1365
- ROCKS_STATUS_THROWS_NAPI(batch.Put(column, key, value));
1371
+ ROCKS_STATUS_THROWS_NAPI(batch->Put(column, key, value));
1366
1372
  } else if (type == "data") {
1367
1373
  NAPI_STATUS_THROWS(GetProperty(env, element, "value", value, true));
1368
- ROCKS_STATUS_THROWS_NAPI(batch.PutLogData(value));
1374
+ ROCKS_STATUS_THROWS_NAPI(batch->PutLogData(value));
1369
1375
  } else if (type == "merge") {
1370
1376
  NAPI_STATUS_THROWS(GetProperty(env, element, "key", key, true));
1371
1377
  NAPI_STATUS_THROWS(GetProperty(env, element, "value", value, true));
1372
- ROCKS_STATUS_THROWS_NAPI(batch.Merge(column, key, value));
1378
+ ROCKS_STATUS_THROWS_NAPI(batch->Merge(column, key, value));
1373
1379
  } else {
1374
1380
  NAPI_STATUS_THROWS(napi_invalid_arg);
1375
1381
  }
1376
1382
  }
1377
1383
 
1378
- rocksdb::WriteOptions writeOptions;
1379
- ROCKS_STATUS_THROWS_NAPI(database->db->Write(writeOptions, &batch));
1384
+ runAsync<int64_t>(
1385
+ "leveldown.batch.do", env, callback,
1386
+ [=, batch = std::move(batch)](int64_t& seq) {
1387
+ rocksdb::WriteOptions writeOptions;
1388
+ writeOptions.sync = sync;
1389
+
1390
+ // TODO (fix): Better way to get batch sequence?
1391
+ seq = database->db->GetLatestSequenceNumber() + 1;
1392
+
1393
+ return database->db->Write(writeOptions, batch.get());
1394
+ },
1395
+ [=](int64_t& seq, auto env, auto& argv) {
1396
+ argv.resize(2);
1397
+ NAPI_STATUS_RETURN(napi_create_int64(env, seq, &argv[1]));
1398
+ return napi_ok;
1399
+ });
1380
1400
 
1381
1401
  return 0;
1382
1402
  }
@@ -1443,7 +1463,7 @@ NAPI_METHOD(batch_merge) {
1443
1463
  NAPI_ARGV(4);
1444
1464
 
1445
1465
  rocksdb::WriteBatch* batch;
1446
- NAPI_STATUS_THROWS(napi_get_value_external(env, argv[0], (void**)(&batch)));
1466
+ NAPI_STATUS_THROWS(napi_get_value_external(env, argv[0], reinterpret_cast<void**>(&batch)));
1447
1467
 
1448
1468
  NapiSlice key;
1449
1469
  NAPI_STATUS_THROWS(GetValue(env, argv[1], key));
@@ -1488,45 +1508,29 @@ NAPI_METHOD(batch_write) {
1488
1508
  auto options = argv[2];
1489
1509
  auto callback = argv[3];
1490
1510
 
1491
- std::optional<bool> sync;
1511
+ bool sync = false;
1492
1512
  NAPI_STATUS_THROWS(GetProperty(env, options, "sync", sync));
1493
1513
 
1494
- if (sync) {
1495
- napi_ref batchRef;
1496
- NAPI_STATUS_THROWS(napi_create_reference(env, argv[1], 1, &batchRef));
1497
-
1498
- runAsync<int64_t>(
1499
- "leveldown.batch.write", env, callback,
1500
- [=](int64_t& seq) {
1501
- rocksdb::WriteOptions writeOptions;
1502
- writeOptions.sync = *sync;
1503
-
1504
- // TODO (fix): Better way to get final batch sequence?
1505
- seq = database->db->GetLatestSequenceNumber() + 1;
1506
-
1507
- return database->db->Write(writeOptions, batch);
1508
- },
1509
- [=](int64_t& seq, auto env, auto& argv) {
1510
- argv.resize(2);
1511
- NAPI_STATUS_RETURN(napi_delete_reference(env, batchRef));
1512
- NAPI_STATUS_RETURN(napi_create_int64(env, seq, &argv[1]));
1513
- return napi_ok;
1514
- });
1515
- } else {
1516
- // TODO (fix): Better way to get final batch sequence?
1517
- auto seq = database->db->GetLatestSequenceNumber() + 1;
1518
-
1519
- napi_value result;
1520
- NAPI_STATUS_THROWS(napi_create_int64(env, seq, &result));
1514
+ napi_ref batchRef;
1515
+ NAPI_STATUS_THROWS(napi_create_reference(env, argv[1], 1, &batchRef));
1521
1516
 
1522
- rocksdb::WriteOptions writeOptions;
1523
- ROCKS_STATUS_THROWS_NAPI(database->db->Write(writeOptions, batch));
1517
+ runAsync<int64_t>(
1518
+ "leveldown.batch.write", env, callback,
1519
+ [=](int64_t& seq) {
1520
+ rocksdb::WriteOptions writeOptions;
1521
+ writeOptions.sync = sync;
1524
1522
 
1525
- napi_value global;
1526
- NAPI_STATUS_THROWS(napi_get_global(env, &global));
1523
+ // TODO (fix): Better way to get batch sequence?
1524
+ seq = database->db->GetLatestSequenceNumber() + 1;
1527
1525
 
1528
- NAPI_STATUS_THROWS(napi_call_function(env, global, callback, 1, &result, nullptr));
1529
- }
1526
+ return database->db->Write(writeOptions, batch);
1527
+ },
1528
+ [=](int64_t& seq, auto env, auto& argv) {
1529
+ argv.resize(2);
1530
+ NAPI_STATUS_RETURN(napi_delete_reference(env, batchRef));
1531
+ NAPI_STATUS_RETURN(napi_create_int64(env, seq, &argv[1]));
1532
+ return napi_ok;
1533
+ });
1530
1534
 
1531
1535
  return 0;
1532
1536
  }
@@ -684,6 +684,7 @@ set(SOURCES
684
684
  db/wal_edit.cc
685
685
  db/wal_manager.cc
686
686
  db/wide/wide_column_serialization.cc
687
+ db/wide/wide_columns.cc
687
688
  db/write_batch.cc
688
689
  db/write_batch_base.cc
689
690
  db/write_controller.cc
@@ -1510,7 +1511,7 @@ if(WITH_BENCHMARK_TOOLS)
1510
1511
  endif()
1511
1512
 
1512
1513
  option(WITH_TRACE_TOOLS "build with trace tools" ON)
1513
- if(WITH_TRACE_TOOLS)
1514
+ if(WITH_TRACE_TOOLS)
1514
1515
  add_executable(block_cache_trace_analyzer_tool${ARTIFACT_SUFFIX}
1515
1516
  tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc)
1516
1517
  target_link_libraries(block_cache_trace_analyzer_tool${ARTIFACT_SUFFIX}
@@ -99,6 +99,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
99
99
  "db/wal_edit.cc",
100
100
  "db/wal_manager.cc",
101
101
  "db/wide/wide_column_serialization.cc",
102
+ "db/wide/wide_columns.cc",
102
103
  "db/write_batch.cc",
103
104
  "db/write_batch_base.cc",
104
105
  "db/write_controller.cc",
@@ -435,6 +436,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
435
436
  "db/wal_edit.cc",
436
437
  "db/wal_manager.cc",
437
438
  "db/wide/wide_column_serialization.cc",
439
+ "db/wide/wide_columns.cc",
438
440
  "db/write_batch.cc",
439
441
  "db/write_batch_base.cc",
440
442
  "db/write_controller.cc",
@@ -2013,12 +2013,10 @@ TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSize) {
2013
2013
  }
2014
2014
 
2015
2015
  TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeMultiLevelMerge) {
2016
- #ifndef USE_COROUTINES
2017
2016
  if (std::get<1>(GetParam())) {
2018
- ROCKSDB_GTEST_SKIP("This test requires coroutine support");
2017
+ ROCKSDB_GTEST_SKIP("This test needs to be fixed for async IO");
2019
2018
  return;
2020
2019
  }
2021
- #endif // USE_COROUTINES
2022
2020
  // Skip for unbatched MultiGet
2023
2021
  if (!std::get<0>(GetParam())) {
2024
2022
  ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
@@ -2131,7 +2129,8 @@ INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
2131
2129
  testing::Combine(testing::Bool(), testing::Bool()));
2132
2130
 
2133
2131
  #if USE_COROUTINES
2134
- class DBMultiGetAsyncIOTest : public DBBasicTest {
2132
+ class DBMultiGetAsyncIOTest : public DBBasicTest,
2133
+ public ::testing::WithParamInterface<bool> {
2135
2134
  public:
2136
2135
  DBMultiGetAsyncIOTest()
2137
2136
  : DBBasicTest(), statistics_(ROCKSDB_NAMESPACE::CreateDBStatistics()) {
@@ -2210,7 +2209,7 @@ class DBMultiGetAsyncIOTest : public DBBasicTest {
2210
2209
  std::shared_ptr<Statistics> statistics_;
2211
2210
  };
2212
2211
 
2213
- TEST_F(DBMultiGetAsyncIOTest, GetFromL0) {
2212
+ TEST_P(DBMultiGetAsyncIOTest, GetFromL0) {
2214
2213
  // All 3 keys in L0. The L0 files should be read serially.
2215
2214
  std::vector<std::string> key_strs{Key(0), Key(40), Key(80)};
2216
2215
  std::vector<Slice> keys{key_strs[0], key_strs[1], key_strs[2]};
@@ -2219,6 +2218,7 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL0) {
2219
2218
 
2220
2219
  ReadOptions ro;
2221
2220
  ro.async_io = true;
2221
+ ro.optimize_multiget_for_io = GetParam();
2222
2222
  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
2223
2223
  keys.data(), values.data(), statuses.data());
2224
2224
  ASSERT_EQ(values.size(), 3);
@@ -2233,13 +2233,17 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL0) {
2233
2233
 
2234
2234
  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
2235
2235
 
2236
- // No async IO in this case since we don't do parallel lookup in L0
2237
- ASSERT_EQ(multiget_io_batch_size.count, 0);
2238
- ASSERT_EQ(multiget_io_batch_size.max, 0);
2239
- ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
2236
+ // With async IO, lookups will happen in parallel for each key
2237
+ if (GetParam()) {
2238
+ ASSERT_EQ(multiget_io_batch_size.count, 1);
2239
+ ASSERT_EQ(multiget_io_batch_size.max, 3);
2240
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
2241
+ } else {
2242
+ ASSERT_EQ(multiget_io_batch_size.count, 0);
2243
+ }
2240
2244
  }
2241
2245
 
2242
- TEST_F(DBMultiGetAsyncIOTest, GetFromL1) {
2246
+ TEST_P(DBMultiGetAsyncIOTest, GetFromL1) {
2243
2247
  std::vector<std::string> key_strs;
2244
2248
  std::vector<Slice> keys;
2245
2249
  std::vector<PinnableSlice> values;
@@ -2256,6 +2260,7 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL1) {
2256
2260
 
2257
2261
  ReadOptions ro;
2258
2262
  ro.async_io = true;
2263
+ ro.optimize_multiget_for_io = GetParam();
2259
2264
  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
2260
2265
  keys.data(), values.data(), statuses.data());
2261
2266
  ASSERT_EQ(values.size(), 3);
@@ -2276,7 +2281,7 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL1) {
2276
2281
  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
2277
2282
  }
2278
2283
 
2279
- TEST_F(DBMultiGetAsyncIOTest, LastKeyInFile) {
2284
+ TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
2280
2285
  std::vector<std::string> key_strs;
2281
2286
  std::vector<Slice> keys;
2282
2287
  std::vector<PinnableSlice> values;
@@ -2294,6 +2299,7 @@ TEST_F(DBMultiGetAsyncIOTest, LastKeyInFile) {
2294
2299
 
2295
2300
  ReadOptions ro;
2296
2301
  ro.async_io = true;
2302
+ ro.optimize_multiget_for_io = GetParam();
2297
2303
  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
2298
2304
  keys.data(), values.data(), statuses.data());
2299
2305
  ASSERT_EQ(values.size(), 3);
@@ -2316,7 +2322,7 @@ TEST_F(DBMultiGetAsyncIOTest, LastKeyInFile) {
2316
2322
  ASSERT_EQ(multiget_io_batch_size.max, 2);
2317
2323
  }
2318
2324
 
2319
- TEST_F(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
2325
+ TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
2320
2326
  std::vector<std::string> key_strs;
2321
2327
  std::vector<Slice> keys;
2322
2328
  std::vector<PinnableSlice> values;
@@ -2334,6 +2340,7 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
2334
2340
 
2335
2341
  ReadOptions ro;
2336
2342
  ro.async_io = true;
2343
+ ro.optimize_multiget_for_io = GetParam();
2337
2344
  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
2338
2345
  keys.data(), values.data(), statuses.data());
2339
2346
  ASSERT_EQ(values.size(), 3);
@@ -2348,13 +2355,13 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
2348
2355
 
2349
2356
  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
2350
2357
 
2351
- // There is only one MultiGet key in the bottommost level - 56. Thus
2352
- // the bottommost level will not use async IO.
2358
+ // There are 2 keys in L1 in twp separate files, and 1 in L2. With
2359
+ // async IO, all three lookups will happen in parallel
2353
2360
  ASSERT_EQ(multiget_io_batch_size.count, 1);
2354
- ASSERT_EQ(multiget_io_batch_size.max, 2);
2361
+ ASSERT_EQ(multiget_io_batch_size.max, GetParam() ? 3 : 2);
2355
2362
  }
2356
2363
 
2357
- TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
2364
+ TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
2358
2365
  std::vector<std::string> key_strs;
2359
2366
  std::vector<Slice> keys;
2360
2367
  std::vector<PinnableSlice> values;
@@ -2370,6 +2377,7 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
2370
2377
 
2371
2378
  ReadOptions ro;
2372
2379
  ro.async_io = true;
2380
+ ro.optimize_multiget_for_io = GetParam();
2373
2381
  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
2374
2382
  keys.data(), values.data(), statuses.data());
2375
2383
  ASSERT_EQ(values.size(), 2);
@@ -2382,7 +2390,7 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
2382
2390
  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
2383
2391
  }
2384
2392
 
2385
- TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
2393
+ TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
2386
2394
  std::vector<std::string> key_strs;
2387
2395
  std::vector<Slice> keys;
2388
2396
  std::vector<PinnableSlice> values;
@@ -2398,6 +2406,7 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
2398
2406
 
2399
2407
  ReadOptions ro;
2400
2408
  ro.async_io = true;
2409
+ ro.optimize_multiget_for_io = GetParam();
2401
2410
  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
2402
2411
  keys.data(), values.data(), statuses.data());
2403
2412
  ASSERT_EQ(values.size(), 2);
@@ -2407,6 +2416,40 @@ TEST_F(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
2407
2416
  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
2408
2417
  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
2409
2418
  }
2419
+
2420
+ TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2WithRangeDelInL1) {
2421
+ std::vector<std::string> key_strs;
2422
+ std::vector<Slice> keys;
2423
+ std::vector<PinnableSlice> values;
2424
+ std::vector<Status> statuses;
2425
+
2426
+ // 139 and 163 are in L2, but overlap with a range deletes in L1
2427
+ key_strs.push_back(Key(139));
2428
+ key_strs.push_back(Key(144));
2429
+ key_strs.push_back(Key(163));
2430
+ keys.push_back(key_strs[0]);
2431
+ keys.push_back(key_strs[1]);
2432
+ keys.push_back(key_strs[2]);
2433
+ values.resize(keys.size());
2434
+ statuses.resize(keys.size());
2435
+
2436
+ ReadOptions ro;
2437
+ ro.async_io = true;
2438
+ ro.optimize_multiget_for_io = GetParam();
2439
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
2440
+ keys.data(), values.data(), statuses.data());
2441
+ ASSERT_EQ(values.size(), keys.size());
2442
+ ASSERT_EQ(statuses[0], Status::NotFound());
2443
+ ASSERT_EQ(statuses[1], Status::OK());
2444
+ ASSERT_EQ(values[1], "val_l1_" + std::to_string(144));
2445
+ ASSERT_EQ(statuses[2], Status::NotFound());
2446
+
2447
+ // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
2448
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
2449
+ }
2450
+
2451
+ INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, DBMultiGetAsyncIOTest,
2452
+ testing::Bool());
2410
2453
  #endif // USE_COROUTINES
2411
2454
 
2412
2455
  TEST_F(DBBasicTest, MultiGetStats) {
@@ -72,9 +72,9 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
72
72
  user_comparator_->timestamp_size() > 0 ? timestamp : nullptr;
73
73
  LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
74
74
  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
75
- GetContext::kNotFound, lkey.user_key(), value, ts,
76
- nullptr, nullptr, true, nullptr, nullptr, nullptr,
77
- nullptr, &read_cb);
75
+ GetContext::kNotFound, lkey.user_key(), value,
76
+ /*columns=*/nullptr, ts, nullptr, nullptr, true,
77
+ nullptr, nullptr, nullptr, nullptr, &read_cb);
78
78
 
79
79
  const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
80
80
  if (user_comparator_->CompareWithoutTimestamp(
@@ -159,7 +159,7 @@ std::vector<Status> CompactedDBImpl::MultiGet(
159
159
  std::string* timestamp = timestamps ? &(*timestamps)[idx] : nullptr;
160
160
  GetContext get_context(
161
161
  user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound,
162
- lkey.user_key(), &pinnable_val,
162
+ lkey.user_key(), &pinnable_val, /*columns=*/nullptr,
163
163
  user_comparator_->timestamp_size() > 0 ? timestamp : nullptr, nullptr,
164
164
  nullptr, true, nullptr, nullptr, nullptr, nullptr, &read_cb);
165
165
  Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr);
@@ -1822,6 +1822,28 @@ Status DBImpl::Get(const ReadOptions& read_options,
1822
1822
  return s;
1823
1823
  }
1824
1824
 
1825
+ Status DBImpl::GetEntity(const ReadOptions& read_options,
1826
+ ColumnFamilyHandle* column_family, const Slice& key,
1827
+ PinnableWideColumns* columns) {
1828
+ if (!column_family) {
1829
+ return Status::InvalidArgument(
1830
+ "Cannot call GetEntity without a column family handle");
1831
+ }
1832
+
1833
+ if (!columns) {
1834
+ return Status::InvalidArgument(
1835
+ "Cannot call GetEntity without a PinnableWideColumns object");
1836
+ }
1837
+
1838
+ columns->Reset();
1839
+
1840
+ GetImplOptions get_impl_options;
1841
+ get_impl_options.column_family = column_family;
1842
+ get_impl_options.columns = columns;
1843
+
1844
+ return GetImpl(read_options, key, get_impl_options);
1845
+ }
1846
+
1825
1847
  bool DBImpl::ShouldReferenceSuperVersion(const MergeContext& merge_context) {
1826
1848
  // If both thresholds are reached, a function returning merge operands as
1827
1849
  // `PinnableSlice`s should reference the `SuperVersion` to avoid large and/or
@@ -1853,7 +1875,8 @@ bool DBImpl::ShouldReferenceSuperVersion(const MergeContext& merge_context) {
1853
1875
  Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
1854
1876
  GetImplOptions& get_impl_options) {
1855
1877
  assert(get_impl_options.value != nullptr ||
1856
- get_impl_options.merge_operands != nullptr);
1878
+ get_impl_options.merge_operands != nullptr ||
1879
+ get_impl_options.columns != nullptr);
1857
1880
 
1858
1881
  assert(get_impl_options.column_family);
1859
1882
 
@@ -1980,31 +2003,46 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
1980
2003
  if (!skip_memtable) {
1981
2004
  // Get value associated with key
1982
2005
  if (get_impl_options.get_value) {
1983
- if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), timestamp, &s,
1984
- &merge_context, &max_covering_tombstone_seq,
1985
- read_options, false /* immutable_memtable */,
1986
- get_impl_options.callback,
1987
- get_impl_options.is_blob_index)) {
2006
+ if (sv->mem->Get(
2007
+ lkey,
2008
+ get_impl_options.value ? get_impl_options.value->GetSelf()
2009
+ : nullptr,
2010
+ get_impl_options.columns, timestamp, &s, &merge_context,
2011
+ &max_covering_tombstone_seq, read_options,
2012
+ false /* immutable_memtable */, get_impl_options.callback,
2013
+ get_impl_options.is_blob_index)) {
1988
2014
  done = true;
1989
- get_impl_options.value->PinSelf();
2015
+
2016
+ if (get_impl_options.value) {
2017
+ get_impl_options.value->PinSelf();
2018
+ }
2019
+
1990
2020
  RecordTick(stats_, MEMTABLE_HIT);
1991
2021
  } else if ((s.ok() || s.IsMergeInProgress()) &&
1992
- sv->imm->Get(lkey, get_impl_options.value->GetSelf(),
1993
- timestamp, &s, &merge_context,
1994
- &max_covering_tombstone_seq, read_options,
1995
- get_impl_options.callback,
2022
+ sv->imm->Get(lkey,
2023
+ get_impl_options.value
2024
+ ? get_impl_options.value->GetSelf()
2025
+ : nullptr,
2026
+ get_impl_options.columns, timestamp, &s,
2027
+ &merge_context, &max_covering_tombstone_seq,
2028
+ read_options, get_impl_options.callback,
1996
2029
  get_impl_options.is_blob_index)) {
1997
2030
  done = true;
1998
- get_impl_options.value->PinSelf();
2031
+
2032
+ if (get_impl_options.value) {
2033
+ get_impl_options.value->PinSelf();
2034
+ }
2035
+
1999
2036
  RecordTick(stats_, MEMTABLE_HIT);
2000
2037
  }
2001
2038
  } else {
2002
2039
  // Get Merge Operands associated with key, Merge Operands should not be
2003
2040
  // merged and raw values should be returned to the user.
2004
- if (sv->mem->Get(lkey, /*value*/ nullptr, /*timestamp=*/nullptr, &s,
2005
- &merge_context, &max_covering_tombstone_seq,
2006
- read_options, false /* immutable_memtable */, nullptr,
2007
- nullptr, false)) {
2041
+ if (sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr,
2042
+ /*timestamp=*/nullptr, &s, &merge_context,
2043
+ &max_covering_tombstone_seq, read_options,
2044
+ false /* immutable_memtable */, nullptr, nullptr,
2045
+ false)) {
2008
2046
  done = true;
2009
2047
  RecordTick(stats_, MEMTABLE_HIT);
2010
2048
  } else if ((s.ok() || s.IsMergeInProgress()) &&
@@ -2026,8 +2064,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
2026
2064
  if (!done) {
2027
2065
  PERF_TIMER_GUARD(get_from_output_files_time);
2028
2066
  sv->current->Get(
2029
- read_options, lkey, get_impl_options.value, timestamp, &s,
2030
- &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
2067
+ read_options, lkey, get_impl_options.value, get_impl_options.columns,
2068
+ timestamp, &s, &merge_context, &max_covering_tombstone_seq,
2069
+ &pinned_iters_mgr,
2031
2070
  get_impl_options.get_value ? get_impl_options.value_found : nullptr,
2032
2071
  nullptr, nullptr,
2033
2072
  get_impl_options.get_value ? get_impl_options.callback : nullptr,
@@ -2043,7 +2082,11 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
2043
2082
  size_t size = 0;
2044
2083
  if (s.ok()) {
2045
2084
  if (get_impl_options.get_value) {
2046
- size = get_impl_options.value->size();
2085
+ if (get_impl_options.value) {
2086
+ size = get_impl_options.value->size();
2087
+ } else if (get_impl_options.columns) {
2088
+ size = get_impl_options.columns->serialized_size();
2089
+ }
2047
2090
  } else {
2048
2091
  // Return all merge operands for get_impl_options.key
2049
2092
  *get_impl_options.number_of_operands =
@@ -2252,14 +2295,14 @@ std::vector<Status> DBImpl::MultiGet(
2252
2295
  has_unpersisted_data_.load(std::memory_order_relaxed));
2253
2296
  bool done = false;
2254
2297
  if (!skip_memtable) {
2255
- if (super_version->mem->Get(lkey, value, timestamp, &s, &merge_context,
2256
- &max_covering_tombstone_seq, read_options,
2257
- false /* immutable_memtable */,
2258
- read_callback)) {
2298
+ if (super_version->mem->Get(
2299
+ lkey, value, /*columns=*/nullptr, timestamp, &s, &merge_context,
2300
+ &max_covering_tombstone_seq, read_options,
2301
+ false /* immutable_memtable */, read_callback)) {
2259
2302
  done = true;
2260
2303
  RecordTick(stats_, MEMTABLE_HIT);
2261
- } else if (super_version->imm->Get(lkey, value, timestamp, &s,
2262
- &merge_context,
2304
+ } else if (super_version->imm->Get(lkey, value, /*columns=*/nullptr,
2305
+ timestamp, &s, &merge_context,
2263
2306
  &max_covering_tombstone_seq,
2264
2307
  read_options, read_callback)) {
2265
2308
  done = true;
@@ -2270,9 +2313,9 @@ std::vector<Status> DBImpl::MultiGet(
2270
2313
  PinnableSlice pinnable_val;
2271
2314
  PERF_TIMER_GUARD(get_from_output_files_time);
2272
2315
  PinnedIteratorsManager pinned_iters_mgr;
2273
- super_version->current->Get(read_options, lkey, &pinnable_val, timestamp,
2274
- &s, &merge_context,
2275
- &max_covering_tombstone_seq,
2316
+ super_version->current->Get(read_options, lkey, &pinnable_val,
2317
+ /*columns=*/nullptr, timestamp, &s,
2318
+ &merge_context, &max_covering_tombstone_seq,
2276
2319
  &pinned_iters_mgr, /*value_found=*/nullptr,
2277
2320
  /*key_exists=*/nullptr,
2278
2321
  /*seq=*/nullptr, read_callback);
@@ -4861,8 +4904,8 @@ Status DBImpl::GetLatestSequenceForKey(
4861
4904
  *found_record_for_key = false;
4862
4905
 
4863
4906
  // Check if there is a record for this key in the latest memtable
4864
- sv->mem->Get(lkey, /*value=*/nullptr, timestamp, &s, &merge_context,
4865
- &max_covering_tombstone_seq, seq, read_options,
4907
+ sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s,
4908
+ &merge_context, &max_covering_tombstone_seq, seq, read_options,
4866
4909
  false /* immutable_memtable */, nullptr /*read_callback*/,
4867
4910
  is_blob_index);
4868
4911
 
@@ -4895,8 +4938,8 @@ Status DBImpl::GetLatestSequenceForKey(
4895
4938
  }
4896
4939
 
4897
4940
  // Check if there is a record for this key in the immutable memtables
4898
- sv->imm->Get(lkey, /*value=*/nullptr, timestamp, &s, &merge_context,
4899
- &max_covering_tombstone_seq, seq, read_options,
4941
+ sv->imm->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s,
4942
+ &merge_context, &max_covering_tombstone_seq, seq, read_options,
4900
4943
  nullptr /*read_callback*/, is_blob_index);
4901
4944
 
4902
4945
  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
@@ -4927,9 +4970,10 @@ Status DBImpl::GetLatestSequenceForKey(
4927
4970
  }
4928
4971
 
4929
4972
  // Check if there is a record for this key in the immutable memtables
4930
- sv->imm->GetFromHistory(lkey, /*value=*/nullptr, timestamp, &s,
4931
- &merge_context, &max_covering_tombstone_seq, seq,
4932
- read_options, is_blob_index);
4973
+ sv->imm->GetFromHistory(lkey, /*value=*/nullptr, /*columns=*/nullptr,
4974
+ timestamp, &s, &merge_context,
4975
+ &max_covering_tombstone_seq, seq, read_options,
4976
+ is_blob_index);
4933
4977
 
4934
4978
  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
4935
4979
  // unexpected error reading memtable.
@@ -4962,8 +5006,8 @@ Status DBImpl::GetLatestSequenceForKey(
4962
5006
  if (!cache_only) {
4963
5007
  // Check tables
4964
5008
  PinnedIteratorsManager pinned_iters_mgr;
4965
- sv->current->Get(read_options, lkey, /*value=*/nullptr, timestamp, &s,
4966
- &merge_context, &max_covering_tombstone_seq,
5009
+ sv->current->Get(read_options, lkey, /*value=*/nullptr, /*columns=*/nullptr,
5010
+ timestamp, &s, &merge_context, &max_covering_tombstone_seq,
4967
5011
  &pinned_iters_mgr, nullptr /* value_found */,
4968
5012
  found_record_for_key, seq, nullptr /*read_callback*/,
4969
5013
  is_blob_index);
@@ -238,6 +238,11 @@ class DBImpl : public DB {
238
238
  ColumnFamilyHandle* column_family, const Slice& key,
239
239
  PinnableSlice* value, std::string* timestamp) override;
240
240
 
241
+ using DB::GetEntity;
242
+ Status GetEntity(const ReadOptions& options,
243
+ ColumnFamilyHandle* column_family, const Slice& key,
244
+ PinnableWideColumns* columns) override;
245
+
241
246
  using DB::GetMergeOperands;
242
247
  Status GetMergeOperands(const ReadOptions& options,
243
248
  ColumnFamilyHandle* column_family, const Slice& key,
@@ -592,6 +597,7 @@ class DBImpl : public DB {
592
597
  struct GetImplOptions {
593
598
  ColumnFamilyHandle* column_family = nullptr;
594
599
  PinnableSlice* value = nullptr;
600
+ PinnableWideColumns* columns = nullptr;
595
601
  std::string* timestamp = nullptr;
596
602
  bool* value_found = nullptr;
597
603
  ReadCallback* callback = nullptr;
@@ -85,18 +85,18 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options,
85
85
  SequenceNumber max_covering_tombstone_seq = 0;
86
86
  LookupKey lkey(key, snapshot, read_options.timestamp);
87
87
  PERF_TIMER_STOP(get_snapshot_time);
88
- if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), ts, &s,
89
- &merge_context, &max_covering_tombstone_seq,
90
- read_options, false /* immutable_memtable */,
91
- &read_cb)) {
88
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
89
+ /*columns=*/nullptr, ts, &s, &merge_context,
90
+ &max_covering_tombstone_seq, read_options,
91
+ false /* immutable_memtable */, &read_cb)) {
92
92
  pinnable_val->PinSelf();
93
93
  RecordTick(stats_, MEMTABLE_HIT);
94
94
  } else {
95
95
  PERF_TIMER_GUARD(get_from_output_files_time);
96
96
  PinnedIteratorsManager pinned_iters_mgr;
97
97
  super_version->current->Get(
98
- read_options, lkey, pinnable_val, ts, &s, &merge_context,
99
- &max_covering_tombstone_seq, &pinned_iters_mgr,
98
+ read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s,
99
+ &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
100
100
  /*value_found*/ nullptr,
101
101
  /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb,
102
102
  /*is_blob*/ nullptr,