@nxtedition/rocksdb 7.1.12 → 7.1.13

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51):
  1. package/binding.cc +49 -48
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
  3. package/deps/rocksdb/rocksdb/TARGETS +2 -0
  4. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +60 -17
  5. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +4 -4
  6. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +81 -37
  7. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +6 -0
  8. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -6
  9. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +10 -8
  10. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +14 -9
  11. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +3 -3
  12. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +69 -0
  13. package/deps/rocksdb/rocksdb/db/flush_job.cc +6 -6
  14. package/deps/rocksdb/rocksdb/db/memtable.cc +19 -7
  15. package/deps/rocksdb/rocksdb/db/memtable.h +8 -16
  16. package/deps/rocksdb/rocksdb/db/memtable_list.cc +27 -16
  17. package/deps/rocksdb/rocksdb/db/memtable_list.h +18 -11
  18. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +70 -55
  19. package/deps/rocksdb/rocksdb/db/table_cache.cc +9 -11
  20. package/deps/rocksdb/rocksdb/db/table_cache.h +2 -1
  21. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +3 -3
  22. package/deps/rocksdb/rocksdb/db/version_set.cc +530 -257
  23. package/deps/rocksdb/rocksdb/db/version_set.h +32 -2
  24. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
  25. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +64 -12
  26. package/deps/rocksdb/rocksdb/db/wide/wide_columns.cc +18 -0
  27. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +8 -0
  28. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +13 -1
  29. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -0
  30. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +83 -0
  31. package/deps/rocksdb/rocksdb/options/options.cc +4 -2
  32. package/deps/rocksdb/rocksdb/src.mk +1 -0
  33. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +3 -10
  34. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +5 -4
  35. package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +10 -28
  36. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +4 -4
  37. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +11 -9
  38. package/deps/rocksdb/rocksdb/table/get_context.cc +34 -22
  39. package/deps/rocksdb/rocksdb/table/get_context.h +6 -3
  40. package/deps/rocksdb/rocksdb/table/multiget_context.h +69 -5
  41. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -2
  42. package/deps/rocksdb/rocksdb/table/table_test.cc +8 -8
  43. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +23 -0
  44. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +27 -7
  45. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +8 -4
  46. package/deps/rocksdb/rocksdb.gyp +1 -0
  47. package/index.js +18 -17
  48. package/package.json +1 -1
  49. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  50. package/prebuilds/darwin-x64/node.napi.node +0 -0
  51. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -347,6 +347,7 @@ class FilePicker {
347
347
  return false;
348
348
  }
349
349
  };
350
+ } // anonymous namespace
350
351
 
351
352
  class FilePickerMultiGet {
352
353
  private:
@@ -362,20 +363,21 @@ class FilePickerMultiGet {
362
363
  curr_level_(static_cast<unsigned int>(-1)),
363
364
  returned_file_level_(static_cast<unsigned int>(-1)),
364
365
  hit_file_level_(static_cast<unsigned int>(-1)),
365
- range_(range),
366
- batch_iter_(range->begin()),
367
- batch_iter_prev_(range->begin()),
368
- upper_key_(range->begin()),
366
+ range_(*range, range->begin(), range->end()),
369
367
  maybe_repeat_key_(false),
370
368
  current_level_range_(*range, range->begin(), range->end()),
371
369
  current_file_range_(*range, range->begin(), range->end()),
370
+ batch_iter_(range->begin()),
371
+ batch_iter_prev_(range->begin()),
372
+ upper_key_(range->begin()),
372
373
  level_files_brief_(file_levels),
373
374
  is_hit_file_last_in_level_(false),
374
375
  curr_file_level_(nullptr),
375
376
  file_indexer_(file_indexer),
376
377
  user_comparator_(user_comparator),
377
- internal_comparator_(internal_comparator) {
378
- for (auto iter = range_->begin(); iter != range_->end(); ++iter) {
378
+ internal_comparator_(internal_comparator),
379
+ hit_file_(nullptr) {
380
+ for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
379
381
  fp_ctx_array_[iter.index()] =
380
382
  FilePickerContext(0, FileIndexer::kLevelMaxIndex);
381
383
  }
@@ -391,7 +393,7 @@ class FilePickerMultiGet {
391
393
  for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
392
394
  auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
393
395
  if (r) {
394
- for (auto iter = range_->begin(); iter != range_->end(); ++iter) {
396
+ for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
395
397
  r->Prepare(iter->ikey);
396
398
  }
397
399
  }
@@ -399,8 +401,186 @@ class FilePickerMultiGet {
399
401
  }
400
402
  }
401
403
 
404
+ FilePickerMultiGet(MultiGetRange* range, const FilePickerMultiGet& other)
405
+ : num_levels_(other.num_levels_),
406
+ curr_level_(other.curr_level_),
407
+ returned_file_level_(other.returned_file_level_),
408
+ hit_file_level_(other.hit_file_level_),
409
+ fp_ctx_array_(other.fp_ctx_array_),
410
+ range_(*range, range->begin(), range->end()),
411
+ maybe_repeat_key_(false),
412
+ current_level_range_(*range, range->begin(), range->end()),
413
+ current_file_range_(*range, range->begin(), range->end()),
414
+ batch_iter_(range->begin()),
415
+ batch_iter_prev_(range->begin()),
416
+ upper_key_(range->begin()),
417
+ level_files_brief_(other.level_files_brief_),
418
+ is_hit_file_last_in_level_(false),
419
+ curr_file_level_(other.curr_file_level_),
420
+ file_indexer_(other.file_indexer_),
421
+ user_comparator_(other.user_comparator_),
422
+ internal_comparator_(other.internal_comparator_),
423
+ hit_file_(nullptr) {
424
+ PrepareNextLevelForSearch();
425
+ }
426
+
402
427
  int GetCurrentLevel() const { return curr_level_; }
403
428
 
429
+ void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); }
430
+
431
+ FdWithKeyRange* GetNextFileInLevel() {
432
+ if (batch_iter_ == current_level_range_.end() || search_ended_) {
433
+ hit_file_ = nullptr;
434
+ return nullptr;
435
+ } else {
436
+ if (maybe_repeat_key_) {
437
+ maybe_repeat_key_ = false;
438
+ // Check if we found the final value for the last key in the
439
+ // previous lookup range. If we did, then there's no need to look
440
+ // any further for that key, so advance batch_iter_. Else, keep
441
+ // batch_iter_ positioned on that key so we look it up again in
442
+ // the next file
443
+ // For L0, always advance the key because we will look in the next
444
+ // file regardless for all keys not found yet
445
+ if (current_level_range_.CheckKeyDone(batch_iter_) ||
446
+ curr_level_ == 0) {
447
+ batch_iter_ = upper_key_;
448
+ }
449
+ }
450
+ // batch_iter_prev_ will become the start key for the next file
451
+ // lookup
452
+ batch_iter_prev_ = batch_iter_;
453
+ }
454
+
455
+ MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
456
+ current_level_range_.end());
457
+ size_t curr_file_index =
458
+ (batch_iter_ != current_level_range_.end())
459
+ ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
460
+ : curr_file_level_->num_files;
461
+ FdWithKeyRange* f;
462
+ bool is_last_key_in_file;
463
+ if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
464
+ &is_last_key_in_file)) {
465
+ hit_file_ = nullptr;
466
+ return nullptr;
467
+ } else {
468
+ if (is_last_key_in_file) {
469
+ // Since cmp_largest is 0, batch_iter_ still points to the last key
470
+ // that falls in this file, instead of the next one. Increment
471
+ // the file index for all keys between batch_iter_ and upper_key_
472
+ auto tmp_iter = batch_iter_;
473
+ while (tmp_iter != upper_key_) {
474
+ ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
475
+ ++tmp_iter;
476
+ }
477
+ maybe_repeat_key_ = true;
478
+ }
479
+ // Set the range for this file
480
+ current_file_range_ =
481
+ MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
482
+ returned_file_level_ = curr_level_;
483
+ hit_file_level_ = curr_level_;
484
+ is_hit_file_last_in_level_ =
485
+ curr_file_index == curr_file_level_->num_files - 1;
486
+ hit_file_ = f;
487
+ return f;
488
+ }
489
+ }
490
+
491
+ // getter for current file level
492
+ // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
493
+ unsigned int GetHitFileLevel() { return hit_file_level_; }
494
+
495
+ FdWithKeyRange* GetHitFile() { return hit_file_; }
496
+
497
+ // Returns true if the most recent "hit file" (i.e., one returned by
498
+ // GetNextFile()) is at the last index in its level.
499
+ bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
500
+
501
+ bool KeyMaySpanNextFile() { return maybe_repeat_key_; }
502
+
503
+ bool IsSearchEnded() { return search_ended_; }
504
+
505
+ const MultiGetRange& CurrentFileRange() { return current_file_range_; }
506
+
507
+ bool RemainingOverlapInLevel() {
508
+ return !current_level_range_.Suffix(current_file_range_).empty();
509
+ }
510
+
511
+ MultiGetRange& GetRange() { return range_; }
512
+
513
+ void ReplaceRange(const MultiGetRange& other) {
514
+ range_ = other;
515
+ current_level_range_ = other;
516
+ }
517
+
518
+ FilePickerMultiGet(FilePickerMultiGet&& other)
519
+ : num_levels_(other.num_levels_),
520
+ curr_level_(other.curr_level_),
521
+ returned_file_level_(other.returned_file_level_),
522
+ hit_file_level_(other.hit_file_level_),
523
+ fp_ctx_array_(std::move(other.fp_ctx_array_)),
524
+ range_(std::move(other.range_)),
525
+ maybe_repeat_key_(other.maybe_repeat_key_),
526
+ current_level_range_(std::move(other.current_level_range_)),
527
+ current_file_range_(std::move(other.current_file_range_)),
528
+ batch_iter_(other.batch_iter_, &current_level_range_),
529
+ batch_iter_prev_(other.batch_iter_prev_, &current_level_range_),
530
+ upper_key_(other.upper_key_, &current_level_range_),
531
+ level_files_brief_(other.level_files_brief_),
532
+ search_ended_(other.search_ended_),
533
+ is_hit_file_last_in_level_(other.is_hit_file_last_in_level_),
534
+ curr_file_level_(other.curr_file_level_),
535
+ file_indexer_(other.file_indexer_),
536
+ user_comparator_(other.user_comparator_),
537
+ internal_comparator_(other.internal_comparator_),
538
+ hit_file_(other.hit_file_) {}
539
+
540
+ private:
541
+ unsigned int num_levels_;
542
+ unsigned int curr_level_;
543
+ unsigned int returned_file_level_;
544
+ unsigned int hit_file_level_;
545
+
546
+ struct FilePickerContext {
547
+ int32_t search_left_bound;
548
+ int32_t search_right_bound;
549
+ unsigned int curr_index_in_curr_level;
550
+ unsigned int start_index_in_curr_level;
551
+
552
+ FilePickerContext(int32_t left, int32_t right)
553
+ : search_left_bound(left),
554
+ search_right_bound(right),
555
+ curr_index_in_curr_level(0),
556
+ start_index_in_curr_level(0) {}
557
+
558
+ FilePickerContext() = default;
559
+ };
560
+ std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
561
+ MultiGetRange range_;
562
+ bool maybe_repeat_key_;
563
+ MultiGetRange current_level_range_;
564
+ MultiGetRange current_file_range_;
565
+ // Iterator to iterate through the keys in a MultiGet batch, that gets reset
566
+ // at the beginning of each level. Each call to GetNextFile() will position
567
+ // batch_iter_ at or right after the last key that was found in the returned
568
+ // SST file
569
+ MultiGetRange::Iterator batch_iter_;
570
+ // An iterator that records the previous position of batch_iter_, i.e last
571
+ // key found in the previous SST file, in order to serve as the start of
572
+ // the batch key range for the next SST file
573
+ MultiGetRange::Iterator batch_iter_prev_;
574
+ MultiGetRange::Iterator upper_key_;
575
+ autovector<LevelFilesBrief>* level_files_brief_;
576
+ bool search_ended_;
577
+ bool is_hit_file_last_in_level_;
578
+ LevelFilesBrief* curr_file_level_;
579
+ FileIndexer* file_indexer_;
580
+ const Comparator* user_comparator_;
581
+ const InternalKeyComparator* internal_comparator_;
582
+ FdWithKeyRange* hit_file_;
583
+
404
584
  // Iterates through files in the current level until it finds a file that
405
585
  // contains at least one key from the MultiGet batch
406
586
  bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
@@ -524,124 +704,6 @@ class FilePickerMultiGet {
524
704
  return file_hit;
525
705
  }
526
706
 
527
- void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); }
528
-
529
- FdWithKeyRange* GetNextFileInLevel() {
530
- if (batch_iter_ == current_level_range_.end() || search_ended_) {
531
- return nullptr;
532
- } else {
533
- if (maybe_repeat_key_) {
534
- maybe_repeat_key_ = false;
535
- // Check if we found the final value for the last key in the
536
- // previous lookup range. If we did, then there's no need to look
537
- // any further for that key, so advance batch_iter_. Else, keep
538
- // batch_iter_ positioned on that key so we look it up again in
539
- // the next file
540
- // For L0, always advance the key because we will look in the next
541
- // file regardless for all keys not found yet
542
- if (current_level_range_.CheckKeyDone(batch_iter_) ||
543
- curr_level_ == 0) {
544
- batch_iter_ = upper_key_;
545
- }
546
- }
547
- // batch_iter_prev_ will become the start key for the next file
548
- // lookup
549
- batch_iter_prev_ = batch_iter_;
550
- }
551
-
552
- MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
553
- current_level_range_.end());
554
- size_t curr_file_index =
555
- (batch_iter_ != current_level_range_.end())
556
- ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
557
- : curr_file_level_->num_files;
558
- FdWithKeyRange* f;
559
- bool is_last_key_in_file;
560
- if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
561
- &is_last_key_in_file)) {
562
- return nullptr;
563
- } else {
564
- if (is_last_key_in_file) {
565
- // Since cmp_largest is 0, batch_iter_ still points to the last key
566
- // that falls in this file, instead of the next one. Increment
567
- // the file index for all keys between batch_iter_ and upper_key_
568
- auto tmp_iter = batch_iter_;
569
- while (tmp_iter != upper_key_) {
570
- ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
571
- ++tmp_iter;
572
- }
573
- maybe_repeat_key_ = true;
574
- }
575
- // Set the range for this file
576
- current_file_range_ =
577
- MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
578
- returned_file_level_ = curr_level_;
579
- hit_file_level_ = curr_level_;
580
- is_hit_file_last_in_level_ =
581
- curr_file_index == curr_file_level_->num_files - 1;
582
- return f;
583
- }
584
- }
585
-
586
- // getter for current file level
587
- // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
588
- unsigned int GetHitFileLevel() { return hit_file_level_; }
589
-
590
- // Returns true if the most recent "hit file" (i.e., one returned by
591
- // GetNextFile()) is at the last index in its level.
592
- bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
593
-
594
- bool KeyMaySpanNextFile() { return maybe_repeat_key_; }
595
-
596
- bool IsSearchEnded() { return search_ended_; }
597
-
598
- const MultiGetRange& CurrentFileRange() { return current_file_range_; }
599
-
600
- bool RemainingOverlapInLevel() {
601
- return !current_level_range_.Suffix(current_file_range_).empty();
602
- }
603
-
604
- private:
605
- unsigned int num_levels_;
606
- unsigned int curr_level_;
607
- unsigned int returned_file_level_;
608
- unsigned int hit_file_level_;
609
-
610
- struct FilePickerContext {
611
- int32_t search_left_bound;
612
- int32_t search_right_bound;
613
- unsigned int curr_index_in_curr_level;
614
- unsigned int start_index_in_curr_level;
615
-
616
- FilePickerContext(int32_t left, int32_t right)
617
- : search_left_bound(left), search_right_bound(right),
618
- curr_index_in_curr_level(0), start_index_in_curr_level(0) {}
619
-
620
- FilePickerContext() = default;
621
- };
622
- std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
623
- MultiGetRange* range_;
624
- // Iterator to iterate through the keys in a MultiGet batch, that gets reset
625
- // at the beginning of each level. Each call to GetNextFile() will position
626
- // batch_iter_ at or right after the last key that was found in the returned
627
- // SST file
628
- MultiGetRange::Iterator batch_iter_;
629
- // An iterator that records the previous position of batch_iter_, i.e last
630
- // key found in the previous SST file, in order to serve as the start of
631
- // the batch key range for the next SST file
632
- MultiGetRange::Iterator batch_iter_prev_;
633
- MultiGetRange::Iterator upper_key_;
634
- bool maybe_repeat_key_;
635
- MultiGetRange current_level_range_;
636
- MultiGetRange current_file_range_;
637
- autovector<LevelFilesBrief>* level_files_brief_;
638
- bool search_ended_;
639
- bool is_hit_file_last_in_level_;
640
- LevelFilesBrief* curr_file_level_;
641
- FileIndexer* file_indexer_;
642
- const Comparator* user_comparator_;
643
- const InternalKeyComparator* internal_comparator_;
644
-
645
707
  // Setup local variables to search next level.
646
708
  // Returns false if there are no more levels to search.
647
709
  bool PrepareNextLevel() {
@@ -692,7 +754,7 @@ class FilePickerMultiGet {
692
754
  // are always compacted into a single entry).
693
755
  int32_t start_index = -1;
694
756
  current_level_range_ =
695
- MultiGetRange(*range_, range_->begin(), range_->end());
757
+ MultiGetRange(range_, range_.begin(), range_.end());
696
758
  for (auto mget_iter = current_level_range_.begin();
697
759
  mget_iter != current_level_range_.end(); ++mget_iter) {
698
760
  struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
@@ -754,7 +816,6 @@ class FilePickerMultiGet {
754
816
  return false;
755
817
  }
756
818
  };
757
- } // anonymous namespace
758
819
 
759
820
  VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
760
821
 
@@ -1969,7 +2030,8 @@ void Version::MultiGetBlob(
1969
2030
  }
1970
2031
 
1971
2032
  void Version::Get(const ReadOptions& read_options, const LookupKey& k,
1972
- PinnableSlice* value, std::string* timestamp, Status* status,
2033
+ PinnableSlice* value, PinnableWideColumns* columns,
2034
+ std::string* timestamp, Status* status,
1973
2035
  MergeContext* merge_context,
1974
2036
  SequenceNumber* max_covering_tombstone_seq,
1975
2037
  PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
@@ -2002,8 +2064,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
2002
2064
  GetContext get_context(
2003
2065
  user_comparator(), merge_operator_, info_log_, db_statistics_,
2004
2066
  status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
2005
- do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found,
2006
- merge_context, do_merge, max_covering_tombstone_seq, clock_, seq,
2067
+ do_merge ? value : nullptr, do_merge ? columns : nullptr,
2068
+ do_merge ? timestamp : nullptr, value_found, merge_context, do_merge,
2069
+ max_covering_tombstone_seq, clock_, seq,
2007
2070
  merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
2008
2071
  tracing_get_id, &blob_fetcher);
2009
2072
 
@@ -2171,9 +2234,10 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2171
2234
  get_ctx.emplace_back(
2172
2235
  user_comparator(), merge_operator_, info_log_, db_statistics_,
2173
2236
  iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge,
2174
- iter->ukey_with_ts, iter->value, iter->timestamp, nullptr,
2175
- &(iter->merge_context), true, &iter->max_covering_tombstone_seq, clock_,
2176
- nullptr, merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
2237
+ iter->ukey_with_ts, iter->value, /*columns=*/nullptr, iter->timestamp,
2238
+ nullptr, &(iter->merge_context), true,
2239
+ &iter->max_covering_tombstone_seq, clock_, nullptr,
2240
+ merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
2177
2241
  &iter->is_blob_index, tracing_mget_id, &blob_fetcher);
2178
2242
  // MergeInProgress status, if set, has been transferred to the get_context
2179
2243
  // state, so we set status to ok here. From now on, the iter status will
@@ -2187,148 +2251,162 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2187
2251
  iter->get_context = &(get_ctx[get_ctx_index]);
2188
2252
  }
2189
2253
 
2190
- MultiGetRange file_picker_range(*range, range->begin(), range->end());
2191
- FilePickerMultiGet fp(
2192
- &file_picker_range,
2193
- &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_,
2194
- &storage_info_.file_indexer_, user_comparator(), internal_comparator());
2195
- FdWithKeyRange* f = fp.GetNextFileInLevel();
2196
2254
  Status s;
2197
- uint64_t num_index_read = 0;
2198
- uint64_t num_filter_read = 0;
2199
- uint64_t num_sst_read = 0;
2200
- uint64_t num_level_read = 0;
2201
-
2202
- MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
2203
2255
  // blob_file => [[blob_idx, it], ...]
2204
2256
  std::unordered_map<uint64_t, BlobReadContexts> blob_ctxs;
2205
- int prev_level = -1;
2206
-
2207
- while (!fp.IsSearchEnded()) {
2208
- // This will be set to true later if we actually look up in a file in L0.
2209
- // For per level stats purposes, an L0 file is treated as a level
2210
- bool dump_stats_for_l0_file = false;
2211
-
2212
- // Avoid using the coroutine version if we're looking in a L0 file, since
2213
- // L0 files won't be parallelized anyway. The regular synchronous version
2214
- // is faster.
2215
- if (!read_options.async_io || !using_coroutines() ||
2216
- fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
2217
- if (f) {
2218
- bool skip_filters = IsFilterSkipped(
2219
- static_cast<int>(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel());
2220
- // Call MultiGetFromSST for looking up a single file
2221
- s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
2222
- fp.GetHitFileLevel(), skip_filters, f, blob_ctxs,
2223
- /*table_handle=*/nullptr, num_filter_read,
2224
- num_index_read, num_sst_read);
2225
- if (fp.GetHitFileLevel() == 0) {
2226
- dump_stats_for_l0_file = true;
2227
- }
2228
- }
2229
- if (s.ok()) {
2230
- f = fp.GetNextFileInLevel();
2231
- }
2257
+ MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
2232
2258
  #if USE_COROUTINES
2233
- } else {
2234
- std::vector<folly::coro::Task<Status>> mget_tasks;
2235
- while (f != nullptr) {
2236
- MultiGetRange file_range = fp.CurrentFileRange();
2237
- Cache::Handle* table_handle = nullptr;
2238
- bool skip_filters = IsFilterSkipped(
2239
- static_cast<int>(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel());
2240
- if (!skip_filters) {
2241
- Status status = table_cache_->MultiGetFilter(
2242
- read_options, *internal_comparator(), *f->file_metadata,
2243
- mutable_cf_options_.prefix_extractor,
2244
- cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
2245
- fp.GetHitFileLevel(), &file_range, &table_handle);
2246
- if (status.ok()) {
2247
- skip_filters = true;
2248
- } else if (!status.IsNotSupported()) {
2249
- s = status;
2259
+ if (read_options.async_io && read_options.optimize_multiget_for_io &&
2260
+ using_coroutines()) {
2261
+ s = MultiGetAsync(read_options, range, &blob_ctxs);
2262
+ } else
2263
+ #endif // USE_COROUTINES
2264
+ {
2265
+ MultiGetRange file_picker_range(*range, range->begin(), range->end());
2266
+ FilePickerMultiGet fp(&file_picker_range, &storage_info_.level_files_brief_,
2267
+ storage_info_.num_non_empty_levels_,
2268
+ &storage_info_.file_indexer_, user_comparator(),
2269
+ internal_comparator());
2270
+ FdWithKeyRange* f = fp.GetNextFileInLevel();
2271
+ uint64_t num_index_read = 0;
2272
+ uint64_t num_filter_read = 0;
2273
+ uint64_t num_sst_read = 0;
2274
+ uint64_t num_level_read = 0;
2275
+
2276
+ int prev_level = -1;
2277
+
2278
+ while (!fp.IsSearchEnded()) {
2279
+ // This will be set to true later if we actually look up in a file in L0.
2280
+ // For per level stats purposes, an L0 file is treated as a level
2281
+ bool dump_stats_for_l0_file = false;
2282
+
2283
+ // Avoid using the coroutine version if we're looking in a L0 file, since
2284
+ // L0 files won't be parallelized anyway. The regular synchronous version
2285
+ // is faster.
2286
+ if (!read_options.async_io || !using_coroutines() ||
2287
+ fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
2288
+ if (f) {
2289
+ bool skip_filters =
2290
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
2291
+ fp.IsHitFileLastInLevel());
2292
+ // Call MultiGetFromSST for looking up a single file
2293
+ s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
2294
+ fp.GetHitFileLevel(), skip_filters,
2295
+ /*skip_range_deletions=*/false, f, blob_ctxs,
2296
+ /*table_handle=*/nullptr, num_filter_read,
2297
+ num_index_read, num_sst_read);
2298
+ if (fp.GetHitFileLevel() == 0) {
2299
+ dump_stats_for_l0_file = true;
2250
2300
  }
2251
2301
  }
2252
-
2253
- if (!s.ok()) {
2254
- break;
2302
+ if (s.ok()) {
2303
+ f = fp.GetNextFileInLevel();
2255
2304
  }
2305
+ #if USE_COROUTINES
2306
+ } else {
2307
+ std::vector<folly::coro::Task<Status>> mget_tasks;
2308
+ while (f != nullptr) {
2309
+ MultiGetRange file_range = fp.CurrentFileRange();
2310
+ Cache::Handle* table_handle = nullptr;
2311
+ bool skip_filters =
2312
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
2313
+ fp.IsHitFileLastInLevel());
2314
+ bool skip_range_deletions = false;
2315
+ if (!skip_filters) {
2316
+ Status status = table_cache_->MultiGetFilter(
2317
+ read_options, *internal_comparator(), *f->file_metadata,
2318
+ mutable_cf_options_.prefix_extractor,
2319
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
2320
+ fp.GetHitFileLevel(), &file_range, &table_handle);
2321
+ skip_range_deletions = true;
2322
+ if (status.ok()) {
2323
+ skip_filters = true;
2324
+ } else if (!status.IsNotSupported()) {
2325
+ s = status;
2326
+ }
2327
+ }
2256
2328
 
2257
- if (!file_range.empty()) {
2258
- mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
2259
- read_options, file_range, fp.GetHitFileLevel(), skip_filters, f,
2260
- blob_ctxs, table_handle, num_filter_read, num_index_read,
2261
- num_sst_read));
2262
- }
2263
- if (fp.KeyMaySpanNextFile()) {
2264
- break;
2265
- }
2266
- f = fp.GetNextFileInLevel();
2267
- }
2268
- if (s.ok() && mget_tasks.size() > 0) {
2269
- RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
2270
- // Collect all results so far
2271
- std::vector<Status> statuses = folly::coro::blockingWait(
2272
- folly::coro::collectAllRange(std::move(mget_tasks))
2273
- .scheduleOn(&range->context()->executor()));
2274
- for (Status stat : statuses) {
2275
- if (!stat.ok()) {
2276
- s = stat;
2329
+ if (!s.ok()) {
2330
+ break;
2277
2331
  }
2278
- }
2279
2332
 
2280
- if (s.ok() && fp.KeyMaySpanNextFile()) {
2333
+ if (!file_range.empty()) {
2334
+ mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
2335
+ read_options, file_range, fp.GetHitFileLevel(), skip_filters,
2336
+ skip_range_deletions, f, blob_ctxs, table_handle,
2337
+ num_filter_read, num_index_read, num_sst_read));
2338
+ }
2339
+ if (fp.KeyMaySpanNextFile()) {
2340
+ break;
2341
+ }
2281
2342
  f = fp.GetNextFileInLevel();
2282
2343
  }
2283
- }
2344
+ if (s.ok() && mget_tasks.size() > 0) {
2345
+ RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
2346
+ mget_tasks.size());
2347
+ // Collect all results so far
2348
+ std::vector<Status> statuses = folly::coro::blockingWait(
2349
+ folly::coro::collectAllRange(std::move(mget_tasks))
2350
+ .scheduleOn(&range->context()->executor()));
2351
+ for (Status stat : statuses) {
2352
+ if (!stat.ok()) {
2353
+ s = stat;
2354
+ }
2355
+ }
2356
+
2357
+ if (s.ok() && fp.KeyMaySpanNextFile()) {
2358
+ f = fp.GetNextFileInLevel();
2359
+ }
2360
+ }
2284
2361
  #endif // USE_COROUTINES
2285
- }
2286
- // If bad status or we found final result for all the keys
2287
- if (!s.ok() || file_picker_range.empty()) {
2288
- break;
2289
- }
2290
- if (!f) {
2291
- // Reached the end of this level. Prepare the next level
2292
- fp.PrepareNextLevelForSearch();
2293
- if (!fp.IsSearchEnded()) {
2294
- // Its possible there is no overlap on this level and f is nullptr
2295
- f = fp.GetNextFileInLevel();
2296
- }
2297
- if (dump_stats_for_l0_file ||
2298
- (prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
2299
- // Dump the stats if the search has moved to the next level and
2300
- // reset for next level.
2301
- if (num_filter_read + num_index_read) {
2302
- RecordInHistogram(db_statistics_,
2303
- NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2304
- num_index_read + num_filter_read);
2362
+ }
2363
+ // If bad status or we found final result for all the keys
2364
+ if (!s.ok() || file_picker_range.empty()) {
2365
+ break;
2366
+ }
2367
+ if (!f) {
2368
+ // Reached the end of this level. Prepare the next level
2369
+ fp.PrepareNextLevelForSearch();
2370
+ if (!fp.IsSearchEnded()) {
2371
+ // Its possible there is no overlap on this level and f is nullptr
2372
+ f = fp.GetNextFileInLevel();
2305
2373
  }
2306
- if (num_sst_read) {
2307
- RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
2308
- num_sst_read);
2309
- num_level_read++;
2374
+ if (dump_stats_for_l0_file ||
2375
+ (prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
2376
+ // Dump the stats if the search has moved to the next level and
2377
+ // reset for next level.
2378
+ if (num_filter_read + num_index_read) {
2379
+ RecordInHistogram(db_statistics_,
2380
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2381
+ num_index_read + num_filter_read);
2382
+ }
2383
+ if (num_sst_read) {
2384
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
2385
+ num_sst_read);
2386
+ num_level_read++;
2387
+ }
2388
+ num_filter_read = 0;
2389
+ num_index_read = 0;
2390
+ num_sst_read = 0;
2310
2391
  }
2311
- num_filter_read = 0;
2312
- num_index_read = 0;
2313
- num_sst_read = 0;
2392
+ prev_level = fp.GetHitFileLevel();
2314
2393
  }
2315
- prev_level = fp.GetHitFileLevel();
2316
2394
  }
2317
- }
2318
2395
 
2319
- // Dump stats for most recent level
2320
- if (num_filter_read + num_index_read) {
2321
- RecordInHistogram(db_statistics_,
2322
- NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2323
- num_index_read + num_filter_read);
2324
- }
2325
- if (num_sst_read) {
2326
- RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
2327
- num_level_read++;
2328
- }
2329
- if (num_level_read) {
2330
- RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
2331
- num_level_read);
2396
+ // Dump stats for most recent level
2397
+ if (num_filter_read + num_index_read) {
2398
+ RecordInHistogram(db_statistics_,
2399
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2400
+ num_index_read + num_filter_read);
2401
+ }
2402
+ if (num_sst_read) {
2403
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
2404
+ num_level_read++;
2405
+ }
2406
+ if (num_level_read) {
2407
+ RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
2408
+ num_level_read);
2409
+ }
2332
2410
  }
2333
2411
 
2334
2412
  if (s.ok() && !blob_ctxs.empty()) {
@@ -2380,6 +2458,201 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2380
2458
  }
2381
2459
  }
2382
2460
 
2461
+ #ifdef USE_COROUTINES
2462
+ Status Version::ProcessBatch(
2463
+ const ReadOptions& read_options, FilePickerMultiGet* batch,
2464
+ std::vector<folly::coro::Task<Status>>& mget_tasks,
2465
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
2466
+ autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
2467
+ std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
2468
+ uint64_t& num_filter_read, uint64_t& num_index_read,
2469
+ uint64_t& num_sst_read) {
2470
+ FilePickerMultiGet& fp = *batch;
2471
+ MultiGetRange range = fp.GetRange();
2472
+ // Initialize a new empty range. Any keys that are not in this level will
2473
+ // eventually become part of the new range.
2474
+ MultiGetRange leftover(range, range.begin(), range.begin());
2475
+ FdWithKeyRange* f = nullptr;
2476
+ Status s;
2477
+
2478
+ f = fp.GetNextFileInLevel();
2479
+ while (!f) {
2480
+ fp.PrepareNextLevelForSearch();
2481
+ if (!fp.IsSearchEnded()) {
2482
+ f = fp.GetNextFileInLevel();
2483
+ } else {
2484
+ break;
2485
+ }
2486
+ }
2487
+ while (f) {
2488
+ MultiGetRange file_range = fp.CurrentFileRange();
2489
+ Cache::Handle* table_handle = nullptr;
2490
+ bool skip_filters = IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
2491
+ fp.IsHitFileLastInLevel());
2492
+ bool skip_range_deletions = false;
2493
+ if (!skip_filters) {
2494
+ Status status = table_cache_->MultiGetFilter(
2495
+ read_options, *internal_comparator(), *f->file_metadata,
2496
+ mutable_cf_options_.prefix_extractor,
2497
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
2498
+ fp.GetHitFileLevel(), &file_range, &table_handle);
2499
+ if (status.ok()) {
2500
+ skip_filters = true;
2501
+ skip_range_deletions = true;
2502
+ } else if (!status.IsNotSupported()) {
2503
+ s = status;
2504
+ }
2505
+ }
2506
+ if (!s.ok()) {
2507
+ break;
2508
+ }
2509
+ // At this point, file_range contains any keys that are likely in this
2510
+ // file. It may have false positives, but that's ok since higher level
2511
+ // lookups for the key are dependent on this lookup anyway.
2512
+ // Add the complement of file_range to leftover. That's the set of keys
2513
+ // definitely not in this level.
2514
+ // Subtract the complement of file_range from range, since they will be
2515
+ // processed in a separate batch in parallel.
2516
+ leftover += ~file_range;
2517
+ range -= ~file_range;
2518
+ if (!file_range.empty()) {
2519
+ if (waiting.empty() && to_process.empty() &&
2520
+ !fp.RemainingOverlapInLevel() && leftover.empty() &&
2521
+ mget_tasks.empty()) {
2522
+ // All keys are in one SST file, so take the fast path
2523
+ s = MultiGetFromSST(read_options, file_range, fp.GetHitFileLevel(),
2524
+ skip_filters, skip_range_deletions, f, *blob_ctxs,
2525
+ table_handle, num_filter_read, num_index_read,
2526
+ num_sst_read);
2527
+ } else {
2528
+ mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
2529
+ read_options, file_range, fp.GetHitFileLevel(), skip_filters,
2530
+ skip_range_deletions, f, *blob_ctxs, table_handle, num_filter_read,
2531
+ num_index_read, num_sst_read));
2532
+ ++num_tasks_queued;
2533
+ }
2534
+ }
2535
+ if (fp.KeyMaySpanNextFile() && !file_range.empty()) {
2536
+ break;
2537
+ }
2538
+ f = fp.GetNextFileInLevel();
2539
+ }
2540
+ // Split the current batch only if some keys are likely in this level and
2541
+ // some are not.
2542
+ if (s.ok() && !leftover.empty() && !range.empty()) {
2543
+ fp.ReplaceRange(range);
2544
+ batches.emplace_back(&leftover, fp);
2545
+ to_process.emplace_back(batches.size() - 1);
2546
+ }
2547
+ // 1. If f is non-null, that means we might not be done with this level.
2548
+ // This can happen if one of the keys is the last key in the file, i.e
2549
+ // fp.KeyMaySpanNextFile() is true.
2550
+ // 2. If range is empty, then we're done with this range and no need to
2551
+ // prepare the next level
2552
+ // 3. If some tasks were queued for this range, then the next level will be
2553
+ // prepared after executing those tasks
2554
+ if (!f && !range.empty() && !num_tasks_queued) {
2555
+ fp.PrepareNextLevelForSearch();
2556
+ }
2557
+ return s;
2558
+ }
2559
+
2560
+ Status Version::MultiGetAsync(
2561
+ const ReadOptions& options, MultiGetRange* range,
2562
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs) {
2563
+ autovector<FilePickerMultiGet, 4> batches;
2564
+ std::deque<size_t> waiting;
2565
+ std::deque<size_t> to_process;
2566
+ Status s;
2567
+ std::vector<folly::coro::Task<Status>> mget_tasks;
2568
+ uint64_t num_filter_read = 0;
2569
+ uint64_t num_index_read = 0;
2570
+ uint64_t num_sst_read = 0;
2571
+
2572
+ // Create the initial batch with the input range
2573
+ batches.emplace_back(range, &storage_info_.level_files_brief_,
2574
+ storage_info_.num_non_empty_levels_,
2575
+ &storage_info_.file_indexer_, user_comparator(),
2576
+ internal_comparator());
2577
+ to_process.emplace_back(0);
2578
+
2579
+ while (!to_process.empty()) {
2580
+ size_t idx = to_process.front();
2581
+ FilePickerMultiGet* batch = &batches.at(idx);
2582
+ unsigned int num_tasks_queued = 0;
2583
+ to_process.pop_front();
2584
+ if (batch->IsSearchEnded() || batch->GetRange().empty()) {
2585
+ if (!to_process.empty()) {
2586
+ continue;
2587
+ }
2588
+ } else {
2589
+ // Look through one level. This may split the batch and enqueue it to
2590
+ // to_process
2591
+ s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
2592
+ to_process, num_tasks_queued, num_filter_read,
2593
+ num_index_read, num_sst_read);
2594
+ if (!s.ok()) {
2595
+ break;
2596
+ }
2597
+ // Dump the stats since the search has moved to the next level
2598
+ if (num_filter_read + num_index_read) {
2599
+ RecordInHistogram(db_statistics_,
2600
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
2601
+ num_index_read + num_filter_read);
2602
+ }
2603
+ if (num_sst_read) {
2604
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
2605
+ }
2606
+ // If ProcessBatch didn't enqueue any coroutine tasks, it means all
2607
+ // keys were filtered out. So put the batch back in to_process to
2608
+ // lookup in the next level
2609
+ if (!num_tasks_queued && !batch->IsSearchEnded()) {
2610
+ // Put this back in the processing queue
2611
+ to_process.emplace_back(idx);
2612
+ } else if (num_tasks_queued) {
2613
+ waiting.emplace_back(idx);
2614
+ }
2615
+ }
2616
+ if (to_process.empty()) {
2617
+ if (s.ok() && mget_tasks.size() > 0) {
2618
+ assert(waiting.size());
2619
+ RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
2620
+ // Collect all results so far
2621
+ std::vector<Status> statuses = folly::coro::blockingWait(
2622
+ folly::coro::collectAllRange(std::move(mget_tasks))
2623
+ .scheduleOn(&range->context()->executor()));
2624
+ for (Status stat : statuses) {
2625
+ if (!stat.ok()) {
2626
+ s = stat;
2627
+ break;
2628
+ }
2629
+ }
2630
+
2631
+ if (!s.ok()) {
2632
+ break;
2633
+ }
2634
+
2635
+ for (size_t wait_idx : waiting) {
2636
+ FilePickerMultiGet& fp = batches.at(wait_idx);
2637
+ // 1. If fp.GetHitFile() is non-null, then there could be more
2638
+ // overlap in this level. So skip preparing next level.
2639
+ // 2. If fp.GetRange() is empty, then this batch is completed
2640
+ // and no need to prepare the next level.
2641
+ if (!fp.GetHitFile() && !fp.GetRange().empty()) {
2642
+ fp.PrepareNextLevelForSearch();
2643
+ }
2644
+ }
2645
+ to_process.swap(waiting);
2646
+ } else {
2647
+ assert(!s.ok() || waiting.size() == 0);
2648
+ }
2649
+ }
2650
+ }
2651
+
2652
+ return s;
2653
+ }
2654
+ #endif
2655
+
2383
2656
  bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
2384
2657
  // Reaching the bottom level implies misses at all upper levels, so we'll
2385
2658
  // skip checking the filters when we predict a hit.