@nxtedition/rocksdb 7.0.23 → 7.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. package/binding.cc +3 -1
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +5 -0
  3. package/deps/rocksdb/rocksdb/Makefile +6 -2
  4. package/deps/rocksdb/rocksdb/TARGETS +14 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +4 -1
  6. package/deps/rocksdb/rocksdb/cache/cache_helpers.h +20 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +2 -2
  8. package/deps/rocksdb/rocksdb/cache/cache_test.cc +44 -31
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +491 -722
  10. package/deps/rocksdb/rocksdb/cache/clock_cache.h +468 -2
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -1
  12. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +51 -52
  13. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +28 -16
  14. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +12 -1
  15. package/deps/rocksdb/rocksdb/cache/lru_cache.h +1 -0
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +170 -36
  17. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +63 -36
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +4 -6
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +57 -38
  21. package/deps/rocksdb/rocksdb/db/blob/blob_read_request.h +58 -0
  22. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +164 -74
  23. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +42 -29
  24. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +419 -62
  25. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +208 -8
  26. package/deps/rocksdb/rocksdb/db/c.cc +68 -0
  27. package/deps/rocksdb/rocksdb/db/c_test.c +95 -2
  28. package/deps/rocksdb/rocksdb/db/column_family.cc +12 -3
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +92 -15
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +76 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +52 -1
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +30 -1
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +126 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +203 -1584
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +93 -26
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +87 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +314 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +328 -0
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -6
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +4 -1
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +7 -3
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +174 -33
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +474 -7
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -2
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +825 -0
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +46 -0
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +42 -0
  48. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +223 -0
  49. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +255 -0
  50. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +1253 -0
  51. package/deps/rocksdb/rocksdb/db/corruption_test.cc +32 -8
  52. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +3 -1
  53. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +13 -8
  54. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +376 -0
  55. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +103 -78
  56. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +4 -6
  57. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +0 -8
  58. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +10 -3
  59. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +21 -6
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +19 -1
  61. package/deps/rocksdb/rocksdb/db/db_iter.cc +91 -14
  62. package/deps/rocksdb/rocksdb/db/db_iter.h +5 -0
  63. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +33 -0
  64. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +79 -0
  65. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +2 -0
  66. package/deps/rocksdb/rocksdb/db/db_test2.cc +1 -1
  67. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +5 -2
  68. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +185 -0
  69. package/deps/rocksdb/rocksdb/db/dbformat.cc +1 -4
  70. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -8
  71. package/deps/rocksdb/rocksdb/db/internal_stats.cc +71 -29
  72. package/deps/rocksdb/rocksdb/db/internal_stats.h +160 -5
  73. package/deps/rocksdb/rocksdb/db/log_reader.cc +29 -3
  74. package/deps/rocksdb/rocksdb/db/log_reader.h +12 -3
  75. package/deps/rocksdb/rocksdb/db/repair_test.cc +1 -3
  76. package/deps/rocksdb/rocksdb/db/version_edit.cc +6 -0
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +93 -129
  78. package/deps/rocksdb/rocksdb/db/version_set.h +4 -4
  79. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
  80. package/deps/rocksdb/rocksdb/db/version_set_test.cc +42 -35
  81. package/deps/rocksdb/rocksdb/db/write_batch.cc +10 -2
  82. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +10 -4
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -3
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +3 -2
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -0
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +5 -1
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +140 -8
  89. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +12 -0
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +46 -7
  91. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +7 -0
  92. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +27 -7
  93. package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +8 -0
  94. package/deps/rocksdb/rocksdb/env/env_posix.cc +14 -0
  95. package/deps/rocksdb/rocksdb/env/env_test.cc +130 -1
  96. package/deps/rocksdb/rocksdb/env/fs_posix.cc +7 -1
  97. package/deps/rocksdb/rocksdb/env/io_posix.cc +18 -50
  98. package/deps/rocksdb/rocksdb/env/io_posix.h +53 -6
  99. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +8 -10
  100. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +3 -7
  101. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +239 -259
  102. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +84 -19
  103. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +24 -4
  104. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +1 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +31 -1
  106. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +11 -7
  107. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +2 -0
  108. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +14 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +20 -0
  110. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +37 -13
  111. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +7 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +14 -0
  113. package/deps/rocksdb/rocksdb/include/rocksdb/threadpool.h +9 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +13 -13
  115. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +12 -2
  116. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +38 -0
  117. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +7 -1
  118. package/deps/rocksdb/rocksdb/port/win/env_win.cc +17 -0
  119. package/deps/rocksdb/rocksdb/port/win/env_win.h +8 -0
  120. package/deps/rocksdb/rocksdb/port/win/io_win.cc +6 -3
  121. package/deps/rocksdb/rocksdb/src.mk +5 -0
  122. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -2
  123. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1 -1
  124. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +5 -2
  125. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
  126. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +15 -12
  127. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +5 -4
  128. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +2 -1
  129. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
  131. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +1 -2
  132. package/deps/rocksdb/rocksdb/table/get_context.cc +1 -0
  133. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -2
  134. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +24 -4
  135. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
  136. package/deps/rocksdb/rocksdb/util/compression.h +2 -0
  137. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +18 -1
  138. package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +67 -4
  139. package/deps/rocksdb/rocksdb/util/threadpool_imp.h +8 -0
  140. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +15 -12
  141. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -2
  142. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +1 -1
  143. package/deps/rocksdb/rocksdb.gyp +5 -1
  144. package/package.json +1 -1
  145. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  146. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -446,29 +446,17 @@ IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro,
446
446
  }
447
447
  }
448
448
 
449
- // TODO akanksha:
450
- // 1. Handle use_direct_io case which currently calls Read API.
451
449
  IOStatus RandomAccessFileReader::ReadAsync(
452
450
  FSReadRequest& req, const IOOptions& opts,
453
451
  std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
454
- void** io_handle, IOHandleDeleter* del_fn,
455
- Env::IOPriority rate_limiter_priority) {
456
- if (use_direct_io()) {
457
- // For direct_io, it calls Read API.
458
- req.status = Read(opts, req.offset, req.len, &(req.result), req.scratch,
459
- nullptr /*dbg*/, rate_limiter_priority);
460
- cb(req, cb_arg);
461
- return IOStatus::OK();
462
- }
463
-
452
+ void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) {
453
+ IOStatus s;
464
454
  // Create a callback and populate info.
465
455
  auto read_async_callback =
466
456
  std::bind(&RandomAccessFileReader::ReadAsyncCallback, this,
467
457
  std::placeholders::_1, std::placeholders::_2);
468
- ReadAsyncInfo* read_async_info = new ReadAsyncInfo;
469
- read_async_info->cb_ = cb;
470
- read_async_info->cb_arg_ = cb_arg;
471
- read_async_info->start_time_ = clock_->NowMicros();
458
+ ReadAsyncInfo* read_async_info =
459
+ new ReadAsyncInfo(cb, cb_arg, clock_->NowMicros());
472
460
 
473
461
  #ifndef ROCKSDB_LITE
474
462
  if (ShouldNotifyListeners()) {
@@ -476,8 +464,38 @@ IOStatus RandomAccessFileReader::ReadAsync(
476
464
  }
477
465
  #endif
478
466
 
479
- IOStatus s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
480
- io_handle, del_fn, nullptr /*dbg*/);
467
+ size_t alignment = file_->GetRequiredBufferAlignment();
468
+ bool is_aligned = (req.offset & (alignment - 1)) == 0 &&
469
+ (req.len & (alignment - 1)) == 0 &&
470
+ (uintptr_t(req.scratch) & (alignment - 1)) == 0;
471
+ read_async_info->is_aligned_ = is_aligned;
472
+
473
+ if (use_direct_io() && is_aligned == false) {
474
+ FSReadRequest aligned_req = Align(req, alignment);
475
+
476
+ // Allocate aligned buffer.
477
+ read_async_info->buf_.Alignment(alignment);
478
+ read_async_info->buf_.AllocateNewBuffer(aligned_req.len);
479
+
480
+ // Set rem fields in aligned FSReadRequest.
481
+ aligned_req.scratch = read_async_info->buf_.BufferStart();
482
+
483
+ // Set user provided fields to populate back in callback.
484
+ read_async_info->user_scratch_ = req.scratch;
485
+ read_async_info->user_aligned_buf_ = aligned_buf;
486
+ read_async_info->user_len_ = req.len;
487
+ read_async_info->user_offset_ = req.offset;
488
+ read_async_info->user_result_ = req.result;
489
+
490
+ assert(read_async_info->buf_.CurrentSize() == 0);
491
+
492
+ s = file_->ReadAsync(aligned_req, opts, read_async_callback,
493
+ read_async_info, io_handle, del_fn, nullptr /*dbg*/);
494
+ } else {
495
+ s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
496
+ io_handle, del_fn, nullptr /*dbg*/);
497
+ }
498
+
481
499
  // Suppress false positive clang analyzer warnings.
482
500
  // Memory is not released if file_->ReadAsync returns !s.ok(), because
483
501
  // ReadAsyncCallback is never called in that case. If ReadAsyncCallback is
@@ -497,7 +515,54 @@ void RandomAccessFileReader::ReadAsyncCallback(const FSReadRequest& req,
497
515
  assert(read_async_info);
498
516
  assert(read_async_info->cb_);
499
517
 
500
- read_async_info->cb_(req, read_async_info->cb_arg_);
518
+ if (use_direct_io() && read_async_info->is_aligned_ == false) {
519
+ // Create FSReadRequest with user provided fields.
520
+ FSReadRequest user_req;
521
+ user_req.scratch = read_async_info->user_scratch_;
522
+ user_req.offset = read_async_info->user_offset_;
523
+ user_req.len = read_async_info->user_len_;
524
+
525
+ // Update results in user_req.
526
+ user_req.result = req.result;
527
+ user_req.status = req.status;
528
+
529
+ read_async_info->buf_.Size(read_async_info->buf_.CurrentSize() +
530
+ req.result.size());
531
+
532
+ size_t offset_advance_len = static_cast<size_t>(
533
+ /*offset_passed_by_user=*/read_async_info->user_offset_ -
534
+ /*aligned_offset=*/req.offset);
535
+
536
+ size_t res_len = 0;
537
+ if (req.status.ok() &&
538
+ offset_advance_len < read_async_info->buf_.CurrentSize()) {
539
+ res_len =
540
+ std::min(read_async_info->buf_.CurrentSize() - offset_advance_len,
541
+ read_async_info->user_len_);
542
+ if (read_async_info->user_aligned_buf_ == nullptr) {
543
+ // Copy the data into user's scratch.
544
+ // Clang analyzer assumes that it will take use_direct_io() == false in
545
+ // ReadAsync and use_direct_io() == true in Callback which cannot be true.
546
+ #ifndef __clang_analyzer__
547
+ read_async_info->buf_.Read(user_req.scratch, offset_advance_len,
548
+ res_len);
549
+ #endif // __clang_analyzer__
550
+ } else {
551
+ // Set aligned_buf provided by user without additional copy.
552
+ user_req.scratch =
553
+ read_async_info->buf_.BufferStart() + offset_advance_len;
554
+ read_async_info->user_aligned_buf_->reset(
555
+ read_async_info->buf_.Release());
556
+ }
557
+ user_req.result = Slice(user_req.scratch, res_len);
558
+ } else {
559
+ // Either req.status is not ok or data was not read.
560
+ user_req.result = Slice();
561
+ }
562
+ read_async_info->cb_(user_req, read_async_info->cb_arg_);
563
+ } else {
564
+ read_async_info->cb_(req, read_async_info->cb_arg_);
565
+ }
501
566
 
502
567
  // Update stats and notify listeners.
503
568
  if (stats_ != nullptr && file_read_hist_ != nullptr) {
@@ -93,12 +93,32 @@ class RandomAccessFileReader {
93
93
  const bool is_last_level_;
94
94
 
95
95
  struct ReadAsyncInfo {
96
+ ReadAsyncInfo(std::function<void(const FSReadRequest&, void*)> cb,
97
+ void* cb_arg, uint64_t start_time)
98
+ : cb_(cb),
99
+ cb_arg_(cb_arg),
100
+ start_time_(start_time),
101
+ user_scratch_(nullptr),
102
+ user_aligned_buf_(nullptr),
103
+ user_offset_(0),
104
+ user_len_(0),
105
+ is_aligned_(false) {}
106
+
107
+ std::function<void(const FSReadRequest&, void*)> cb_;
108
+ void* cb_arg_;
109
+ uint64_t start_time_;
96
110
  #ifndef ROCKSDB_LITE
97
111
  FileOperationInfo::StartTimePoint fs_start_ts_;
98
112
  #endif
99
- uint64_t start_time_;
100
- std::function<void(const FSReadRequest&, void*)> cb_;
101
- void* cb_arg_;
113
+ // The fields below store the parameters passed by the caller in case of direct_io.
114
+ char* user_scratch_;
115
+ AlignedBuf* user_aligned_buf_;
116
+ uint64_t user_offset_;
117
+ size_t user_len_;
118
+ Slice user_result_;
119
+ // Used in case of direct_io
120
+ AlignedBuffer buf_;
121
+ bool is_aligned_;
102
122
  };
103
123
 
104
124
  public:
@@ -190,7 +210,7 @@ class RandomAccessFileReader {
190
210
  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
191
211
  std::function<void(const FSReadRequest&, void*)> cb,
192
212
  void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
193
- Env::IOPriority rate_limiter_priority);
213
+ AlignedBuf* aligned_buf);
194
214
 
195
215
  void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg);
196
216
  };
@@ -117,7 +117,7 @@ struct CompressionOptions {
117
117
  //
118
118
  // The amount of data buffered can be limited by `max_dict_buffer_bytes`. This
119
119
  // buffered memory is charged to the block cache when there is a block cache.
120
- // If block cache insertion fails with `Status::Incomplete` (i.e., it is
120
+ // If block cache insertion fails with `Status::MemoryLimit` (i.e., it is
121
121
  // full), we finalize the dictionary with whatever data we have and then stop
122
122
  // buffering.
123
123
  //
@@ -949,6 +949,28 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iter
949
949
  rocksdb_iterator_t* base_iterator,
950
950
  rocksdb_column_family_handle_t* cf);
951
951
 
952
+ /* Options utils */
953
+
954
+ // Load the latest rocksdb options from the specified db_path.
955
+ //
956
+ // On success, num_column_families will be updated with a non-zero
957
+ // number indicating the number of column families.
958
+ // The returned db_options, column_family_names, and column_family_options
959
+ // should be released via rocksdb_load_latest_options_destroy().
960
+ //
961
+ // On error, a non-null errptr that includes the error message will be
962
+ // returned. db_options, column_family_names, and column_family_options
963
+ // will be set to NULL.
964
+ extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options(
965
+ const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options,
966
+ rocksdb_cache_t* cache, rocksdb_options_t** db_options,
967
+ size_t* num_column_families, char*** column_family_names,
968
+ rocksdb_options_t*** column_family_options, char** errptr);
969
+
970
+ extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options_destroy(
971
+ rocksdb_options_t* db_options, char** list_column_family_names,
972
+ rocksdb_options_t** list_column_family_options, size_t len);
973
+
952
974
  /* Block based table options */
953
975
 
954
976
  extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t*
@@ -1668,7 +1690,13 @@ enum {
1668
1690
  rocksdb_env_unlock_file_nanos,
1669
1691
  rocksdb_env_new_logger_nanos,
1670
1692
  rocksdb_number_async_seek,
1671
- rocksdb_total_metric_count = 69
1693
+ rocksdb_blob_cache_hit_count,
1694
+ rocksdb_blob_read_count,
1695
+ rocksdb_blob_read_byte,
1696
+ rocksdb_blob_read_time,
1697
+ rocksdb_blob_checksum_time,
1698
+ rocksdb_blob_decompress_time,
1699
+ rocksdb_total_metric_count = 77
1672
1700
  };
1673
1701
 
1674
1702
  extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int);
@@ -1939,6 +1967,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy(
1939
1967
  rocksdb_lru_cache_options_t*);
1940
1968
  extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_capacity(
1941
1969
  rocksdb_lru_cache_options_t*, size_t);
1970
+ extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_num_shard_bits(
1971
+ rocksdb_lru_cache_options_t*, int);
1942
1972
  extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator(
1943
1973
  rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*);
1944
1974
 
@@ -174,14 +174,18 @@ extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
174
174
  extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
175
175
  const CompressedSecondaryCacheOptions& opts);
176
176
 
177
- // Similar to NewLRUCache, but create a cache based on CLOCK algorithm with
177
+ // EXPERIMENTAL Currently ClockCache is under development, although it's
178
+ // already exposed in the public API. To avoid unreliable performance and
179
+ // correctness issues, NewClockCache will temporarily return an LRUCache
180
+ // constructed with the corresponding arguments.
181
+ //
182
+ // TODO(Guido) When ClockCache is complete, roll back to the old text:
183
+ // ``
184
+ // Similar to NewLRUCache, but create a cache based on clock algorithm with
178
185
  // better concurrent performance in some cases. See util/clock_cache.cc for
179
186
  // more detail.
180
- //
181
187
  // Return nullptr if it is not supported.
182
- //
183
- // BROKEN: ClockCache is known to have bugs that could lead to crash or
184
- // corruption, so should not be used until fixed. Use NewLRUCache instead.
188
+ // ``
185
189
  extern std::shared_ptr<Cache> NewClockCache(
186
190
  size_t capacity, int num_shard_bits = -1,
187
191
  bool strict_capacity_limit = false,
@@ -292,7 +296,7 @@ class Cache {
292
296
  // Insert a mapping from key->value into the volatile cache only
293
297
  // and assign it with the specified charge against the total cache capacity.
294
298
  // If strict_capacity_limit is true and cache reaches its full capacity,
295
- // return Status::Incomplete.
299
+ // return Status::MemoryLimit.
296
300
  //
297
301
  // If handle is not nullptr, returns a handle that corresponds to the
298
302
  // mapping. The caller must call this->Release(handle) when the returned
@@ -450,7 +454,7 @@ class Cache {
450
454
  // Insert a mapping from key->value into the cache and assign it
451
455
  // the specified charge against the total cache capacity.
452
456
  // If strict_capacity_limit is true and cache reaches its full capacity,
453
- // return Status::Incomplete.
457
+ // return Status::MemoryLimit.
454
458
  //
455
459
  // The helper argument is saved by the cache and will be used when the
456
460
  // inserted object is evicted or promoted to the secondary cache. It,
@@ -102,5 +102,7 @@ struct CompactionJobStats {
102
102
 
103
103
  // number of single-deletes which meet something other than a put
104
104
  uint64_t num_single_del_mismatch;
105
+
106
+ // TODO: Add output_to_penultimate_level output information
105
107
  };
106
108
  } // namespace ROCKSDB_NAMESPACE
@@ -1080,6 +1080,17 @@ class DB {
1080
1080
  // "rocksdb.live-blob-file-garbage-size" - returns the total amount of
1081
1081
  // garbage in the blob files in the current version.
1082
1082
  static const std::string kLiveBlobFileGarbageSize;
1083
+
1084
+ // "rocksdb.blob-cache-capacity" - returns blob cache capacity.
1085
+ static const std::string kBlobCacheCapacity;
1086
+
1087
+ // "rocksdb.blob-cache-usage" - returns the memory size for the entries
1088
+ // residing in blob cache.
1089
+ static const std::string kBlobCacheUsage;
1090
+
1091
+ // "rocksdb.blob-cache-pinned-usage" - returns the memory size for the
1092
+ // entries being pinned in blob cache.
1093
+ static const std::string kBlobCachePinnedUsage;
1083
1094
  };
1084
1095
  #endif /* ROCKSDB_LITE */
1085
1096
 
@@ -1145,6 +1156,9 @@ class DB {
1145
1156
  // "rocksdb.num-blob-files"
1146
1157
  // "rocksdb.total-blob-file-size"
1147
1158
  // "rocksdb.live-blob-file-size"
1159
+ // "rocksdb.blob-cache-capacity"
1160
+ // "rocksdb.blob-cache-usage"
1161
+ // "rocksdb.blob-cache-pinned-usage"
1148
1162
  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
1149
1163
  const Slice& property, uint64_t* value) = 0;
1150
1164
  virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
@@ -493,6 +493,17 @@ class Env : public Customizable {
493
493
  // Wait for all threads started by StartThread to terminate.
494
494
  virtual void WaitForJoin() {}
495
495
 
496
+ // Reserve available background threads in the specified thread pool.
497
+ virtual int ReserveThreads(int /*threads_to_be_reserved*/, Priority /*pri*/) {
498
+ return 0;
499
+ }
500
+
501
+ // Release a specific number of reserved threads from the specified thread
502
+ // pool
503
+ virtual int ReleaseThreads(int /*threads_to_be_released*/, Priority /*pri*/) {
504
+ return 0;
505
+ }
506
+
496
507
  // Get thread pool queue length for specific thread pool.
497
508
  virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const {
498
509
  return 0;
@@ -1533,6 +1544,15 @@ class EnvWrapper : public Env {
1533
1544
  unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override {
1534
1545
  return target_.env->GetThreadPoolQueueLen(pri);
1535
1546
  }
1547
+
1548
+ int ReserveThreads(int threads_to_be_reserved, Priority pri) override {
1549
+ return target_.env->ReserveThreads(threads_to_be_reserved, pri);
1550
+ }
1551
+
1552
+ int ReleaseThreads(int threads_to_be_released, Priority pri) override {
1553
+ return target_.env->ReleaseThreads(threads_to_be_released, pri);
1554
+ }
1555
+
1536
1556
  Status GetTestDirectory(std::string* path) override {
1537
1557
  return target_.env->GetTestDirectory(path);
1538
1558
  }
@@ -240,18 +240,36 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
240
240
  // Dynamically changeable through SetOptions() API
241
241
  int level0_file_num_compaction_trigger = 4;
242
242
 
243
- // If non-nullptr, use the specified function to determine the
244
- // prefixes for keys. These prefixes will be placed in the filter.
245
- // Depending on the workload, this can reduce the number of read-IOP
246
- // cost for scans when a prefix is passed via ReadOptions to
247
- // db.NewIterator(). For prefix filtering to work properly,
248
- // "prefix_extractor" and "comparator" must be such that the following
249
- // properties hold:
250
- //
251
- // 1) key.starts_with(prefix(key))
252
- // 2) Compare(prefix(key), key) <= 0.
253
- // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
254
- // 4) prefix(prefix(key)) == prefix(key)
243
+ // If non-nullptr, use the specified function to put keys in contiguous
244
+ // groups called "prefixes". These prefixes are used to place one
245
+ // representative entry for the group into the Bloom filter
246
+ // rather than an entry for each key (see whole_key_filtering).
247
+ // Under certain conditions, this enables optimizing some range queries
248
+ // (Iterators) in addition to some point lookups (Get/MultiGet).
249
+ //
250
+ // Together `prefix_extractor` and `comparator` must satisfy one essential
251
+ // property for valid prefix filtering of range queries:
252
+ // If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and
253
+ // InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3),
254
+ // Then InDomain(k2) and prefix(k2) == prefix(k1)
255
+ //
256
+ // In other words, all keys with the same prefix must be in a contiguous
257
+ // group by comparator order, and cannot be interrupted by keys with no
258
+ // prefix ("out of domain"). (This makes it valid to conclude that no
259
+ // entries within some bounds are present if the upper and lower bounds
260
+ // have a common prefix and no entries with that same prefix are present.)
261
+ //
262
+ // Some other properties are recommended but not strictly required. Under
263
+ // most sensible comparators, the following will need to hold true to
264
+ // satisfy the essential property above:
265
+ // * "Prefix is a prefix": key.starts_with(prefix(key))
266
+ // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then
267
+ // Compare(prefix(k1), prefix(k2)) <= 0
268
+ //
269
+ // The next two properties ensure that seeking to a prefix allows
270
+ // enumerating all entries with that prefix:
271
+ // * "Prefix starts the group": Compare(prefix(key), key) <= 0
272
+ // * "Prefix idempotent": prefix(prefix(key)) == prefix(key)
255
273
  //
256
274
  // Default: nullptr
257
275
  std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
@@ -483,11 +501,17 @@ struct DBOptions {
483
501
  bool flush_verify_memtable_count = true;
484
502
 
485
503
  // If true, the log numbers and sizes of the synced WALs are tracked
486
- // in MANIFEST, then during DB recovery, if a synced WAL is missing
504
+ // in MANIFEST. During DB recovery, if a synced WAL is missing
487
505
  // from disk, or the WAL's size does not match the recorded size in
488
506
  // MANIFEST, an error will be reported and the recovery will be aborted.
489
507
  //
508
+ // This is one additional protection against WAL corruption besides the
509
+ // per-WAL-entry checksum.
510
+ //
490
511
  // Note that this option does not work with secondary instance.
512
+ // Currently, only the syncing of closed WALs is tracked. Calling `DB::SyncWAL()`,
513
+ // etc. or writing with `WriteOptions::sync=true` to sync the live WAL is not
514
+ // tracked for performance/efficiency reasons.
491
515
  //
492
516
  // Default: false
493
517
  bool track_and_verify_wals_in_manifest = false;
@@ -84,6 +84,13 @@ struct PerfContext {
84
84
  uint64_t multiget_read_bytes; // bytes for vals returned by MultiGet
85
85
  uint64_t iter_read_bytes; // bytes for keys/vals decoded by iterator
86
86
 
87
+ uint64_t blob_cache_hit_count; // total number of blob cache hits
88
+ uint64_t blob_read_count; // total number of blob reads (with IO)
89
+ uint64_t blob_read_byte; // total number of bytes from blob reads
90
+ uint64_t blob_read_time; // total nanos spent on blob reads
91
+ uint64_t blob_checksum_time; // total nanos spent on blob checksum
92
+ uint64_t blob_decompress_time; // total nanos spent on blob decompression
93
+
87
94
  // total number of internal keys skipped over during iteration.
88
95
  // There are several reasons for it:
89
96
  // 1. when calling Next(), the iterator is in the position of the previous
@@ -434,6 +434,20 @@ enum Tickers : uint32_t {
434
434
  BLOCK_CHECKSUM_COMPUTE_COUNT,
435
435
  MULTIGET_COROUTINE_COUNT,
436
436
 
437
+ // Integrated BlobDB specific stats
438
+ // # of times cache miss when accessing blob from blob cache.
439
+ BLOB_DB_CACHE_MISS,
440
+ // # of times cache hit when accessing blob from blob cache.
441
+ BLOB_DB_CACHE_HIT,
442
+ // # of data blocks added to blob cache.
443
+ BLOB_DB_CACHE_ADD,
444
+ // # of failures when adding blobs to blob cache.
445
+ BLOB_DB_CACHE_ADD_FAILURES,
446
+ // # of bytes read from blob cache.
447
+ BLOB_DB_CACHE_BYTES_READ,
448
+ // # of bytes written into blob cache.
449
+ BLOB_DB_CACHE_BYTES_WRITE,
450
+
437
451
  TICKER_ENUM_MAX
438
452
  };
439
453
 
@@ -49,6 +49,15 @@ class ThreadPool {
49
49
  virtual void SubmitJob(const std::function<void()>&) = 0;
50
50
  // This moves the function in for efficiency
51
51
  virtual void SubmitJob(std::function<void()>&&) = 0;
52
+
53
+ // Reserve available background threads. This function does not guarantee
54
+ // that the requested number of threads can be reserved; instead, it returns
55
+ // the number of threads that can be reserved, which may be fewer than
56
+ // requested. In other words, the number of available threads could be less than the input.
57
+ virtual int ReserveThreads(int /*threads_to_be_reserved*/) { return 0; }
58
+
59
+ // Release a specific number of reserved threads
60
+ virtual int ReleaseThreads(int /*threads_to_be_released*/) { return 0; }
52
61
  };
53
62
 
54
63
  // NewThreadPool() is a function that could be used to create a ThreadPool
@@ -447,25 +447,12 @@ class WriteBatch : public WriteBatchBase {
447
447
  // the WAL.
448
448
  SavePoint wal_term_point_;
449
449
 
450
- // For HasXYZ. Mutable to allow lazy computation of results
451
- mutable std::atomic<uint32_t> content_flags_;
452
-
453
- // Performs deferred computation of content_flags if necessary
454
- uint32_t ComputeContentFlags() const;
455
-
456
- // Maximum size of rep_.
457
- size_t max_bytes_;
458
-
459
450
  // Is the content of the batch the application's latest state that is meant only
460
451
  // to be used for recovery? Refer to
461
452
  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for
462
453
  // more details.
463
454
  bool is_latest_persistent_state_ = false;
464
455
 
465
- std::unique_ptr<ProtectionInfo> prot_info_;
466
-
467
- size_t default_cf_ts_sz_ = 0;
468
-
469
456
  // False if all keys are from column families that disable user-defined
470
457
  // timestamp OR UpdateTimestamps() has been called at least once.
471
458
  // This flag will be set to true if any of the above Put(), Delete(),
@@ -479,6 +466,19 @@ class WriteBatch : public WriteBatchBase {
479
466
  // that enables user-defined timestamp.
480
467
  bool has_key_with_ts_ = false;
481
468
 
469
+ // For HasXYZ. Mutable to allow lazy computation of results
470
+ mutable std::atomic<uint32_t> content_flags_;
471
+
472
+ // Performs deferred computation of content_flags if necessary
473
+ uint32_t ComputeContentFlags() const;
474
+
475
+ // Maximum size of rep_.
476
+ size_t max_bytes_;
477
+
478
+ std::unique_ptr<ProtectionInfo> prot_info_;
479
+
480
+ size_t default_cf_ts_sz_ = 0;
481
+
482
482
  protected:
483
483
  std::string rep_; // See comment in write_batch.cc for the format of rep_
484
484
  };
@@ -278,11 +278,21 @@ Status CreateLoggerFromOptions(const std::string& dbname,
278
278
  InfoLogFileName(dbname, db_absolute_path, options.db_log_dir);
279
279
 
280
280
  const auto& clock = env->GetSystemClock();
281
- // In case it does not exist
281
+ // In case it does not exist.
282
282
  s = env->CreateDirIfMissing(dbname);
283
283
  if (!s.ok()) {
284
- return s;
284
+ if (options.db_log_dir.empty()) {
285
+ return s;
286
+ } else {
287
+ // Ignore the error returned while creating dbname: dbname and
288
+ // db_log_dir can be on different filesystems, in which case dbname will
289
+ // not exist and the error should be ignored. If they are on the same
290
+ // filesystem, the db_log_dir creation below will surface any error
291
+ // from creating dbname.
292
+ s = Status::OK();
293
+ }
285
294
  }
295
+ assert(s.ok());
286
296
 
287
297
  if (!options.db_log_dir.empty()) {
288
298
  s = env->CreateDirIfMissing(options.db_log_dir);
@@ -47,6 +47,14 @@ PerfContext::PerfContext(const PerfContext& other) {
47
47
  get_read_bytes = other.get_read_bytes;
48
48
  multiget_read_bytes = other.multiget_read_bytes;
49
49
  iter_read_bytes = other.iter_read_bytes;
50
+
51
+ blob_cache_hit_count = other.blob_cache_hit_count;
52
+ blob_read_count = other.blob_read_count;
53
+ blob_read_byte = other.blob_read_byte;
54
+ blob_read_time = other.blob_read_time;
55
+ blob_checksum_time = other.blob_checksum_time;
56
+ blob_decompress_time = other.blob_decompress_time;
57
+
50
58
  internal_key_skipped_count = other.internal_key_skipped_count;
51
59
  internal_delete_skipped_count = other.internal_delete_skipped_count;
52
60
  internal_recent_skipped_count = other.internal_recent_skipped_count;
@@ -146,6 +154,14 @@ PerfContext::PerfContext(PerfContext&& other) noexcept {
146
154
  get_read_bytes = other.get_read_bytes;
147
155
  multiget_read_bytes = other.multiget_read_bytes;
148
156
  iter_read_bytes = other.iter_read_bytes;
157
+
158
+ blob_cache_hit_count = other.blob_cache_hit_count;
159
+ blob_read_count = other.blob_read_count;
160
+ blob_read_byte = other.blob_read_byte;
161
+ blob_read_time = other.blob_read_time;
162
+ blob_checksum_time = other.blob_checksum_time;
163
+ blob_decompress_time = other.blob_decompress_time;
164
+
149
165
  internal_key_skipped_count = other.internal_key_skipped_count;
150
166
  internal_delete_skipped_count = other.internal_delete_skipped_count;
151
167
  internal_recent_skipped_count = other.internal_recent_skipped_count;
@@ -247,6 +263,14 @@ PerfContext& PerfContext::operator=(const PerfContext& other) {
247
263
  get_read_bytes = other.get_read_bytes;
248
264
  multiget_read_bytes = other.multiget_read_bytes;
249
265
  iter_read_bytes = other.iter_read_bytes;
266
+
267
+ blob_cache_hit_count = other.blob_cache_hit_count;
268
+ blob_read_count = other.blob_read_count;
269
+ blob_read_byte = other.blob_read_byte;
270
+ blob_read_time = other.blob_read_time;
271
+ blob_checksum_time = other.blob_checksum_time;
272
+ blob_decompress_time = other.blob_decompress_time;
273
+
250
274
  internal_key_skipped_count = other.internal_key_skipped_count;
251
275
  internal_delete_skipped_count = other.internal_delete_skipped_count;
252
276
  internal_recent_skipped_count = other.internal_recent_skipped_count;
@@ -345,6 +369,14 @@ void PerfContext::Reset() {
345
369
  get_read_bytes = 0;
346
370
  multiget_read_bytes = 0;
347
371
  iter_read_bytes = 0;
372
+
373
+ blob_cache_hit_count = 0;
374
+ blob_read_count = 0;
375
+ blob_read_byte = 0;
376
+ blob_read_time = 0;
377
+ blob_checksum_time = 0;
378
+ blob_decompress_time = 0;
379
+
348
380
  internal_key_skipped_count = 0;
349
381
  internal_delete_skipped_count = 0;
350
382
  internal_recent_skipped_count = 0;
@@ -467,6 +499,12 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const {
467
499
  PERF_CONTEXT_OUTPUT(get_read_bytes);
468
500
  PERF_CONTEXT_OUTPUT(multiget_read_bytes);
469
501
  PERF_CONTEXT_OUTPUT(iter_read_bytes);
502
+ PERF_CONTEXT_OUTPUT(blob_cache_hit_count);
503
+ PERF_CONTEXT_OUTPUT(blob_read_count);
504
+ PERF_CONTEXT_OUTPUT(blob_read_byte);
505
+ PERF_CONTEXT_OUTPUT(blob_read_time);
506
+ PERF_CONTEXT_OUTPUT(blob_checksum_time);
507
+ PERF_CONTEXT_OUTPUT(blob_decompress_time);
470
508
  PERF_CONTEXT_OUTPUT(internal_key_skipped_count);
471
509
  PERF_CONTEXT_OUTPUT(internal_delete_skipped_count);
472
510
  PERF_CONTEXT_OUTPUT(internal_recent_skipped_count);
@@ -227,7 +227,13 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
227
227
  {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"},
228
228
  {NON_LAST_LEVEL_READ_COUNT, "rocksdb.non.last.level.read.count"},
229
229
  {BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"},
230
- {MULTIGET_COROUTINE_COUNT, "rocksdb.multiget.coroutine.count"}};
230
+ {MULTIGET_COROUTINE_COUNT, "rocksdb.multiget.coroutine.count"},
231
+ {BLOB_DB_CACHE_MISS, "rocksdb.blobdb.cache.miss"},
232
+ {BLOB_DB_CACHE_HIT, "rocksdb.blobdb.cache.hit"},
233
+ {BLOB_DB_CACHE_ADD, "rocksdb.blobdb.cache.add"},
234
+ {BLOB_DB_CACHE_ADD_FAILURES, "rocksdb.blobdb.cache.add.failures"},
235
+ {BLOB_DB_CACHE_BYTES_READ, "rocksdb.blobdb.cache.bytes.read"},
236
+ {BLOB_DB_CACHE_BYTES_WRITE, "rocksdb.blobdb.cache.bytes.write"}};
231
237
 
232
238
  const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
233
239
  {DB_GET, "rocksdb.db.get.micros"},