@nxtedition/rocksdb 8.1.17 → 8.2.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/binding.cc +32 -2
  2. package/binding.gyp +8 -0
  3. package/deps/liburing/liburing.gyp +20 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
  5. package/deps/rocksdb/rocksdb/TARGETS +7 -0
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
  8. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
  9. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
  10. package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
  11. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
  12. package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
  18. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
  19. package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
  20. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
  21. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
  22. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
  23. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
  24. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
  25. package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
  26. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
  29. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
  30. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
  31. package/deps/rocksdb/rocksdb/db/c.cc +90 -1
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
  33. package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
  34. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
  42. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
  43. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
  44. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
  45. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
  52. package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
  53. package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
  54. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
  55. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
  56. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
  57. package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
  58. package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
  59. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
  60. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
  61. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
  62. package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
  63. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  64. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
  65. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
  66. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
  67. package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
  68. package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
  69. package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
  70. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  71. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
  72. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
  73. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
  74. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
  75. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
  76. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
  78. package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
  79. package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
  80. package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
  81. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
  82. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
  89. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
  91. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
  92. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
  93. package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
  94. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
  95. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
  96. package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
  97. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
  98. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
  99. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
  100. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
  101. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
  102. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
  103. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
  104. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
  105. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
  106. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
  108. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
  109. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
  110. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  113. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
  114. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
  115. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
  116. package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
  117. package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
  118. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
  119. package/deps/rocksdb/rocksdb/src.mk +4 -0
  120. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
  121. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
  122. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
  123. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
  124. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
  125. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
  126. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
  127. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
  128. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
  129. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
  131. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
  132. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
  133. package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
  134. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
  135. package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
  136. package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
  137. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
  138. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
  139. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
  140. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
  141. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
  142. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
  143. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
  144. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
  145. package/deps/rocksdb/rocksdb.gyp +7 -1
  146. package/package.json +1 -1
  147. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -76,6 +76,8 @@ typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t;
76
76
  typedef struct rocksdb_restore_options_t rocksdb_restore_options_t;
77
77
  typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t;
78
78
  typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t;
79
+ typedef struct rocksdb_hyper_clock_cache_options_t
80
+ rocksdb_hyper_clock_cache_options_t;
79
81
  typedef struct rocksdb_cache_t rocksdb_cache_t;
80
82
  typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
81
83
  typedef struct rocksdb_compactionfiltercontext_t
@@ -597,13 +599,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot(
597
599
  extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db,
598
600
  const char* propname);
599
601
  /* returns 0 on success, -1 otherwise */
600
- int rocksdb_property_int(rocksdb_t* db, const char* propname,
601
- uint64_t* out_val);
602
+ extern ROCKSDB_LIBRARY_API int rocksdb_property_int(rocksdb_t* db,
603
+ const char* propname,
604
+ uint64_t* out_val);
602
605
 
603
606
  /* returns 0 on success, -1 otherwise */
604
- int rocksdb_property_int_cf(rocksdb_t* db,
605
- rocksdb_column_family_handle_t* column_family,
606
- const char* propname, uint64_t* out_val);
607
+ extern ROCKSDB_LIBRARY_API int rocksdb_property_int_cf(
608
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
609
+ const char* propname, uint64_t* out_val);
607
610
 
608
611
  extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf(
609
612
  rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
@@ -662,6 +665,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf(
662
665
  rocksdb_t* db, const rocksdb_flushoptions_t* options,
663
666
  rocksdb_column_family_handle_t* column_family, char** errptr);
664
667
 
668
+ extern ROCKSDB_LIBRARY_API void rocksdb_flush_cfs(
669
+ rocksdb_t* db, const rocksdb_flushoptions_t* options,
670
+ rocksdb_column_family_handle_t** column_family, int num_column_families,
671
+ char** errptr);
672
+
665
673
  extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db,
666
674
  unsigned char sync,
667
675
  char** errptr);
@@ -2012,6 +2020,29 @@ rocksdb_cache_get_usage(rocksdb_cache_t* cache);
2012
2020
  extern ROCKSDB_LIBRARY_API size_t
2013
2021
  rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache);
2014
2022
 
2023
+ /* HyperClockCache */
2024
+ extern ROCKSDB_LIBRARY_API rocksdb_hyper_clock_cache_options_t*
2025
+ rocksdb_hyper_clock_cache_options_create(size_t capacity,
2026
+ size_t estimated_entry_charge);
2027
+ extern ROCKSDB_LIBRARY_API void rocksdb_hyper_clock_cache_options_destroy(
2028
+ rocksdb_hyper_clock_cache_options_t*);
2029
+ extern ROCKSDB_LIBRARY_API void rocksdb_hyper_clock_cache_options_set_capacity(
2030
+ rocksdb_hyper_clock_cache_options_t*, size_t);
2031
+ extern ROCKSDB_LIBRARY_API void
2032
+ rocksdb_hyper_clock_cache_options_set_estimated_entry_charge(
2033
+ rocksdb_hyper_clock_cache_options_t*, size_t);
2034
+ extern ROCKSDB_LIBRARY_API void
2035
+ rocksdb_hyper_clock_cache_options_set_num_shard_bits(
2036
+ rocksdb_hyper_clock_cache_options_t*, int);
2037
+ extern ROCKSDB_LIBRARY_API void
2038
+ rocksdb_hyper_clock_cache_options_set_memory_allocator(
2039
+ rocksdb_hyper_clock_cache_options_t*, rocksdb_memory_allocator_t*);
2040
+
2041
+ extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_hyper_clock(
2042
+ size_t capacity, size_t estimated_entry_charge);
2043
+ extern ROCKSDB_LIBRARY_API rocksdb_cache_t*
2044
+ rocksdb_cache_create_hyper_clock_opts(rocksdb_hyper_clock_cache_options_t*);
2045
+
2015
2046
  /* DBPath */
2016
2047
 
2017
2048
  extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create(
@@ -2116,6 +2147,11 @@ rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
2116
2147
  extern ROCKSDB_LIBRARY_API void
2117
2148
  rocksdb_ingestexternalfileoptions_set_ingest_behind(
2118
2149
  rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind);
2150
+ extern ROCKSDB_LIBRARY_API void
2151
+ rocksdb_ingestexternalfileoptions_set_fail_if_not_bottommost_level(
2152
+ rocksdb_ingestexternalfileoptions_t* opt,
2153
+ unsigned char fail_if_not_bottommost_level);
2154
+
2119
2155
  extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy(
2120
2156
  rocksdb_ingestexternalfileoptions_t* opt);
2121
2157
 
@@ -2198,6 +2234,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy(
2198
2234
  extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t*
2199
2235
  rocksdb_fifo_compaction_options_create(void);
2200
2236
  extern ROCKSDB_LIBRARY_API void
2237
+ rocksdb_fifo_compaction_options_set_allow_compaction(
2238
+ rocksdb_fifo_compaction_options_t* fifo_opts, unsigned char allow_compaction);
2239
+ extern ROCKSDB_LIBRARY_API unsigned char
2240
+ rocksdb_fifo_compaction_options_get_allow_compaction(
2241
+ rocksdb_fifo_compaction_options_t* fifo_opts);
2242
+ extern ROCKSDB_LIBRARY_API void
2201
2243
  rocksdb_fifo_compaction_options_set_max_table_files_size(
2202
2244
  rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
2203
2245
  extern ROCKSDB_LIBRARY_API uint64_t
@@ -2622,6 +2664,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cf(
2622
2664
  rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
2623
2665
  rocksdb_column_family_handle_t* column_family, char** errptr);
2624
2666
 
2667
+ extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cfs(
2668
+ rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
2669
+ rocksdb_column_family_handle_t** column_families, int num_column_families,
2670
+ char** errptr);
2671
+
2625
2672
  extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_wal(
2626
2673
  rocksdb_transactiondb_t* txn_db, unsigned char sync, char** errptr);
2627
2674
 
@@ -135,6 +135,9 @@ struct ShardedCacheOptions {
135
135
  CacheMetadataChargePolicy metadata_charge_policy =
136
136
  kDefaultCacheMetadataChargePolicy;
137
137
 
138
+ // A SecondaryCache instance to use as the non-volatile tier.
139
+ std::shared_ptr<SecondaryCache> secondary_cache;
140
+
138
141
  ShardedCacheOptions() {}
139
142
  ShardedCacheOptions(
140
143
  size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
@@ -182,9 +185,6 @@ struct LRUCacheOptions : public ShardedCacheOptions {
182
185
  // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise.
183
186
  bool use_adaptive_mutex = kDefaultToAdaptiveMutex;
184
187
 
185
- // A SecondaryCache instance to use a the non-volatile tier.
186
- std::shared_ptr<SecondaryCache> secondary_cache;
187
-
188
188
  LRUCacheOptions() {}
189
189
  LRUCacheOptions(size_t _capacity, int _num_shard_bits,
190
190
  bool _strict_capacity_limit, double _high_pri_pool_ratio,
@@ -27,30 +27,129 @@ class SliceTransform;
27
27
  // CompactionFilter allows an application to modify/delete a key-value during
28
28
  // table file creation.
29
29
  //
30
- // Exceptions MUST NOT propagate out of overridden functions into RocksDB,
30
+ // Some general notes:
31
+ //
32
+ // * RocksDB snapshots do not guarantee to preserve the state of the DB in the
33
+ // presence of CompactionFilter. Data seen from a snapshot might disappear after
34
+ // a table file created with a `CompactionFilter` is installed. If you use
35
+ // snapshots, think twice about whether you want to use `CompactionFilter` and
36
+ // whether you are using it in a safe way.
37
+ //
38
+ // * If multithreaded compaction is being used *and* a single CompactionFilter
39
+ // instance was supplied via Options::compaction_filter, CompactionFilter
40
+ // methods may be called from different threads concurrently. The application
41
+ // must ensure that such calls are thread-safe. If the CompactionFilter was
42
+ // created by a factory, then it will only ever be used by a single thread that
43
+ // is doing the table file creation, and this call does not need to be
44
+ // thread-safe. However, multiple filters may be in existence and operating
45
+ // concurrently.
46
+ //
47
+ // * The key passed to the filtering methods includes the timestamp if
48
+ // user-defined timestamps are enabled.
49
+ //
50
+ // * Exceptions MUST NOT propagate out of overridden functions into RocksDB,
31
51
  // because RocksDB is not exception-safe. This could cause undefined behavior
32
52
  // including data loss, unreported corruption, deadlocks, and more.
33
53
  class CompactionFilter : public Customizable {
34
54
  public:
55
+ // Value type of the key-value passed to the compaction filter's FilterV2/V3
56
+ // methods.
35
57
  enum ValueType {
58
+ // Plain key-value
36
59
  kValue,
60
+ // Merge operand
37
61
  kMergeOperand,
38
- kBlobIndex, // used internally by BlobDB.
62
+ // Used internally by the old stacked BlobDB implementation; this value type
63
+ // is never passed to application code. Note that when using the new
64
+ // integrated BlobDB, values stored separately as blobs are retrieved and
65
+ // presented to FilterV2/V3 with the type kValue above.
66
+ kBlobIndex,
67
+ // Wide-column entity
39
68
  kWideColumnEntity,
40
69
  };
41
70
 
71
+ // Potential decisions that can be returned by the compaction filter's
72
+ // FilterV2/V3 and FilterBlobByKey methods. See decision-specific caveats and
73
+ // constraints below.
42
74
  enum class Decision {
75
+ // Keep the current key-value as-is.
43
76
  kKeep,
77
+
78
+ // Remove the current key-value. Note that the semantics of removal are
79
+ // dependent on the value type. If the current key-value is a plain
80
+ // key-value or a wide-column entity, it is converted to a tombstone
81
+ // (Delete), resulting in the deletion of any earlier versions of the key.
82
+ // If it is a merge operand, it is simply dropped. Note: if you are using
83
+ // a TransactionDB, it is not recommended to filter out merge operands.
84
+ // If a Merge operation is filtered out, TransactionDB may not realize there
85
+ // is a write conflict and may allow a Transaction that should have failed
86
+ // to Commit. Instead, it is better to implement any Merge filtering inside
87
+ // the MergeOperator.
44
88
  kRemove,
89
+
90
+ // Change the value of the current key-value. If the current key-value is a
91
+ // plain key-value or a merge operand, its value is updated but its value
92
+ // type remains the same. If the current key-value is a wide-column entity,
93
+ // it is converted to a plain key-value with the new value specified.
45
94
  kChangeValue,
95
+
96
+ // Remove all key-values with key in [key, *skip_until). This range of keys
97
+ // will be skipped in a way that potentially avoids some IO operations
98
+ // compared to removing the keys one by one. Note that removal in this case
99
+ // means dropping the key-value regardless of value type; in other words, in
100
+ // contrast with kRemove, plain values and entities are not converted to
101
+ // tombstones.
102
+ //
103
+ // *skip_until <= key is treated the same as Decision::kKeep (since the
104
+ // range [key, *skip_until) is empty).
105
+ //
106
+ // Caveats:
107
+ // * The keys are skipped even if there are snapshots containing them,
108
+ // i.e. values removed by kRemoveAndSkipUntil can disappear from a
109
+ // snapshot - beware if you're using TransactionDB or DB::GetSnapshot().
110
+ // * If value for a key was overwritten or merged into (multiple Put()s
111
+ // or Merge()s), and `CompactionFilter` skips this key with
112
+ // kRemoveAndSkipUntil, it's possible that it will remove only
113
+ // the new value, exposing the old value that was supposed to be
114
+ // overwritten.
115
+ // * Doesn't work with PlainTableFactory in prefix mode.
116
+ // * If you use kRemoveAndSkipUntil for table files created by compaction,
117
+ // consider also reducing compaction_readahead_size option.
46
118
  kRemoveAndSkipUntil,
47
- kChangeBlobIndex, // used internally by BlobDB.
48
- kIOError, // used internally by BlobDB.
49
- kPurge, // used for keys that can only be SingleDelete'ed
119
+
120
+ // Used internally by the old stacked BlobDB implementation. Returning this
121
+ // decision from application code is not supported.
122
+ kChangeBlobIndex,
123
+
124
+ // Used internally by the old stacked BlobDB implementation. Returning this
125
+ // decision from application code is not supported.
126
+ kIOError,
127
+
128
+ // Remove the current key-value by converting it to a SingleDelete-type
129
+ // tombstone. Only supported for plain key-values and wide-column entities;
130
+ // not supported for merge operands. All the caveats related to
131
+ // SingleDeletes apply.
132
+ kPurge,
133
+
134
+ // Change the current key-value to the wide-column entity specified. If the
135
+ // current key-value is already a wide-column entity, only its columns are
136
+ // updated; if it is a plain key-value, it is converted to a wide-column
137
+ // entity with the specified columns. Not supported for merge operands.
138
+ // Only applicable to FilterV3.
50
139
  kChangeWideColumnEntity,
140
+
141
+ // When using the integrated BlobDB implementation, it may be possible for
142
+ // applications to make a filtering decision for a given blob based on
143
+ // the key only without actually reading the blob value, which saves some
144
+ // I/O; see the FilterBlobByKey method below. Returning kUndetermined from
145
+ // FilterBlobByKey signals that making a decision solely based on the
146
+ // key is not possible; in this case, RocksDB reads the blob value and
147
+ // passes the key-value to the regular filtering method. Only applicable to
148
+ // FilterBlobByKey; returning this value from FilterV2/V3 is not supported.
51
149
  kUndetermined,
52
150
  };
53
151
 
152
+ // Used internally by the old stacked BlobDB implementation.
54
153
  enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError };
55
154
 
56
155
  // Context information for a table file creation.
@@ -76,8 +175,8 @@ class CompactionFilter : public Customizable {
76
175
  // The table file creation process invokes this method before adding a kv to
77
176
  // the table file. A return value of false indicates that the kv should be
78
177
  // preserved in the new table file and a return value of true indicates
79
- // that this key-value should be removed from the new table file. The
80
- // application can inspect the existing value of the key and make decision
178
+ // that this key-value should be removed (that is, converted to a tombstone).
179
+ // The application can inspect the existing value of the key and make a decision
81
180
  // based on it.
82
181
  //
83
182
  // Key-Values that are results of merge operation during table file creation
@@ -88,23 +187,6 @@ class CompactionFilter : public Customizable {
88
187
  // When the value is to be preserved, the application has the option
89
188
  // to modify the existing_value and pass it back through new_value.
90
189
  // value_changed needs to be set to true in this case.
91
- //
92
- // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a
93
- // DB* object) will not guarantee to preserve the state of the DB with
94
- // CompactionFilter. Data seen from a snapshot might disappear after a
95
- // table file created with a `CompactionFilter` is installed. If you use
96
- // snapshots, think twice about whether you want to use `CompactionFilter` and
97
- // whether you are using it in a safe way.
98
- //
99
- // If multithreaded compaction is being used *and* a single CompactionFilter
100
- // instance was supplied via Options::compaction_filter, this method may be
101
- // called from different threads concurrently. The application must ensure
102
- // that the call is thread-safe.
103
- //
104
- // If the CompactionFilter was created by a factory, then it will only ever
105
- // be used by a single thread that is doing the table file creation, and this
106
- // call does not need to be thread-safe. However, multiple filters may be
107
- // in existence and operating concurrently.
108
190
  virtual bool Filter(int /*level*/, const Slice& /*key*/,
109
191
  const Slice& /*existing_value*/,
110
192
  std::string* /*new_value*/,
@@ -126,48 +208,18 @@ class CompactionFilter : public Customizable {
126
208
  return false;
127
209
  }
128
210
 
129
- // An extended API. Called for both values and merge operands.
130
- // Allows changing value and skipping ranges of keys.
211
+ // A unified API for plain values and merge operands that may
212
+ // return a variety of decisions (see Decision above). The `value_type`
213
+ // parameter indicates the type of the key-value and the `existing_value`
214
+ // contains the current value or merge operand. The `new_value` output
215
+ // parameter can be used to set the updated value or merge operand when the
216
+ // kChangeValue decision is made by the filter. See the description of
217
+ // kRemoveAndSkipUntil above for the semantics of the `skip_until` output
218
+ // parameter, and see Decision above for more information on the semantics of
219
+ // the potential return values.
220
+ //
131
221
  // The default implementation uses Filter() and FilterMergeOperand().
132
222
  // If you're overriding this method, no need to override the other two.
133
- // `value_type` indicates whether this key-value corresponds to a normal
134
- // value (e.g. written with Put()) or a merge operand (written with Merge()).
135
- //
136
- // Possible return values:
137
- // * kKeep - keep the key-value pair.
138
- // * kRemove - remove the key-value pair or merge operand.
139
- // * kChangeValue - keep the key and change the value/operand to *new_value.
140
- // * kRemoveAndSkipUntil - remove this key-value pair, and also remove
141
- // all key-value pairs with key in [key, *skip_until). This range
142
- // of keys will be skipped without reading, potentially saving some
143
- // IO operations compared to removing the keys one by one.
144
- //
145
- // *skip_until <= key is treated the same as Decision::kKeep
146
- // (since the range [key, *skip_until) is empty).
147
- //
148
- // Caveats:
149
- // - The keys are skipped even if there are snapshots containing them,
150
- // i.e. values removed by kRemoveAndSkipUntil can disappear from a
151
- // snapshot - beware if you're using TransactionDB or
152
- // DB::GetSnapshot().
153
- // - If value for a key was overwritten or merged into (multiple Put()s
154
- // or Merge()s), and `CompactionFilter` skips this key with
155
- // kRemoveAndSkipUntil, it's possible that it will remove only
156
- // the new value, exposing the old value that was supposed to be
157
- // overwritten.
158
- // - Doesn't work with PlainTableFactory in prefix mode.
159
- // - If you use kRemoveAndSkipUntil for table files created by
160
- // compaction, consider also reducing compaction_readahead_size
161
- // option.
162
- //
163
- // Should never return kUndetermined.
164
- // Note: If you are using a TransactionDB, it is not recommended to filter
165
- // out or modify merge operands (ValueType::kMergeOperand).
166
- // If a merge operation is filtered out, TransactionDB may not realize there
167
- // is a write conflict and may allow a Transaction to Commit that should have
168
- // failed. Instead, it is better to implement any Merge filtering inside the
169
- // MergeOperator.
170
- // key includes timestamp if user-defined timestamp is enabled.
171
223
  virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
172
224
  const Slice& existing_value, std::string* new_value,
173
225
  std::string* /*skip_until*/) const {
@@ -195,17 +247,21 @@ class CompactionFilter : public Customizable {
195
247
  }
196
248
  }
197
249
 
198
- // Wide column aware API. Called for plain values, merge operands, and
250
+ // Wide column aware unified API. Called for plain values, merge operands, and
199
251
  // wide-column entities; the `value_type` parameter indicates the type of the
200
252
  // key-value. When the key-value is a plain value or a merge operand, the
201
253
  // `existing_value` parameter contains the existing value and the
202
254
  // `existing_columns` parameter is invalid (nullptr). When the key-value is a
203
255
  // wide-column entity, the `existing_columns` parameter contains the wide
204
256
  // columns of the existing entity and the `existing_value` parameter is
205
- // invalid (nullptr). The output parameters `new_value` and `new_columns` can
206
- // be used to change the value or wide columns of the key-value when
207
- // `kChangeValue` or `kChangeWideColumnEntity` is returned. See above for more
208
- // information on the semantics of the potential return values.
257
+ // invalid (nullptr). The `new_value` output parameter can be used to set the
258
+ // updated value or merge operand when the kChangeValue decision is made by
259
+ // the filter. The `new_columns` output parameter can be used to specify
260
+ // the pairs of column names and column values when the
261
+ // kChangeWideColumnEntity decision is returned. See the description of
262
+ // kRemoveAndSkipUntil above for the semantics of the `skip_until` output
263
+ // parameter, and see Decision above for more information on the semantics of
264
+ // the potential return values.
209
265
  //
210
266
  // For compatibility, the default implementation keeps all wide-column
211
267
  // entities, and falls back to FilterV2 for plain values and merge operands.
@@ -255,10 +311,15 @@ class CompactionFilter : public Customizable {
255
311
  virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; }
256
312
 
257
313
  // In the case of BlobDB, it may be possible to reach a decision with only
258
- // the key without reading the actual value. Keys whose value_type is
259
- // kBlobIndex will be checked by this method.
260
- // Returning kUndetermined will cause FilterV3() to be called to make a
261
- // decision as usual.
314
+ // the key without reading the actual value, saving some I/O operations.
315
+ // Keys where the value is stored separately in a blob file will be
316
+ // passed to this method. If the method returns a supported decision other
317
+ // than kUndetermined, it will be considered final and performed without
318
+ // reading the existing value. Returning kUndetermined will cause FilterV3()
319
+ // to be called to make a decision as usual. The output parameters
320
+ // `new_value` and `skip_until` are applicable to the decisions kChangeValue
321
+ // and kRemoveAndSkipUntil respectively, and have the same semantics as
322
+ // the corresponding parameters of FilterV2/V3.
262
323
  virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/,
263
324
  std::string* /*new_value*/,
264
325
  std::string* /*skip_until*/) const {
@@ -301,6 +301,18 @@ class DB {
301
301
  std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
302
302
  std::string trim_ts);
303
303
 
304
+ // Manually, synchronously attempt to resume DB writes after a write failure
305
+ // to the underlying filesystem. See
306
+ // https://github.com/facebook/rocksdb/wiki/Background-Error-Handling
307
+ //
308
+ // Returns OK if writes are successfully resumed, or there was no
309
+ // outstanding error to recover from. Returns underlying write error if
310
+ // it is not recoverable.
311
+ //
312
+ // WART: Does not mix well with auto-resume. Will return Busy if an
313
+ // auto-resume is in progress, without waiting for it to complete.
314
+ // See DBOptions::max_bgerror_resume_count and
315
+ // EventListener::OnErrorRecoveryBegin
304
316
  virtual Status Resume() { return Status::NotSupported(); }
305
317
 
306
318
  // Close the DB by releasing resources, closing files etc. This should be
@@ -941,6 +953,18 @@ class DB {
941
953
  // level, as well as the histogram of latency of single requests.
942
954
  static const std::string kCFFileHistogram;
943
955
 
956
+ // "rocksdb.cf-write-stall-stats" - returns a multi-line string or
957
+ // map with statistics on CF-scope write stalls for a given CF
958
+ // See `WriteStallStatsMapKeys` for structured representation of keys
959
+ // available in the map form.
960
+ static const std::string kCFWriteStallStats;
961
+
962
+ // "rocksdb.db-write-stall-stats" - returns a multi-line string or
963
+ // map with statistics on DB-scope write stalls
964
+ // See `WriteStallStatsMapKeys` for structured representation of keys
965
+ // available in the map form.
966
+ static const std::string kDBWriteStallStats;
967
+
944
968
  // "rocksdb.dbstats" - As a string property, returns a multi-line string
945
969
  // with general database stats, both cumulative (over the db's
946
970
  // lifetime) and interval (since the last retrieval of kDBStats).
@@ -1717,11 +1741,12 @@ class DB {
1717
1741
  const std::vector<IngestExternalFileArg>& args) = 0;
1718
1742
 
1719
1743
  // CreateColumnFamilyWithImport() will create a new column family with
1720
- // column_family_name and import external SST files specified in metadata into
1721
- // this column family.
1744
+ // column_family_name and import external SST files specified in `metadata`
1745
+ // into this column family.
1722
1746
  // (1) External SST files can be created using SstFileWriter.
1723
1747
  // (2) External SST files can be exported from a particular column family in
1724
- // an existing DB using Checkpoint::ExportColumnFamily.
1748
+ // an existing DB using Checkpoint::ExportColumnFamily. `metadata` should
1749
+ // be the output from Checkpoint::ExportColumnFamily.
1725
1750
  // Option in import_options specifies whether the external files are copied or
1726
1751
  // moved (default is copy). When option specifies copy, managing files at
1727
1752
  // external_file_path is caller's responsibility. When option specifies a
@@ -1860,6 +1885,24 @@ class DB {
1860
1885
  }
1861
1886
  };
1862
1887
 
1888
+ struct WriteStallStatsMapKeys {
1889
+ static const std::string& TotalStops();
1890
+ static const std::string& TotalDelays();
1891
+
1892
+ static const std::string& CFL0FileCountLimitDelaysWithOngoingCompaction();
1893
+ static const std::string& CFL0FileCountLimitStopsWithOngoingCompaction();
1894
+
1895
+ // REQUIRES:
1896
+ // `cause` isn't any of these: `WriteStallCause::kNone`,
1897
+ // `WriteStallCause::kCFScopeWriteStallCauseEnumMax`,
1898
+ // `WriteStallCause::kDBScopeWriteStallCauseEnumMax`
1899
+ //
1900
+ // REQUIRES:
1901
+ // `condition` isn't any of these: `WriteStallCondition::kNormal`
1902
+ static std::string CauseConditionCount(WriteStallCause cause,
1903
+ WriteStallCondition condition);
1904
+ };
1905
+
1863
1906
  // Overloaded operators for enum class SizeApproximationFlags.
1864
1907
  inline DB::SizeApproximationFlags operator&(DB::SizeApproximationFlags lhs,
1865
1908
  DB::SizeApproximationFlags rhs) {
@@ -682,6 +682,10 @@ class FileSystem : public Customizable {
682
682
  return IOStatus::OK();
683
683
  }
684
684
 
685
+ // Indicates to upper layers whether the FileSystem supports/uses async IO
686
+ // or not
687
+ virtual bool use_async_io() { return true; }
688
+
685
689
  // If you're adding methods here, remember to add them to EnvWrapper too.
686
690
 
687
691
  private:
@@ -1522,6 +1526,8 @@ class FileSystemWrapper : public FileSystem {
1522
1526
  return target_->AbortIO(io_handles);
1523
1527
  }
1524
1528
 
1529
+ virtual bool use_async_io() override { return target_->use_async_io(); }
1530
+
1525
1531
  protected:
1526
1532
  std::shared_ptr<FileSystem> target_;
1527
1533
  };
@@ -194,12 +194,6 @@ enum class BackgroundErrorReason {
194
194
  kManifestWriteNoWAL,
195
195
  };
196
196
 
197
- enum class WriteStallCondition {
198
- kNormal,
199
- kDelayed,
200
- kStopped,
201
- };
202
-
203
197
  struct WriteStallInfo {
204
198
  // the name of the column family
205
199
  std::string cf_name;
@@ -148,6 +148,13 @@ struct SstFileMetaData : public FileStorageInfo {
148
148
  // For L0, larger `epoch_number` indicates newer L0 file.
149
149
  // 0 if the information is not available.
150
150
  uint64_t epoch_number = 0;
151
+
152
+ // These bounds define the effective key range for range tombstones
153
+ // in this file.
154
+ // Currently only used by CreateColumnFamilyWithImport().
155
+ std::string smallest{}; // Smallest internal key served by table
156
+ std::string largest{}; // Largest internal key served by table
157
+
151
158
  // DEPRECATED: The name of the file within its directory with a
152
159
  // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct
153
160
  // instead.
@@ -1311,12 +1311,12 @@ struct DBOptions {
1311
1311
  // Default: false
1312
1312
  bool best_efforts_recovery = false;
1313
1313
 
1314
- // It defines how many times db resume is called by a separate thread when
1314
+ // It defines how many times DB::Resume() is called by a separate thread when
1315
1315
  // background retryable IO Error happens. When background retryable IO
1316
1316
  // Error happens, SetBGError is called to deal with the error. If the error
1317
1317
  // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write),
1318
1318
  // then db resume is called in background to recover from the error. If this
1319
- // value is 0 or negative, db resume will not be called.
1319
+ // value is 0 or negative, DB::Resume() will not be called automatically.
1320
1320
  //
1321
1321
  // Default: INT_MAX
1322
1322
  int max_bgerror_resume_count = INT_MAX;
@@ -135,9 +135,14 @@ struct PerfContext {
135
135
  // than the snapshot that iterator is using.
136
136
  //
137
137
  uint64_t internal_recent_skipped_count;
138
- // How many values were fed into merge operator by iterators.
138
+ // How many merge operands were fed into the merge operator by iterators.
139
+ // Note: base values are not included in the count.
139
140
  //
140
141
  uint64_t internal_merge_count;
142
+ // How many merge operands were fed into the merge operator by point lookups.
143
+ // Note: base values are not included in the count.
144
+ //
145
+ uint64_t internal_merge_point_lookup_count;
141
146
  // Number of times we reseeked inside a merging iterator, specifically to skip
142
147
  // after or before a range of keys covered by a range deletion in a newer LSM
143
148
  // component.
@@ -99,12 +99,12 @@ class SecondaryCache : public Customizable {
99
99
  // needs to return true.
100
100
  // This hint can also be safely ignored.
101
101
  //
102
- // is_in_sec_cache is to indicate whether the handle is possibly erased
103
- // from the secondary cache after the Lookup.
102
+ // kept_in_sec_cache is to indicate whether the entry will be kept in the
103
+ // secondary cache after the Lookup (rather than erased because of Lookup)
104
104
  virtual std::unique_ptr<SecondaryCacheResultHandle> Lookup(
105
105
  const Slice& key, const Cache::CacheItemHelper* helper,
106
106
  Cache::CreateContext* create_context, bool wait, bool advise_erase,
107
- bool& is_in_sec_cache) = 0;
107
+ bool& kept_in_sec_cache) = 0;
108
108
 
109
109
  // Indicate whether a handle can be erased in this secondary cache.
110
110
  [[nodiscard]] virtual bool SupportForceErase() const = 0;
@@ -415,6 +415,20 @@ enum Tickers : uint32_t {
415
415
  // Number of errors returned to the async read callback
416
416
  ASYNC_READ_ERROR_COUNT,
417
417
 
418
+ // Fine grained secondary cache stats
419
+ SECONDARY_CACHE_FILTER_HITS,
420
+ SECONDARY_CACHE_INDEX_HITS,
421
+ SECONDARY_CACHE_DATA_HITS,
422
+
423
+ // Number of lookups into the prefetched tail (see
424
+ // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`)
425
+ // that can't find their data for table open
426
+ TABLE_OPEN_PREFETCH_TAIL_MISS,
427
+ // Number of lookups into the prefetched tail (see
428
+ // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`)
429
+ // that find their data for table open
430
+ TABLE_OPEN_PREFETCH_TAIL_HIT,
431
+
418
432
  TICKER_ENUM_MAX
419
433
  };
420
434
 
@@ -528,6 +542,10 @@ enum Histograms : uint32_t {
528
542
  // Wait time for aborting async read in FilePrefetchBuffer destructor
529
543
  ASYNC_PREFETCH_ABORT_MICROS,
530
544
 
545
+ // Number of bytes read for RocksDB's prefetching contents (as opposed to file
546
+ // system's prefetch) from the end of SST table during block based table open
547
+ TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
548
+
531
549
  HISTOGRAM_ENUM_MAX
532
550
  };
533
551
 
@@ -63,4 +63,32 @@ enum EntryType {
63
63
  kEntryOther,
64
64
  };
65
65
 
66
+ enum class WriteStallCause {
67
+ // Beginning of CF-scope write stall causes
68
+ //
69
+ // Always keep `kMemtableLimit` as the first stat in this section
70
+ kMemtableLimit,
71
+ kL0FileCountLimit,
72
+ kPendingCompactionBytes,
73
+ kCFScopeWriteStallCauseEnumMax,
74
+ // End of CF-scope write stall causes
75
+
76
+ // Beginning of DB-scope write stall causes
77
+ //
78
+ // Always keep `kWriteBufferManagerLimit` as the first stat in this section
79
+ kWriteBufferManagerLimit,
80
+ kDBScopeWriteStallCauseEnumMax,
81
+ // End of DB-scope write stall causes
82
+
83
+ // Always add new WriteStallCause before `kNone`
84
+ kNone,
85
+ };
86
+
87
+ enum class WriteStallCondition {
88
+ kDelayed,
89
+ kStopped,
90
+ // Always add new WriteStallCondition before `kNormal`
91
+ kNormal,
92
+ };
93
+
66
94
  } // namespace ROCKSDB_NAMESPACE