@nxtedition/rocksdb 13.5.13 → 14.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232) hide show
  1. package/binding.cc +33 -2
  2. package/binding.gyp +2 -2
  3. package/chained-batch.js +9 -16
  4. package/deps/rocksdb/rocksdb/BUCK +18 -1
  5. package/deps/rocksdb/rocksdb/CMakeLists.txt +10 -3
  6. package/deps/rocksdb/rocksdb/Makefile +20 -9
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +90 -13
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +88 -75
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.h +44 -36
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +184 -148
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +5 -11
  12. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +116 -47
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +1 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +3 -6
  15. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +1 -1
  16. package/deps/rocksdb/rocksdb/db/builder.cc +4 -2
  17. package/deps/rocksdb/rocksdb/db/c.cc +207 -0
  18. package/deps/rocksdb/rocksdb/db/c_test.c +72 -0
  19. package/deps/rocksdb/rocksdb/db/column_family.cc +3 -2
  20. package/deps/rocksdb/rocksdb/db/column_family.h +5 -0
  21. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +4 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +2 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +51 -38
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +29 -12
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +5 -10
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +566 -366
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +131 -4
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +1 -0
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +7 -0
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +4 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +13 -14
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +12 -7
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -10
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +97 -76
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +11 -14
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +1 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +16 -3
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +1 -0
  40. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +448 -1
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +22 -20
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +4 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +5 -5
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +7 -3
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +1 -1
  46. package/deps/rocksdb/rocksdb/db/db_iter.cc +104 -0
  47. package/deps/rocksdb/rocksdb/db/db_iter.h +4 -11
  48. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +331 -58
  49. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +129 -0
  50. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +64 -0
  51. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +40 -0
  52. package/deps/rocksdb/rocksdb/db/db_test2.cc +25 -15
  53. package/deps/rocksdb/rocksdb/db/db_test_util.cc +42 -24
  54. package/deps/rocksdb/rocksdb/db/db_test_util.h +29 -14
  55. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +69 -36
  56. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +0 -1
  57. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  58. package/deps/rocksdb/rocksdb/db/experimental.cc +5 -4
  59. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +8 -1
  60. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +275 -79
  61. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +23 -5
  62. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +591 -175
  63. package/deps/rocksdb/rocksdb/db/flush_job.cc +3 -4
  64. package/deps/rocksdb/rocksdb/db/log_reader.cc +5 -2
  65. package/deps/rocksdb/rocksdb/db/memtable.cc +84 -35
  66. package/deps/rocksdb/rocksdb/db/memtable.h +39 -34
  67. package/deps/rocksdb/rocksdb/db/merge_helper.cc +1 -0
  68. package/deps/rocksdb/rocksdb/db/merge_operator.cc +1 -1
  69. package/deps/rocksdb/rocksdb/db/multi_scan.cc +11 -5
  70. package/deps/rocksdb/rocksdb/db/version_edit.cc +1 -1
  71. package/deps/rocksdb/rocksdb/db/version_edit.h +1 -1
  72. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +34 -14
  73. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +28 -5
  74. package/deps/rocksdb/rocksdb/db/version_set.cc +159 -14
  75. package/deps/rocksdb/rocksdb/db/version_set.h +2 -0
  76. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -1
  77. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +60 -0
  78. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +16 -1
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_service.h +75 -10
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.cc +28 -0
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compression_manager.h +2 -0
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +31 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +50 -2
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +57 -0
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h +0 -4
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +266 -35
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +5 -0
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +0 -6
  89. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +18 -2
  90. package/deps/rocksdb/rocksdb/env/env.cc +12 -0
  91. package/deps/rocksdb/rocksdb/env/env_test.cc +18 -0
  92. package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +2 -0
  93. package/deps/rocksdb/rocksdb/env/fs_posix.cc +9 -5
  94. package/deps/rocksdb/rocksdb/env/io_posix.cc +4 -2
  95. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +19 -0
  96. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +33 -31
  97. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +42 -9
  98. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +93 -0
  99. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +43 -49
  100. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +4 -3
  101. package/deps/rocksdb/rocksdb/include/rocksdb/compression_type.h +8 -6
  102. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +487 -0
  103. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +11 -12
  104. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +135 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -0
  106. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +12 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +1 -1
  108. package/deps/rocksdb/rocksdb/include/rocksdb/ldb_tool.h +8 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +12 -8
  110. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +3 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/multi_scan.h +19 -9
  112. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +219 -24
  113. package/deps/rocksdb/rocksdb/include/rocksdb/point_lock_bench_tool.h +14 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +2 -2
  115. package/deps/rocksdb/rocksdb/include/rocksdb/slice.h +1 -1
  116. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +7 -0
  117. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +16 -0
  118. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +16 -4
  119. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +13 -0
  120. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +4 -0
  121. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +0 -2
  122. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +45 -0
  123. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h +1 -1
  124. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +1 -1
  125. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +6 -1
  126. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +21 -0
  127. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  128. package/deps/rocksdb/rocksdb/memory/memory_allocator_impl.h +3 -3
  129. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +77 -51
  130. package/deps/rocksdb/rocksdb/memtable/skiplist.h +10 -13
  131. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +16 -7
  132. package/deps/rocksdb/rocksdb/memtable/vectorrep.cc +9 -4
  133. package/deps/rocksdb/rocksdb/monitoring/iostats_context.cc +2 -0
  134. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +6 -0
  135. package/deps/rocksdb/rocksdb/options/cf_options.cc +13 -1
  136. package/deps/rocksdb/rocksdb/options/cf_options.h +6 -2
  137. package/deps/rocksdb/rocksdb/options/options.cc +2 -0
  138. package/deps/rocksdb/rocksdb/options/options_helper.cc +9 -8
  139. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +9 -5
  140. package/deps/rocksdb/rocksdb/port/mmap.cc +1 -1
  141. package/deps/rocksdb/rocksdb/port/win/xpress_win.cc +51 -0
  142. package/deps/rocksdb/rocksdb/port/win/xpress_win.h +4 -0
  143. package/deps/rocksdb/rocksdb/src.mk +8 -2
  144. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1125 -765
  145. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +35 -24
  146. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +29 -4
  147. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +732 -256
  148. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +225 -16
  149. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +102 -26
  150. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -1
  151. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +2 -75
  152. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +433 -141
  153. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +2 -0
  154. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +17 -10
  155. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy_impl.h +20 -0
  156. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +112 -85
  157. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +191 -36
  158. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +2 -2
  159. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  160. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +108 -31
  161. package/deps/rocksdb/rocksdb/table/external_table.cc +7 -3
  162. package/deps/rocksdb/rocksdb/table/format.cc +6 -12
  163. package/deps/rocksdb/rocksdb/table/format.h +10 -0
  164. package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
  165. package/deps/rocksdb/rocksdb/table/iterator_wrapper.h +1 -1
  166. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -1
  167. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +5 -0
  168. package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -1
  169. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +118 -46
  170. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +9 -8
  171. package/deps/rocksdb/rocksdb/table/table_builder.h +5 -0
  172. package/deps/rocksdb/rocksdb/table/table_properties.cc +16 -0
  173. package/deps/rocksdb/rocksdb/table/table_test.cc +1540 -155
  174. package/deps/rocksdb/rocksdb/test_util/testutil.h +21 -5
  175. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +26 -5
  176. package/deps/rocksdb/rocksdb/tools/ldb.cc +1 -2
  177. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +2 -0
  178. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -3
  179. package/deps/rocksdb/rocksdb/tools/sst_dump_test.cc +133 -165
  180. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +173 -64
  181. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +69 -0
  182. package/deps/rocksdb/rocksdb/util/atomic.h +6 -0
  183. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +29 -20
  184. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +10 -6
  185. package/deps/rocksdb/rocksdb/util/bit_fields.h +338 -0
  186. package/deps/rocksdb/rocksdb/util/coding.h +3 -3
  187. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +2 -2
  188. package/deps/rocksdb/rocksdb/util/compression.cc +777 -82
  189. package/deps/rocksdb/rocksdb/util/compression.h +5 -0
  190. package/deps/rocksdb/rocksdb/util/compression_test.cc +5 -3
  191. package/deps/rocksdb/rocksdb/util/dynamic_bloom.cc +2 -2
  192. package/deps/rocksdb/rocksdb/util/dynamic_bloom.h +15 -14
  193. package/deps/rocksdb/rocksdb/util/interval_test.cc +102 -0
  194. package/deps/rocksdb/rocksdb/util/semaphore.h +164 -0
  195. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +10 -6
  196. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -2
  197. package/deps/rocksdb/rocksdb/util/slice_test.cc +136 -0
  198. package/deps/rocksdb/rocksdb/util/status.cc +1 -0
  199. package/deps/rocksdb/rocksdb/util/string_util.cc +2 -16
  200. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +1 -1
  201. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -1
  202. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +7 -4
  203. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +35 -14
  204. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_test.cc +2 -0
  205. package/deps/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc +5 -2
  206. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/any_lock_manager_test.h +244 -0
  207. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench.cc +18 -0
  208. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_bench_tool.cc +159 -0
  209. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc +1244 -161
  210. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h +66 -12
  211. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_stress_test.cc +103 -0
  212. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc +1275 -8
  213. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h +40 -262
  214. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test_common.h +78 -0
  215. package/deps/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_validation_test_runner.h +469 -0
  216. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc +2 -6
  217. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +4 -0
  218. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +9 -1
  219. package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +18 -9
  220. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +2 -0
  221. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc +2 -1
  222. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +72 -44
  223. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +92 -15
  224. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +6 -20
  225. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +143 -112
  226. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc +23 -16
  227. package/index.js +3 -3
  228. package/package.json +1 -1
  229. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  230. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  231. package/util.h +38 -12
  232. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc +0 -17
@@ -498,6 +498,7 @@ struct CompactionServiceJobInfo {
498
498
  // the output level of the compaction.
499
499
  int output_level;
500
500
 
501
+ CompactionServiceJobInfo() {}
501
502
  CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
502
503
  std::string db_session_id_, uint32_t cf_id_,
503
504
  std::string cf_name_, uint64_t job_id_,
@@ -622,6 +623,7 @@ struct DBOptions {
622
623
  // checking for corruption, including
623
624
  // * paranoid_file_checks
624
625
  // * paranoid_memory_checks
626
+ // * memtable_veirfy_per_key_checksum_on_seek
625
627
  // * DB::VerifyChecksum()
626
628
  //
627
629
  // Default: true
@@ -1360,16 +1362,11 @@ struct DBOptions {
1360
1362
  // Dynamically changeable through SetDBOptions() API.
1361
1363
  bool avoid_flush_during_shutdown = false;
1362
1364
 
1363
- // Set this option to true during creation of database if you want
1364
- // to be able to ingest behind (call IngestExternalFile() skipping keys
1365
- // that already exist, rather than overwriting matching keys).
1366
- // Setting this option to true has the following effects:
1367
- // 1) Disable some internal optimizations around SST file compression.
1368
- // 2) Reserve the last level for ingested files only.
1369
- // 3) Compaction will not include any file from the last level.
1370
- // Note that only Universal Compaction supports allow_ingest_behind.
1371
- // `num_levels` should be >= 3 if this option is turned on.
1365
+ // DEPRECATED: use ColumnFamilyOptions::cf_allow_ingest_behind instead.
1366
+ // This option might be removed in a future release.
1372
1367
  //
1368
+ // See comment for `ColumnFamilyOptions::cf_allow_ingest_behind` for
1369
+ // detail about the option's functionality and use cases.
1373
1370
  //
1374
1371
  // DEFAULT: false
1375
1372
  // Immutable.
@@ -1780,6 +1777,119 @@ struct ScanOptions {
1780
1777
  : range(_start, _upper_bound) {}
1781
1778
  };
1782
1779
 
1780
+ // Container for multiple scan ranges that can be used with MultiScan.
1781
+ // This replaces std::vector<ScanOptions> with a more efficient implementation
1782
+ // that can merge overlapping ranges.
1783
+ class MultiScanArgs {
1784
+ public:
1785
+ // Constructor that takes a comparator
1786
+ explicit MultiScanArgs(const Comparator* comparator) : comp_(comparator) {}
1787
+
1788
+ // Copy Constructor
1789
+ MultiScanArgs(const MultiScanArgs& other) {
1790
+ comp_ = other.comp_;
1791
+ original_ranges_ = other.original_ranges_;
1792
+ io_coalesce_threshold = other.io_coalesce_threshold;
1793
+ max_prefetch_size = other.max_prefetch_size;
1794
+ use_async_io = other.use_async_io;
1795
+ }
1796
+ MultiScanArgs(MultiScanArgs&& other) noexcept
1797
+ : io_coalesce_threshold(other.io_coalesce_threshold),
1798
+ max_prefetch_size(other.max_prefetch_size),
1799
+ use_async_io(other.use_async_io),
1800
+ comp_(other.comp_),
1801
+ original_ranges_(std::move(other.original_ranges_)) {}
1802
+
1803
+ MultiScanArgs& operator=(const MultiScanArgs& other) {
1804
+ comp_ = other.comp_;
1805
+ original_ranges_ = other.original_ranges_;
1806
+ io_coalesce_threshold = other.io_coalesce_threshold;
1807
+ max_prefetch_size = other.max_prefetch_size;
1808
+ use_async_io = other.use_async_io;
1809
+ return *this;
1810
+ }
1811
+
1812
+ MultiScanArgs& operator=(MultiScanArgs&& other) noexcept {
1813
+ if (this != &other) {
1814
+ comp_ = other.comp_;
1815
+ original_ranges_ = std::move(other.original_ranges_);
1816
+ io_coalesce_threshold = other.io_coalesce_threshold;
1817
+ max_prefetch_size = other.max_prefetch_size;
1818
+ use_async_io = other.use_async_io;
1819
+ }
1820
+ return *this;
1821
+ }
1822
+
1823
+ void insert(const Slice& s, const Slice& b) {
1824
+ original_ranges_.emplace_back(s, b);
1825
+ }
1826
+
1827
+ void insert(const Slice& s, const Slice& b,
1828
+ const std::optional<std::unordered_map<std::string, std::string>>&
1829
+ property_bag) {
1830
+ original_ranges_.emplace_back(s, b);
1831
+ original_ranges_.back().property_bag = property_bag;
1832
+ }
1833
+
1834
+ void insert(const Slice& s) { original_ranges_.emplace_back(s); }
1835
+
1836
+ void insert(const Slice& s,
1837
+ const std::optional<std::unordered_map<std::string, std::string>>&
1838
+ property_bag) {
1839
+ original_ranges_.emplace_back(s);
1840
+ original_ranges_.back().property_bag = property_bag;
1841
+ }
1842
+
1843
+ size_t size() const { return original_ranges_.size(); }
1844
+ bool empty() const { return original_ranges_.empty(); }
1845
+
1846
+ void reserve(size_t size) { original_ranges_.reserve(size); }
1847
+
1848
+ operator std::vector<ScanOptions>*() { return &original_ranges_; }
1849
+
1850
+ operator const std::vector<ScanOptions>*() const { return &original_ranges_; }
1851
+
1852
+ ~MultiScanArgs() {}
1853
+
1854
+ const std::vector<ScanOptions>& GetScanRanges() const {
1855
+ return original_ranges_;
1856
+ }
1857
+
1858
+ const Comparator* GetComparator() const { return comp_; }
1859
+
1860
+ // Copies the configurations (excluding actual scan ranges) from another
1861
+ // MultiScanArgs.
1862
+ void CopyConfigFrom(const MultiScanArgs& other) {
1863
+ io_coalesce_threshold = other.io_coalesce_threshold;
1864
+ max_prefetch_size = other.max_prefetch_size;
1865
+ use_async_io = other.use_async_io;
1866
+ }
1867
+
1868
+ uint64_t io_coalesce_threshold = 16 << 10; // 16KB by default
1869
+
1870
+ // Maximum size (in bytes) for the data blocks loaded by a MultiScan.
1871
+ // This limits the amount of I/O and memory usage by pinned data blocks.
1872
+ //
1873
+ // When set to 0 (the default), there is no limit. When the limit is reached,
1874
+ // the iterator will start returning Status::PrefetchLimitReached().
1875
+ //
1876
+ // Note that prefetching happens only once in Prepare(), which is different
1877
+ // from ReadOptions::readahead_size, which applies any time the iterator does
1878
+ // I/O.
1879
+ // Note that this limit is per file and applies to compressed block size.
1880
+ uint64_t max_prefetch_size = 0;
1881
+
1882
+ // Enable async I/O for multi-scan operations
1883
+ // When true, BlockBasedTableIterator will use ReadAsync() for reading blocks
1884
+ // When false, it will use synchronous MultiRead().
1885
+ bool use_async_io = false;
1886
+
1887
+ private:
1888
+ // The comparator used for ordering ranges
1889
+ const Comparator* comp_;
1890
+ std::vector<ScanOptions> original_ranges_;
1891
+ };
1892
+
1783
1893
  // Options that control read operations
1784
1894
  struct ReadOptions {
1785
1895
  // *** BEGIN options relevant to point lookups as well as scans ***
@@ -2344,7 +2454,47 @@ struct CompactRangeOptions {
2344
2454
  double blob_garbage_collection_age_cutoff = -1;
2345
2455
  };
2346
2456
 
2347
- // IngestExternalFileOptions is used by IngestExternalFile()
2457
+ // IngestExternalFileOptions setting guide:
2458
+ //
2459
+ // The options in IngestExternalFileOptions interact in complex ways depending
2460
+ // on the source and overlap of SST files. Below is a summary of recommended
2461
+ // non-default settings for common use cases:
2462
+ //
2463
+ // 1. Ingesting only SST writer generated non-overlapping SSTs that are not
2464
+ // expected to overlap with existing data:
2465
+ // - Optionally set fail_if_not_bottommost_level = true to enforce placement
2466
+ // in the last level. This is better paired with SST partitioner to guarantee
2467
+ // that there is no existing file with keys across the ingesting key range.
2468
+ // - Set allow_blocking_flush to false: Not expecting to overlap with
2469
+ // memtable and cause a flush.
2470
+ // - If snapshot consistency is not expected, set snapshot_consistency to
2471
+ // false and allow_global_seqno to false. allow_global_seqno = false will
2472
+ // fail ingestion if any input files overlap with each other.
2473
+ //
2474
+ // 2. Ingesting SST writer generated overlapping SSTs:
2475
+ // - Order files with older updates first, newer overwrites later.
2476
+ // - Set allow_global_seqno = true since newer files need to be assigned
2477
+ // larger sequence numbers.
2478
+ //
2479
+ // 3. Ingesting DB generated SSTs: overlapping with target CF data is not
2480
+ // allowed. Input files are allowed to contain both DB generated files and SST
2481
+ // file writer generated files. They will all be treated as DB generated.
2482
+ // - Set allow_db_generated_files = true.
2483
+ // - Set snapshot_consistency = false: snapshot consistency requires
2484
+ // assigning higher sequence number to ingested files. DB generated files
2485
+ // don't support global seqno assignment yet.
2486
+ // - Set allow_blocking_flush to false: Not expecting to overlap with
2487
+ // memtable and cause a flush.
2488
+ // - If the source live DB is running, set link_files = true instead of
2489
+ // move_files.
2490
+ // 3a) SST files are non-overlapping and all keys have seqno 0: e.g., a
2491
+ // temporary RocksDB instance used to sort some data, and compacts all
2492
+ // data into the last level before ingestion.
2493
+ // - Optionally set fail_if_not_bottommost_level = true to enforce placement
2494
+ // in the last level.
2495
+ // 3b) SST files are overlapping, e.g. ingesting files from one CF to another.
2496
+ // - Ensure older updates are ordered first and newer updates are ordered
2497
+ // later. See more in option comment for allow_db_generated_files.
2348
2498
  struct IngestExternalFileOptions {
2349
2499
  // Can be set to true to move the files instead of copying them.
2350
2500
  // The input files will be unlinked after successful ingestion.
@@ -2361,10 +2511,20 @@ struct IngestExternalFileOptions {
2361
2511
  // If set to false, an ingested file keys could appear in existing snapshots
2362
2512
  // that were created before the file was ingested.
2363
2513
  bool snapshot_consistency = true;
2364
- // If set to false, IngestExternalFile() will fail if the file key range
2514
+ // Enables assigning a global sequence number to each ingested file, i.e.,
2515
+ // all keys in the ingested file will be treated as having this seqno.
2516
+ // If set to false, we will use the sequence numbers in the ingested file
2517
+ // as is, and IngestExternalFile() will fail if the ingested key range
2365
2518
  // overlaps with existing keys or tombstones or output of ongoing compaction
2366
- // during file ingestion in the DB (the conditions under which a global_seqno
2367
- // must be assigned to the ingested file).
2519
+ // in the CF (the conditions under which a global seqno must be assigned to
2520
+ // the ingested file).
2521
+ // If the ingested files overlap with each other, we need to assign global
2522
+ // sequence to the ingested files and this option needs to be enabled. One
2523
+ // exception to this is when ingesting DB generated SST files (see option
2524
+ // allow_db_generated_files below). DB generated files do not support
2525
+ // global seqno assignment and can be ingested even if they overlap with
2526
+ // each other. This option has no effect when allow_db_generated_files is
2527
+ // enabled.
2368
2528
  bool allow_global_seqno = true;
2369
2529
  // Normally (true), IngestExternalFile() will trigger and block for flushing
2370
2530
  // memtable(s) if there is overlap between ingested files and memtable(s). If
@@ -2376,8 +2536,8 @@ struct IngestExternalFileOptions {
2376
2536
  // to be skipped rather than overwriting existing data under that key.
2377
2537
  // Use case: back-fill of some historical data in the database without
2378
2538
  // over-writing existing newer version of data.
2379
- // This option could only be used if the DB has been running
2380
- // with allow_ingest_behind=true since the dawn of time.
2539
+ // This option could only be used if the CF has been running
2540
+ // with cf_allow_ingest_behind=true since CF creation (or before any write).
2381
2541
  // All files will be ingested at the bottommost level with seqno=0.
2382
2542
  bool ingest_behind = false;
2383
2543
  // DEPRECATED - Set to true if you would like to write global_seqno to
@@ -2430,18 +2590,53 @@ struct IngestExternalFileOptions {
2430
2590
  //
2431
2591
  // XXX: "bottommost" is obsolete/confusing terminology to refer to last level
2432
2592
  bool fail_if_not_bottommost_level = false;
2433
- // EXPERIMENTAL
2434
- // Enables ingestion of files not generated by SstFileWriter. When true:
2593
+ // EXPERIMENTAL, SUBJECT TO CHANGE
2594
+ //
2595
+ // Enables special mode of ingestion that allows files generated by a live DB,
2596
+ // instead of SstFileWriter. When true:
2435
2597
  // - Allows files to be ingested when their cf_id doesn't match the CF they
2436
2598
  // are being ingested into.
2599
+ // - Allows files with any sequence numbers to be ingested.
2600
+ // - Original sequence numbers are preserved (no reassignment).
2601
+ //
2437
2602
  // REQUIREMENTS:
2438
- // - Ingested files must not overlap with existing keys.
2439
- // - `write_global_seqno` must be false.
2440
- // - All keys in ingested files should have sequence number 0. We fail
2441
- // ingestion if any sequence numbers is non-zero.
2442
- // WARNING: If a DB contains ingested files generated by another DB/CF,
2443
- // RepairDB() may not recover these files correctly, potentially leading to
2444
- // data loss.
2603
+ // - Ingested files must NOT overlap with any existing data in the DB, since
2604
+ // no sequence number reassignment is performed on db generated files.
2605
+ // Ingestion will fail if any overlap is detected. However, input files
2606
+ // are allowed to overlap with each other when this option is enabled. This
2607
+ // is useful when ingesting multiple levels of files from a CF, where
2608
+ // levels naturally overlap with each other.
2609
+ // - CAUTION: If input files overlap with each other, then for any given user
2610
+ // key appearing in multiple files, earlier files MUST have smaller sequence
2611
+ // numbers than later files. Later files will be placed at a higher level
2612
+ // (smaller level number). This is to ensure the LSM invariant where for
2613
+ // the same key, recent updates are in higher levels. This means that
2614
+ // if you are ingesting files from multiple levels of a CF, you should
2615
+ // put files from lower levels first, and files from higher levels later.
2616
+ // Example for getting files from a CF for ingestion:
2617
+ //
2618
+ // ColumnFamilyMetaData cf_meta;
2619
+ // from_db->GetColumnFamilyMetaData(from_cf, &cf_meta);
2620
+ // // iterate in reverse to start from lowest level
2621
+ // for (auto level_meta = cf_meta.levels.rbegin();
2622
+ // level_meta != cf_meta.levels.rend(); ++level_meta) {
2623
+ // // L0 files need to be added in reverse order so we iterate in reverse
2624
+ // // within a level too
2625
+ // for (auto file_meta = level_meta->files.rbegin();
2626
+ // file_meta != level_meta->files.rend(); ++file_meta) {
2627
+ // // Add file for ingestion
2628
+ // }
2629
+ // }
2630
+ //
2631
+ // WARNING: Violating the sequence number ordering requirement will cause
2632
+ // LSM invariant violations and may lead to incorrect reads or data
2633
+ // corruption.
2634
+ // - If you would like to enforce that the ingested files do not overlap
2635
+ // with each other, you can set `fail_if_not_bottommost_level` to true.
2636
+ // If ingested files overlap with each other, some file will be placed
2637
+ // above Lmax, failing the ingestion if the option is set.
2638
+ // - `write_global_seqno` must be false (sequence numbers cannot be
2639
+ // reassigned).
2445
2640
  bool allow_db_generated_files = false;
2446
2641
 
2447
2642
  // Controls whether data and metadata blocks (e.g. index, filter) read during
@@ -0,0 +1,14 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ //
3
+ // This source code is licensed under both the GPLv2 (found in the
4
+ // COPYING file in the root directory) and Apache 2.0 License
5
+ // (found in the LICENSE.Apache file in the root directory).
6
+
7
+ #pragma once
8
+
9
+ #include "rocksdb/rocksdb_namespace.h"
10
+
11
+ namespace ROCKSDB_NAMESPACE {
12
+
13
+ int point_lock_bench_tool(int argc, char** argv);
14
+ } // namespace ROCKSDB_NAMESPACE
@@ -33,8 +33,8 @@ namespace ROCKSDB_NAMESPACE {
33
33
  // Wait() or SecondaryCache::WaitAll() may be skipped if IsReady() happens to
34
34
  // return true, but (depending on the implementation) IsReady() might never
35
35
  // return true without Wait() or SecondaryCache::WaitAll(). After the handle
36
- // is known ready, calling Value() is required to avoid a memory leak in case
37
- // of a cache hit.
36
+ // is known ready, calling Value() and taking ownership is required to avoid
37
+ // a memory leak in case of a cache hit.
38
38
  class SecondaryCacheResultHandle {
39
39
  public:
40
40
  virtual ~SecondaryCacheResultHandle() = default;
@@ -24,7 +24,7 @@
24
24
  #include <cstdio>
25
25
  #include <cstring>
26
26
  #include <string>
27
- #include <string_view> // RocksDB now requires C++17 support
27
+ #include <string_view>
28
28
 
29
29
  #include "rocksdb/cleanable.h"
30
30
 
@@ -443,10 +443,14 @@ enum Tickers : uint32_t {
443
443
  // Tiered storage related statistics
444
444
  HOT_FILE_READ_BYTES,
445
445
  WARM_FILE_READ_BYTES,
446
+ COOL_FILE_READ_BYTES,
446
447
  COLD_FILE_READ_BYTES,
448
+ ICE_FILE_READ_BYTES,
447
449
  HOT_FILE_READ_COUNT,
448
450
  WARM_FILE_READ_COUNT,
451
+ COOL_FILE_READ_COUNT,
449
452
  COLD_FILE_READ_COUNT,
453
+ ICE_FILE_READ_COUNT,
450
454
 
451
455
  // Last level and non-last level read statistics
452
456
  LAST_LEVEL_READ_BYTES,
@@ -542,6 +546,9 @@ enum Tickers : uint32_t {
542
546
  // TransactionOptions::large_txn_commit_optimize_threshold.
543
547
  NUMBER_WBWI_INGEST,
544
548
 
549
+ // Failure to load the user-defined index (UDI) during SST table open
550
+ SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT,
551
+
545
552
  TICKER_ENUM_MAX
546
553
  };
547
554
 
@@ -115,6 +115,8 @@ class Status {
115
115
  kIOFenced = 14,
116
116
  kMergeOperatorFailed = 15,
117
117
  kMergeOperandThresholdExceeded = 16,
118
+ kPrefetchLimitReached = 17,
119
+ kNotExpectedCodePath = 18,
118
120
  kMaxSubCode
119
121
  };
120
122
 
@@ -318,12 +320,19 @@ class Status {
318
320
 
319
321
  static Status LockLimit() { return Status(kAborted, kLockLimit); }
320
322
 
323
+ static Status PrefetchLimitReached() {
324
+ return Status(kIncomplete, kPrefetchLimitReached);
325
+ }
326
+
321
327
  // Returns true iff the status indicates success.
322
328
  bool ok() const {
323
329
  MarkChecked();
324
330
  return code() == kOk;
325
331
  }
326
332
 
333
+ // Assert the status is OK in debug mode
334
+ void AssertOK() const { assert(ok()); }
335
+
327
336
  // Returns true iff the status indicates success *with* something
328
337
  // overwritten
329
338
  bool IsOkOverwritten() const {
@@ -486,6 +495,13 @@ class Status {
486
495
  return (code() == kIOError) && (subcode() == kIOFenced);
487
496
  }
488
497
 
498
+ // Returns true iff the status indicates prefetch limit reached during
499
+ // MultiScan.
500
+ bool IsPrefetchLimitReached() const {
501
+ MarkChecked();
502
+ return (code() == kIncomplete) && (subcode() == kPrefetchLimitReached);
503
+ }
504
+
489
505
  // Return a string representation of this status suitable for printing.
490
506
  // Returns the string "OK" for success.
491
507
  std::string ToString() const;
@@ -440,10 +440,13 @@ struct BlockBasedTableOptions {
440
440
  // versions of RocksDB able to read partitioned filters are able to read
441
441
  // decoupled partitioned filters.)
442
442
  //
443
- // decouple_partitioned_filters = false is the original behavior, because of
444
- // limitations in the initial implementation, and the new behavior
445
- // decouple_partitioned_filters = true is expected to become the new default.
446
- bool decouple_partitioned_filters = false;
443
+ // decouple_partitioned_filters = true is the new default. This option is now
444
+ // DEPRECATED and might be ignored and/or removed in a future release.
445
+ //
446
+ // NOTE: decouple_partitioned_filters = false with partition_filters = true
447
+ // disables parallel compression (CompressionOptions::parallel_threads
448
+ // sanitized to 1).
449
+ bool decouple_partitioned_filters = true;
447
450
 
448
451
  // Option to generate Bloom/Ribbon filters that minimize memory
449
452
  // internal fragmentation.
@@ -501,8 +504,17 @@ struct BlockBasedTableOptions {
501
504
  // If non-nullptr, use the specified factory to build user-defined index.
502
505
  // This allows users to define their own index format and build the index
503
506
  // during table building.
507
+ //
508
+ // NOTE: UserDefinedIndexFactory currently disables parallel compression
509
+ // (CompressionOptions::parallel_threads sanitized to 1).
504
510
  std::shared_ptr<UserDefinedIndexFactory> user_defined_index_factory = nullptr;
505
511
 
512
+ // EXPERIMENTAL
513
+ //
514
+ // Return an error Status if a user_defined_index_factory is configured,
515
+ // but there's no corresponding UDI block in the SST file being opened.
516
+ bool fail_if_no_udi_on_open = false;
517
+
506
518
  // If true, place whole keys in the filter (not just prefixes).
507
519
  // This must generally be true for gets to be efficient.
508
520
  bool whole_key_filtering = true;
@@ -76,6 +76,7 @@ struct TablePropertiesNames {
76
76
  static const std::string kTailStartOffset;
77
77
  static const std::string kUserDefinedTimestampsPersisted;
78
78
  static const std::string kKeyLargestSeqno;
79
+ static const std::string kKeySmallestSeqno;
79
80
  };
80
81
 
81
82
  // `TablePropertiesCollector` provides the mechanism for users to collect
@@ -220,6 +221,8 @@ struct TableProperties {
220
221
  uint64_t orig_file_number = 0;
221
222
  // the total size of all data blocks.
222
223
  uint64_t data_size = 0;
224
+ // the total uncompressed size of all data blocks (since RocksDB 10.7)
225
+ uint64_t uncompressed_data_size = 0;
223
226
  // the size of index block.
224
227
  uint64_t index_size = 0;
225
228
  // Total number of index partitions if kTwoLevelIndexSearch is used
@@ -307,6 +310,16 @@ struct TableProperties {
307
310
  // table is empty).
308
311
  uint64_t key_largest_seqno = UINT64_MAX;
309
312
 
313
+ bool HasKeyLargestSeqno() const { return key_largest_seqno != UINT64_MAX; }
314
+
315
+ // The smallest sequence number of keys in this file.
316
+ // UINT64_MAX means unknown.
317
+ // Only written to properties block if known (should be known unless the
318
+ // table is empty).
319
+ uint64_t key_smallest_seqno = UINT64_MAX;
320
+
321
+ bool HasKeySmallestSeqno() const { return key_smallest_seqno != UINT64_MAX; }
322
+
310
323
  // DB identity
311
324
  // db_id is an identifier generated the first time the DB is created
312
325
  // If DB identity is unset or unassigned, `db_id` will be an empty string.
@@ -118,7 +118,11 @@ enum class Temperature : uint8_t {
118
118
  kUnknown = 0,
119
119
  kHot = 0x04,
120
120
  kWarm = 0x08,
121
+ kCool = 0x0A,
121
122
  kCold = 0x0C,
123
+ kIce = 0x10,
124
+ // XXX: this is mis-named. It is instead an invalid temperature beyond the
125
+ // rest
122
126
  kLastTemperature,
123
127
  };
124
128
 
@@ -144,9 +144,7 @@ class CompactionOptionsUniversal {
144
144
  incremental(false),
145
145
  reduce_file_locking(false) {}
146
146
 
147
- #if __cplusplus >= 202002L
148
147
  bool operator==(const CompactionOptionsUniversal& rhs) const = default;
149
- #endif
150
148
  };
151
149
 
152
150
  } // namespace ROCKSDB_NAMESPACE
@@ -27,6 +27,10 @@ inline const std::string kUserDefinedIndexPrefix =
27
27
  // It allows users to define their own index format and build custom
28
28
  // indexes during table building. Currently, only a monolithic index
29
29
  // block is supported (no partitioned index).
30
+ //
31
+ // This is currently supported only for a restricted set of use cases. The
32
+ // CF must be ingest only, and only files containing Puts generated by
33
+ // SstFileWriter are supported.
30
34
 
31
35
  // The interface for building user-defined index.
32
36
  class UserDefinedIndexBuilder {
@@ -51,6 +55,10 @@ class UserDefinedIndexBuilder {
51
55
  // The previous index entry key and the new index entry key cover
52
56
  // all the keys in the data block associated with the new index entry.
53
57
  //
58
+ // The last_key_in_current_block and first_key_in_next_block will be user
59
+ // keys, i.e., the user key string, and optionally the user timestamp if one
60
+ // is configured, without a sequence number suffix.
61
+ //
54
62
  // Called before the OnKeyAdded() call for first_key_in_next_block.
55
63
  // @last_key_in_current_block: The last key in the current data block
56
64
  // @first_key_in_next_block: it will be nullptr if the entry being added is
@@ -72,6 +80,9 @@ class UserDefinedIndexBuilder {
72
80
  // override OnKeyAdded() if they need to collect additional information.
73
81
  // The type argument indicates whether the value is a full value or partial.
74
82
  // At the moment, only full values are supported.
83
+ //
84
+ // The key will be a user key. RocksDB guarantees that there will only be
85
+ // one entry for each key in the file/index.
75
86
  virtual void OnKeyAdded(const Slice& /*key*/, ValueType /*type*/,
76
87
  const Slice& /*value*/) {}
77
88
 
@@ -100,6 +111,14 @@ class UserDefinedIndexIterator {
100
111
  // termination criteria, kInbound if the data block is definitely fully
101
112
  // within bounds, or kUnknown if the data block could be partially
102
113
  // within bounds.
114
+ // The UDI implementation needs to be careful about returning kOutOfBound.
115
+ // If a limit key is specified in ScanOptions, an implementation that
116
+ // does not store the first key in the block for the corresponding index
117
+ // entry cannot reliably determine if the block is out of bounds. It must
118
+ // compare against the previous index key to determine if the current block
119
+ // is out of bounds w.r.t. the limit. Other termination criteria (specified
120
+ // in property_bag) may cause the scan to terminate earlier, in which case
121
+ // kOutOfBound can be returned earlier.
103
122
  virtual Status SeekAndGetResult(const Slice& target,
104
123
  IterateResult* result) = 0;
105
124
 
@@ -125,11 +144,22 @@ class UserDefinedIndexReader {
125
144
  virtual size_t ApproximateMemoryUsage() const = 0;
126
145
  };
127
146
 
147
+ // Options for user defined index
148
+ struct UserDefinedIndexOption {
149
+ const Comparator* comparator = BytewiseComparator();
150
+ };
151
+
128
152
  // Factory for creating user-defined index builders.
129
153
  class UserDefinedIndexFactory : public Customizable {
130
154
  public:
131
155
  virtual ~UserDefinedIndexFactory() = default;
132
156
 
157
+ static const char* Type() { return "UserDefinedIndexFactory"; }
158
+
159
+ static Status CreateFromString(
160
+ const ConfigOptions& config_options, const std::string& value,
161
+ std::shared_ptr<UserDefinedIndexFactory>* factory);
162
+
133
163
  // Create a new builder for user-defined index.
134
164
  virtual UserDefinedIndexBuilder* NewBuilder() const = 0;
135
165
 
@@ -137,6 +167,21 @@ class UserDefinedIndexFactory : public Customizable {
137
167
  // block
138
168
  virtual std::unique_ptr<UserDefinedIndexReader> NewReader(
139
169
  Slice& index_block) const = 0;
170
+
171
+ // New API for allowing customized comparator
172
+ virtual Status NewBuilder(
173
+ const UserDefinedIndexOption& /*option*/,
174
+ std::unique_ptr<UserDefinedIndexBuilder>& builder) const {
175
+ builder.reset(NewBuilder());
176
+ return Status::OK();
177
+ };
178
+
179
+ virtual Status NewReader(
180
+ const UserDefinedIndexOption& /*option*/, Slice& index_block,
181
+ std::unique_ptr<UserDefinedIndexReader>& reader) const {
182
+ reader = NewReader(index_block);
183
+ return Status::OK();
184
+ };
140
185
  };
141
186
 
142
187
  } // namespace ROCKSDB_NAMESPACE
@@ -90,7 +90,7 @@ class CacheDumper {
90
90
  public:
91
91
  virtual ~CacheDumper() = default;
92
92
  // Only dump the blocks in the block cache that belong to the DBs in this list
93
- virtual Status SetDumpFilter(std::vector<DB*> db_list) {
93
+ virtual Status SetDumpFilter(const std::vector<DB*>& db_list) {
94
94
  (void)db_list;
95
95
  return Status::NotSupported("SetDumpFilter is not supported");
96
96
  }
@@ -292,7 +292,7 @@ class StackableDB : public DB {
292
292
  using DB::NewMultiScan;
293
293
  std::unique_ptr<MultiScan> NewMultiScan(
294
294
  const ReadOptions& opts, ColumnFamilyHandle* column_family,
295
- const std::vector<ScanOptions>& scan_opts) override {
295
+ const MultiScanArgs& scan_opts) override {
296
296
  return db_->NewMultiScan(opts, column_family, scan_opts);
297
297
  }
298
298
 
@@ -653,7 +653,12 @@ class Transaction {
653
653
  // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
654
654
  // this transaction.
655
655
  // Has no effect on OptimisticTransactions.
656
- virtual void SetLockTimeout(int64_t timeout) = 0;
656
+ virtual void SetLockTimeout(int64_t timeout_ms) = 0;
657
+
658
+ // Change the value of deadlock_timeout (in milliseconds) for this
659
+ // transaction.
660
+ // Has no effect on OptimisticTransactions.
661
+ virtual void SetDeadlockTimeout(int64_t timeout_ms) = 0;
657
662
 
658
663
  // Return the WriteOptions that will be used during Commit()
659
664
  virtual WriteOptions* GetWriteOptions() = 0;
@@ -217,6 +217,11 @@ struct TransactionDBOptions {
217
217
  // Other value means the user provides a custom lock manager.
218
218
  std::shared_ptr<LockManagerHandle> lock_mgr_handle;
219
219
 
220
+ // EXPERIMENTAL
221
+ //
222
+ // Flag to enable/disable the per key point lock manager.
223
+ bool use_per_key_point_lock_mgr = false;
224
+
220
225
  // If true, the TransactionDB implementation might skip concurrency control
221
226
  // unless it is overridden by TransactionOptions or
222
227
  // TransactionDBWriteOptimizations. This can be used in conjunction with
@@ -319,6 +324,22 @@ struct TransactionOptions {
319
324
  // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
320
325
  int64_t lock_timeout = -1;
321
326
 
327
+ // Timeout in microseconds before performing deadlock detection.
328
+ // If 0, deadlock detection will be performed immediately.
329
+ //
330
+ // To optimize performance, this parameter could be tuned.
331
+ //
332
+ // When deadlock happens very frequently, deadlock timeout should be set to 0,
333
+ // so deadlock will be detected immediately.
334
+ //
335
+ // When deadlocks happen very rarely, this timeout could be tuned to be
336
+ // slightly longer than the typical transaction execution time, so that
337
+ // transaction will be woken up to take the lock before this timeout, which
338
+ // will allow the transaction to save the CPU time on deadlock detection.
339
+ //
340
+ // Deadlock timeout is always smaller than lock_timeout.
341
+ int64_t deadlock_timeout_us = 500;
342
+
322
343
  // Expiration duration in milliseconds. If non-negative, transactions that
323
344
  // last longer than this many milliseconds will fail to commit. If not set,
324
345
  // a forgotten transaction that is never committed, rolled back, or deleted