@nxtedition/rocksdb 8.2.0 → 8.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. package/binding.cc +3 -3
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +16 -52
  3. package/deps/rocksdb/rocksdb/Makefile +10 -5
  4. package/deps/rocksdb/rocksdb/TARGETS +8 -345
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +92 -0
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +32 -32
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +12 -9
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +6 -43
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +3 -13
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +8 -5
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +21 -47
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.h +3 -8
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +1 -2
  15. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +44 -7
  16. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +13 -14
  17. package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +1 -0
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +2 -2
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +2 -1
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +17 -8
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +40 -21
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +41 -42
  25. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +1 -1
  26. package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.cc +1 -1
  27. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +5 -4
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +2 -2
  29. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +5 -3
  30. package/deps/rocksdb/rocksdb/db/builder.cc +7 -6
  31. package/deps/rocksdb/rocksdb/db/builder.h +2 -2
  32. package/deps/rocksdb/rocksdb/db/c.cc +76 -5
  33. package/deps/rocksdb/rocksdb/db/c_test.c +141 -0
  34. package/deps/rocksdb/rocksdb/db/column_family.cc +32 -0
  35. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +3 -2
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +5 -0
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +8 -5
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +12 -10
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +21 -17
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -7
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +3 -1
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +1 -1
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +77 -50
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +4 -5
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +55 -8
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +142 -56
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +1 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +1 -2
  50. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +21 -20
  51. package/deps/rocksdb/rocksdb/db/convenience.cc +8 -6
  52. package/deps/rocksdb/rocksdb/db/corruption_test.cc +5 -4
  53. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +6 -3
  54. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +260 -220
  55. package/deps/rocksdb/rocksdb/db/db_clip_test.cc +142 -0
  56. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +1 -1
  57. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +333 -27
  58. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +5 -0
  59. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +7 -0
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +189 -27
  61. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +23 -10
  62. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +134 -90
  63. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +2 -2
  64. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +5 -3
  65. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +5 -1
  66. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +124 -16
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +10 -0
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +7 -0
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +15 -0
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +11 -5
  71. package/deps/rocksdb/rocksdb/db/db_iter.cc +7 -8
  72. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +54 -3
  73. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +42 -0
  74. package/deps/rocksdb/rocksdb/db/db_options_test.cc +116 -1
  75. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +3 -2
  76. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +3 -2
  77. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +9 -8
  78. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +142 -63
  79. package/deps/rocksdb/rocksdb/db/db_test.cc +28 -7
  80. package/deps/rocksdb/rocksdb/db/db_test2.cc +71 -131
  81. package/deps/rocksdb/rocksdb/db/db_test_util.cc +18 -0
  82. package/deps/rocksdb/rocksdb/db/db_test_util.h +6 -0
  83. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +10 -10
  84. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +25 -0
  85. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +88 -0
  86. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +67 -0
  87. package/deps/rocksdb/rocksdb/db/db_write_test.cc +5 -0
  88. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +4 -4
  89. package/deps/rocksdb/rocksdb/db/experimental.cc +4 -2
  90. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +86 -1
  91. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +15 -2
  92. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +1 -2
  93. package/deps/rocksdb/rocksdb/db/flush_job.cc +21 -14
  94. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +14 -7
  95. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +31 -8
  96. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +21 -19
  97. package/deps/rocksdb/rocksdb/db/internal_stats.cc +42 -12
  98. package/deps/rocksdb/rocksdb/db/internal_stats.h +1 -0
  99. package/deps/rocksdb/rocksdb/db/kv_checksum.h +92 -6
  100. package/deps/rocksdb/rocksdb/db/listener_test.cc +2 -2
  101. package/deps/rocksdb/rocksdb/db/log_format.h +8 -4
  102. package/deps/rocksdb/rocksdb/db/log_reader.cc +129 -51
  103. package/deps/rocksdb/rocksdb/db/log_reader.h +16 -0
  104. package/deps/rocksdb/rocksdb/db/log_test.cc +125 -4
  105. package/deps/rocksdb/rocksdb/db/log_writer.cc +32 -2
  106. package/deps/rocksdb/rocksdb/db/log_writer.h +16 -0
  107. package/deps/rocksdb/rocksdb/db/memtable.cc +17 -46
  108. package/deps/rocksdb/rocksdb/db/memtable.h +1 -1
  109. package/deps/rocksdb/rocksdb/db/memtable_list.cc +8 -4
  110. package/deps/rocksdb/rocksdb/db/merge_helper.cc +1 -1
  111. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +2 -1
  112. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +5 -4
  113. package/deps/rocksdb/rocksdb/db/repair.cc +38 -11
  114. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +3 -3
  115. package/deps/rocksdb/rocksdb/db/table_cache.cc +68 -51
  116. package/deps/rocksdb/rocksdb/db/table_cache.h +20 -10
  117. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -1
  118. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +6 -3
  119. package/deps/rocksdb/rocksdb/db/version_builder.cc +9 -5
  120. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  121. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +140 -120
  122. package/deps/rocksdb/rocksdb/db/version_edit.cc +14 -0
  123. package/deps/rocksdb/rocksdb/db/version_edit.h +12 -4
  124. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +21 -13
  125. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +26 -16
  126. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +9 -9
  127. package/deps/rocksdb/rocksdb/db/version_set.cc +292 -96
  128. package/deps/rocksdb/rocksdb/db/version_set.h +53 -28
  129. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -0
  130. package/deps/rocksdb/rocksdb/db/version_set_test.cc +62 -22
  131. package/deps/rocksdb/rocksdb/db/version_util.h +5 -4
  132. package/deps/rocksdb/rocksdb/db/write_batch.cc +3 -1
  133. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -0
  134. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +119 -27
  135. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +123 -0
  136. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  137. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +7 -2
  138. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +34 -0
  139. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +13 -0
  140. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +43 -33
  141. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +29 -17
  142. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +5 -0
  143. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +6 -1
  144. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +85 -50
  145. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +96 -54
  146. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.cc +122 -0
  147. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +206 -0
  148. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +9 -1
  149. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +9 -3
  150. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +322 -92
  151. package/deps/rocksdb/rocksdb/env/env_posix.cc +12 -8
  152. package/deps/rocksdb/rocksdb/env/env_test.cc +31 -0
  153. package/deps/rocksdb/rocksdb/env/mock_env.cc +1 -1
  154. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +14 -0
  155. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +1 -1
  156. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +5 -1
  157. package/deps/rocksdb/rocksdb/file/file_util.cc +3 -3
  158. package/deps/rocksdb/rocksdb/file/file_util.h +2 -0
  159. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +89 -0
  160. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +22 -7
  161. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +3 -2
  162. package/deps/rocksdb/rocksdb/file/readahead_raf.cc +1 -1
  163. package/deps/rocksdb/rocksdb/file/sequence_file_reader.cc +1 -1
  164. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +1 -1
  165. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +3 -0
  166. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +154 -74
  167. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +27 -7
  168. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +107 -28
  169. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +19 -0
  170. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +8 -0
  171. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +2 -0
  172. package/deps/rocksdb/rocksdb/include/rocksdb/memory_allocator.h +7 -1
  173. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +137 -152
  174. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +61 -26
  175. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +30 -26
  176. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +33 -16
  177. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +87 -8
  178. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +1 -1
  179. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +5 -0
  180. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
  181. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +1 -0
  182. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -0
  183. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +0 -1
  184. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  185. package/deps/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h +9 -2
  186. package/deps/rocksdb/rocksdb/logging/env_logger.h +2 -0
  187. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +78 -42
  188. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h +14 -9
  189. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +1 -0
  190. package/deps/rocksdb/rocksdb/memtable/skiplist_test.cc +1 -0
  191. package/deps/rocksdb/rocksdb/memtable/write_buffer_manager.cc +4 -9
  192. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +19 -11
  193. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +1 -1
  194. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +211 -555
  195. package/deps/rocksdb/rocksdb/monitoring/perf_step_timer.h +1 -1
  196. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +36 -2
  197. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +17 -7
  198. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.h +10 -7
  199. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +19 -18
  200. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.h +10 -2
  201. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +14 -0
  202. package/deps/rocksdb/rocksdb/options/cf_options.cc +35 -2
  203. package/deps/rocksdb/rocksdb/options/cf_options.h +5 -0
  204. package/deps/rocksdb/rocksdb/options/customizable_test.cc +1 -1
  205. package/deps/rocksdb/rocksdb/options/options.cc +12 -53
  206. package/deps/rocksdb/rocksdb/options/options_helper.cc +4 -0
  207. package/deps/rocksdb/rocksdb/options/options_parser.cc +11 -0
  208. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +32 -4
  209. package/deps/rocksdb/rocksdb/options/options_test.cc +89 -5
  210. package/deps/rocksdb/rocksdb/port/lang.h +27 -0
  211. package/deps/rocksdb/rocksdb/port/stack_trace.cc +67 -24
  212. package/deps/rocksdb/rocksdb/src.mk +2 -0
  213. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -3
  214. package/deps/rocksdb/rocksdb/table/block_based/block.cc +195 -35
  215. package/deps/rocksdb/rocksdb/table/block_based/block.h +197 -24
  216. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +71 -51
  217. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +7 -1
  218. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +4 -6
  219. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +3 -0
  220. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +43 -2
  221. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +36 -6
  222. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +266 -166
  223. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +44 -14
  224. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +1 -1
  225. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +63 -56
  226. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +8 -2
  227. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +4 -2
  228. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +10 -0
  229. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +14 -2
  230. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +918 -2
  231. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -2
  232. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +10 -9
  233. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +6 -8
  234. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.h +2 -2
  235. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +1 -1
  236. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +18 -23
  237. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +8 -8
  238. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +16 -32
  239. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +7 -8
  240. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +4 -5
  241. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +3 -3
  242. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +46 -53
  243. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +12 -12
  244. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +7 -9
  245. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +26 -23
  246. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h +2 -1
  247. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +3 -0
  248. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +4 -2
  249. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +3 -2
  250. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +7 -1
  251. package/deps/rocksdb/rocksdb/table/block_fetcher.h +1 -1
  252. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
  253. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +3 -2
  254. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +5 -2
  255. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h +4 -2
  256. package/deps/rocksdb/rocksdb/table/format.cc +4 -4
  257. package/deps/rocksdb/rocksdb/table/format.h +1 -1
  258. package/deps/rocksdb/rocksdb/table/get_context.cc +1 -1
  259. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +33 -22
  260. package/deps/rocksdb/rocksdb/table/meta_blocks.h +4 -0
  261. package/deps/rocksdb/rocksdb/table/mock_table.cc +4 -2
  262. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.h +1 -1
  263. package/deps/rocksdb/rocksdb/table/persistent_cache_options.h +1 -1
  264. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +18 -10
  265. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.h +4 -3
  266. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +10 -7
  267. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +4 -2
  268. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +11 -0
  269. package/deps/rocksdb/rocksdb/table/table_builder.h +14 -5
  270. package/deps/rocksdb/rocksdb/table/table_properties.cc +2 -0
  271. package/deps/rocksdb/rocksdb/table/table_reader.h +6 -3
  272. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +1 -1
  273. package/deps/rocksdb/rocksdb/table/table_test.cc +291 -34
  274. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +3 -1
  275. package/deps/rocksdb/rocksdb/test_util/testharness.h +5 -0
  276. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -2
  277. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +33 -17
  278. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +3 -1
  279. package/deps/rocksdb/rocksdb/util/bloom_impl.h +2 -2
  280. package/deps/rocksdb/rocksdb/util/compression.h +1 -1
  281. package/deps/rocksdb/rocksdb/util/crc32c.cc +24 -83
  282. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +7 -9
  283. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +4 -1
  284. package/deps/rocksdb/rocksdb/util/filter_bench.cc +1 -1
  285. package/deps/rocksdb/rocksdb/util/gflags_compat.h +9 -10
  286. package/deps/rocksdb/rocksdb/util/math.h +12 -7
  287. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +16 -18
  288. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +46 -2
  289. package/deps/rocksdb/rocksdb/util/ribbon_test.cc +6 -6
  290. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +12 -7
  291. package/deps/rocksdb/rocksdb/util/stop_watch.h +31 -13
  292. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +2 -0
  293. package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -1
  294. package/deps/rocksdb/rocksdb/util/udt_util.h +77 -0
  295. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.cc +2 -2
  296. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge_test.cc +1 -1
  297. package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.cc +1 -1
  298. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +1 -1
  299. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +1 -1
  300. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -1
  301. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +11 -1
  302. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +34 -1
  303. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +15 -0
  304. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +1 -1
  305. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +5 -1
  306. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +29 -1
  307. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +0 -1
  308. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -1
  309. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +6 -1
  310. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +10 -0
  311. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +6 -1
  312. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +5 -0
  313. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -0
  314. package/package.json +1 -1
  315. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  316. package/prebuilds/linux-x64/node.napi.node +0 -0
  317. /package/deps/rocksdb/rocksdb/memory/{memory_allocator.h → memory_allocator_impl.h} +0 -0
  318. /package/deps/rocksdb/rocksdb/monitoring/{statistics.h → statistics_impl.h} +0 -0
  319. /package/deps/rocksdb/rocksdb/table/block_based/{flush_block_policy.h → flush_block_policy_impl.h} +0 -0
  320. /package/deps/rocksdb/rocksdb/util/{rate_limiter.h → rate_limiter_impl.h} +0 -0
  321. /package/deps/rocksdb/rocksdb/utilities/agg_merge/{agg_merge.h → agg_merge_impl.h} +0 -0
@@ -55,6 +55,11 @@ struct JemallocAllocatorOptions {
55
55
  // Upper bound of allocation size to use tcache, if limit_tcache_size=true.
56
56
  // When used with block cache, it is recommended to set it to block_size.
57
57
  size_t tcache_size_upper_bound = 16 * 1024;
58
+
59
+ // Number of arenas across which we spread allocation requests. Increasing
60
+ // this setting can mitigate arena mutex contention. The value must be
61
+ // positive.
62
+ size_t num_arenas = 1;
58
63
  };
59
64
 
60
65
  // Generate memory allocator which allocates through Jemalloc and utilize
@@ -70,7 +75,8 @@ struct JemallocAllocatorOptions {
70
75
  // core dump. Side benefit of using single arena would be reduction of jemalloc
71
76
  // metadata for some workloads.
72
77
  //
73
- // To mitigate mutex contention for using one single arena, jemalloc tcache
78
+ // To mitigate mutex contention for using one single arena (see also
79
+ // `JemallocAllocatorOptions::num_arenas` above), jemalloc tcache
74
80
  // (thread-local cache) is enabled to cache unused allocations for future use.
75
81
  // The tcache normally incurs 0.5M extra memory usage per-thread. The usage
76
82
  // can be reduced by limiting allocation sizes to cache.
@@ -1157,7 +1157,7 @@ struct DBOptions {
1157
1157
 
1158
1158
  // A global cache for table-level rows.
1159
1159
  // Default: nullptr (disabled)
1160
- std::shared_ptr<Cache> row_cache = nullptr;
1160
+ std::shared_ptr<GeneralCache> row_cache = nullptr;
1161
1161
 
1162
1162
  // A filter object supplied to be invoked while processing write-ahead-logs
1163
1163
  // (WALs) during recovery. The filter provides a way to inspect log
@@ -1457,12 +1457,119 @@ enum ReadTier {
1457
1457
 
1458
1458
  // Options that control read operations
1459
1459
  struct ReadOptions {
1460
+ // *** BEGIN options relevant to point lookups as well as scans ***
1461
+
1460
1462
  // If "snapshot" is non-nullptr, read as of the supplied snapshot
1461
1463
  // (which must belong to the DB that is being read and which must
1462
1464
  // not have been released). If "snapshot" is nullptr, use an implicit
1463
1465
  // snapshot of the state at the beginning of this read operation.
1464
- // Default: nullptr
1465
- const Snapshot* snapshot;
1466
+ const Snapshot* snapshot = nullptr;
1467
+
1468
+ // Timestamp of operation. Read should return the latest data visible to the
1469
+ // specified timestamp. All timestamps of the same database must be of the
1470
+ // same length and format. The user is responsible for providing a customized
1471
+ // compare function via Comparator to order <key, timestamp> tuples.
1472
+ // For iterator, iter_start_ts is the lower bound (older) and timestamp
1473
+ // serves as the upper bound. Versions of the same record that fall in
1474
+ // the timestamp range will be returned. If iter_start_ts is nullptr,
1475
+ // only the most recent version visible to timestamp is returned.
1476
+ // The user-specified timestamp feature is still under active development,
1477
+ // and the API is subject to change.
1478
+ const Slice* timestamp = nullptr;
1479
+ const Slice* iter_start_ts = nullptr;
1480
+
1481
+ // Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
1482
+ // in microseconds.
1483
+ // It should be set to microseconds since epoch, i.e., gettimeofday or
1484
+ // equivalent plus allowed duration in microseconds. The best way is to use
1485
+ // env->NowMicros() + some timeout.
1486
+ // This is best effort. The call may exceed the deadline if there is IO
1487
+ // involved and the file system doesn't support deadlines, or due to
1488
+ // checking for deadline periodically rather than for every key if
1489
+ // processing a batch.
1490
+ std::chrono::microseconds deadline = std::chrono::microseconds::zero();
1491
+
1492
+ // A timeout in microseconds to be passed to the underlying FileSystem for
1493
+ // reads. As opposed to deadline, this determines the timeout for each
1494
+ // individual file read request. If a MultiGet/Get/Seek/Next etc call
1495
+ // results in multiple reads, each read can last up to io_timeout us.
1496
+ std::chrono::microseconds io_timeout = std::chrono::microseconds::zero();
1497
+
1498
+ // Specify if this read request should process data that ALREADY
1499
+ // resides on a particular cache. If the required data is not
1500
+ // found at the specified cache, then Status::Incomplete is returned.
1501
+ ReadTier read_tier = kReadAllTier;
1502
+
1503
+ // For file reads associated with this option, charge the internal rate
1504
+ // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
1505
+ // special value `Env::IO_TOTAL` disables charging the rate limiter.
1506
+ //
1507
+ // The rate limiting is bypassed regardless of this option's value for file reads
1508
+ // on plain tables (these can exist when `ColumnFamilyOptions::table_factory`
1509
+ // is a `PlainTableFactory`) and cuckoo tables (these can exist when
1510
+ // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`).
1511
+ //
1512
+ // The bytes charged to rate limiter may not exactly match the file read bytes
1513
+ // since there are some seemingly insignificant reads, like for file
1514
+ // headers/footers, that we currently do not charge to rate limiter.
1515
+ Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
1516
+
1517
+ // It limits the maximum cumulative value size of the keys in batch while
1518
+ // reading through MultiGet. Once the cumulative value size exceeds this
1519
+ // soft limit then all the remaining keys are returned with status Aborted.
1520
+ uint64_t value_size_soft_limit = std::numeric_limits<uint64_t>::max();
1521
+
1522
+ // If true, all data read from underlying storage will be
1523
+ // verified against corresponding checksums.
1524
+ bool verify_checksums = true;
1525
+
1526
+ // Should the "data block"/"index block" read for this iteration be placed in
1527
+ // block cache?
1528
+ // Callers may wish to set this field to false for bulk scans.
1529
+ // This would help not to change the eviction order of existing items in the
1530
+ // block cache.
1531
+ bool fill_cache = true;
1532
+
1533
+ // If true, range tombstones handling will be skipped in key lookup paths.
1534
+ // For DB instances that don't use DeleteRange() calls, this setting can
1535
+ // be used to optimize the read performance.
1536
+ // Note that, if this assumption (of no previous DeleteRange() calls) is
1537
+ // broken, stale keys could be served in read paths.
1538
+ bool ignore_range_deletions = false;
1539
+
1540
+ // Experimental
1541
+ //
1542
+ // If async_io is enabled, RocksDB will prefetch some data asynchronously.
1543
+ // RocksDB applies it if reads are sequential and its internal automatic
1544
+ // prefetching.
1545
+ bool async_io = false;
1546
+
1547
+ // Experimental
1548
+ //
1549
+ // If async_io is set, then this flag controls whether we read SST files
1550
+ // in multiple levels asynchronously. Enabling this flag can help reduce
1551
+ // MultiGet latency by maximizing the number of SST files read in
1552
+ // parallel if the keys in the MultiGet batch are in different levels. It
1553
+ // comes at the expense of slightly higher CPU overhead.
1554
+ bool optimize_multiget_for_io = true;
1555
+
1556
+ // *** END options relevant to point lookups (as well as scans) ***
1557
+ // *** BEGIN options only relevant to iterators or scans ***
1558
+
1559
+ // RocksDB does auto-readahead for iterators on noticing more than two reads
1560
+ // for a table file. The readahead starts at 8KB and doubles on every
1561
+ // additional read up to 256KB.
1562
+ // This option can help if most of the range scans are large, and if it is
1563
+ // determined that a larger readahead than that enabled by auto-readahead is
1564
+ // needed.
1565
+ // Using a large readahead size (> 2MB) can typically improve the performance
1566
+ // of forward iteration on spinning disks.
1567
+ size_t readahead_size = 0;
1568
+
1569
+ // A threshold for the number of keys that can be skipped before failing an
1570
+ // iterator seek as incomplete. The default value of 0 should be used to
1571
+ // never fail a request as incomplete, even on skipping too many keys.
1572
+ uint64_t max_skippable_internal_keys = 0;
1466
1573
 
1467
1574
  // `iterate_lower_bound` defines the smallest key at which the backward
1468
1575
  // iterator can return an entry. Once the bound is passed, Valid() will be
@@ -1475,8 +1582,7 @@ struct ReadOptions {
1475
1582
  //
1476
1583
  // In case of user_defined timestamp, if enabled, iterate_lower_bound should
1477
1584
  // point to key without timestamp part.
1478
- // Default: nullptr
1479
- const Slice* iterate_lower_bound;
1585
+ const Slice* iterate_lower_bound = nullptr;
1480
1586
 
1481
1587
  // "iterate_upper_bound" defines the extent up to which the forward iterator
1482
1588
  // can return entries. Once the bound is reached, Valid() will be false.
@@ -1496,63 +1602,24 @@ struct ReadOptions {
1496
1602
  //
1497
1603
  // In case of user_defined timestamp, if enabled, iterate_upper_bound should
1498
1604
  // point to key without timestamp part.
1499
- // Default: nullptr
1500
- const Slice* iterate_upper_bound;
1501
-
1502
- // RocksDB does auto-readahead for iterators on noticing more than two reads
1503
- // for a table file. The readahead starts at 8KB and doubles on every
1504
- // additional read up to 256KB.
1505
- // This option can help if most of the range scans are large, and if it is
1506
- // determined that a larger readahead than that enabled by auto-readahead is
1507
- // needed.
1508
- // Using a large readahead size (> 2MB) can typically improve the performance
1509
- // of forward iteration on spinning disks.
1510
- // Default: 0
1511
- size_t readahead_size;
1512
-
1513
- // A threshold for the number of keys that can be skipped before failing an
1514
- // iterator seek as incomplete. The default value of 0 should be used to
1515
- // never fail a request as incomplete, even on skipping too many keys.
1516
- // Default: 0
1517
- uint64_t max_skippable_internal_keys;
1518
-
1519
- // Specify if this read request should process data that ALREADY
1520
- // resides on a particular cache. If the required data is not
1521
- // found at the specified cache, then Status::Incomplete is returned.
1522
- // Default: kReadAllTier
1523
- ReadTier read_tier;
1524
-
1525
- // If true, all data read from underlying storage will be
1526
- // verified against corresponding checksums.
1527
- // Default: true
1528
- bool verify_checksums;
1529
-
1530
- // Should the "data block"/"index block" read for this iteration be placed in
1531
- // block cache?
1532
- // Callers may wish to set this field to false for bulk scans.
1533
- // This would help not to the change eviction order of existing items in the
1534
- // block cache.
1535
- // Default: true
1536
- bool fill_cache;
1605
+ const Slice* iterate_upper_bound = nullptr;
1537
1606
 
1538
1607
  // Specify to create a tailing iterator -- a special iterator that has a
1539
1608
  // view of the complete database (i.e. it can also be used to read newly
1540
1609
  // added data) and is optimized for sequential reads. It will return records
1541
1610
  // that were inserted into the database after the creation of the iterator.
1542
- // Default: false
1543
- bool tailing;
1611
+ bool tailing = false;
1544
1612
 
1545
1613
  // This option is not used anymore. It was to turn on a functionality that
1546
- // has been removed.
1547
- bool managed;
1614
+ // has been removed. DEPRECATED
1615
+ bool managed = false;
1548
1616
 
1549
1617
  // Enable a total order seek regardless of index format (e.g. hash index)
1550
1618
  // used in the table. Some table format (e.g. plain table) may not support
1551
1619
  // this option.
1552
1620
  // If true when calling Get(), we also skip prefix bloom when reading from
1553
1621
  // block based table, which only affects Get() performance.
1554
- // Default: false
1555
- bool total_order_seek;
1622
+ bool total_order_seek = false;
1556
1623
 
1557
1624
  // When true, by default use total_order_seek = true, and RocksDB can
1558
1625
  // selectively enable prefix seek mode if won't generate a different result
@@ -1568,38 +1635,37 @@ struct ReadOptions {
1568
1635
  // iterators. (We are also assuming the new condition on
1569
1636
  // IsSameLengthImmediateSuccessor is satisfied; see its BUG section).
1570
1637
  // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG".
1571
- // Default: false
1572
- bool auto_prefix_mode;
1638
+ bool auto_prefix_mode = false;
1573
1639
 
1574
1640
  // Enforce that the iterator only iterates over the same prefix as the seek.
1575
1641
  // This option is effective only for prefix seeks, i.e. prefix_extractor is
1576
1642
  // non-null for the column family and total_order_seek is false. Unlike
1577
1643
  // iterate_upper_bound, prefix_same_as_start only works within a prefix
1578
1644
  // but in both directions.
1579
- // Default: false
1580
- bool prefix_same_as_start;
1645
+ bool prefix_same_as_start = false;
1581
1646
 
1582
1647
  // Keep the blocks loaded by the iterator pinned in memory as long as the
1583
1648
  // iterator is not deleted, If used when reading from tables created with
1584
1649
  // BlockBasedTableOptions::use_delta_encoding = false,
1585
1650
  // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
1586
1651
  // return 1.
1587
- // Default: false
1588
- bool pin_data;
1652
+ bool pin_data = false;
1653
+
1654
+ // For iterators, RocksDB does auto-readahead on noticing more than two
1655
+ // sequential reads for a table file if user doesn't provide readahead_size.
1656
+ // The readahead starts at 8KB and doubles on every additional read up to
1657
+ // max_auto_readahead_size only when reads are sequential. However at each
1658
+ // level, if iterator moves over next file, readahead_size starts again from
1659
+ // 8KB.
1660
+ //
1661
+ // By enabling this option, RocksDB will do some enhancements for
1662
+ // prefetching the data.
1663
+ bool adaptive_readahead = false;
1589
1664
 
1590
1665
  // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
1591
1666
  // schedule a background job in the flush job queue and delete obsolete files
1592
1667
  // in background.
1593
- // Default: false
1594
- bool background_purge_on_iterator_cleanup;
1595
-
1596
- // If true, range tombstones handling will be skipped in key lookup paths.
1597
- // For DB instances that don't use DeleteRange() calls, this setting can
1598
- // be used to optimize the read performance.
1599
- // Note that, if this assumption (of no previous DeleteRange() calls) is
1600
- // broken, stale keys could be served in read paths.
1601
- // Default: false
1602
- bool ignore_range_deletions;
1668
+ bool background_purge_on_iterator_cleanup = false;
1603
1669
 
1604
1670
  // A callback to determine whether relevant keys for this scan exist in a
1605
1671
  // given table based on the table's properties. The callback is passed the
@@ -1609,95 +1675,14 @@ struct ReadOptions {
1609
1675
  // Default: empty (every table will be scanned)
1610
1676
  std::function<bool(const TableProperties&)> table_filter;
1611
1677
 
1612
- // Timestamp of operation. Read should return the latest data visible to the
1613
- // specified timestamp. All timestamps of the same database must be of the
1614
- // same length and format. The user is responsible for providing a customized
1615
- // compare function via Comparator to order <key, timestamp> tuples.
1616
- // For iterator, iter_start_ts is the lower bound (older) and timestamp
1617
- // serves as the upper bound. Versions of the same record that fall in
1618
- // the timestamp range will be returned. If iter_start_ts is nullptr,
1619
- // only the most recent version visible to timestamp is returned.
1620
- // The user-specified timestamp feature is still under active development,
1621
- // and the API is subject to change.
1622
- // Default: nullptr
1623
- const Slice* timestamp;
1624
- const Slice* iter_start_ts;
1678
+ // *** END options only relevant to iterators or scans ***
1625
1679
 
1626
- // Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
1627
- // in microseconds.
1628
- // It should be set to microseconds since epoch, i.e, gettimeofday or
1629
- // equivalent plus allowed duration in microseconds. The best way is to use
1630
- // env->NowMicros() + some timeout.
1631
- // This is best efforts. The call may exceed the deadline if there is IO
1632
- // involved and the file system doesn't support deadlines, or due to
1633
- // checking for deadline periodically rather than for every key if
1634
- // processing a batch
1635
- std::chrono::microseconds deadline;
1636
-
1637
- // A timeout in microseconds to be passed to the underlying FileSystem for
1638
- // reads. As opposed to deadline, this determines the timeout for each
1639
- // individual file read request. If a MultiGet/Get/Seek/Next etc call
1640
- // results in multiple reads, each read can last up to io_timeout us.
1641
- std::chrono::microseconds io_timeout;
1642
-
1643
- // It limits the maximum cumulative value size of the keys in batch while
1644
- // reading through MultiGet. Once the cumulative value size exceeds this
1645
- // soft limit then all the remaining keys are returned with status Aborted.
1646
- //
1647
- // Default: std::numeric_limits<uint64_t>::max()
1648
- uint64_t value_size_soft_limit;
1649
-
1650
- // For iterators, RocksDB does auto-readahead on noticing more than two
1651
- // sequential reads for a table file if user doesn't provide readahead_size.
1652
- // The readahead starts at 8KB and doubles on every additional read upto
1653
- // max_auto_readahead_size only when reads are sequential. However at each
1654
- // level, if iterator moves over next file, readahead_size starts again from
1655
- // 8KB.
1656
- //
1657
- // By enabling this option, RocksDB will do some enhancements for
1658
- // prefetching the data.
1659
- //
1660
- // Default: false
1661
- bool adaptive_readahead;
1662
-
1663
- // For file reads associated with this option, charge the internal rate
1664
- // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
1665
- // special value `Env::IO_TOTAL` disables charging the rate limiter.
1666
- //
1667
- // The rate limiting is bypassed no matter this option's value for file reads
1668
- // on plain tables (these can exist when `ColumnFamilyOptions::table_factory`
1669
- // is a `PlainTableFactory`) and cuckoo tables (these can exist when
1670
- // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`).
1671
- //
1672
- // The bytes charged to rate limiter may not exactly match the file read bytes
1673
- // since there are some seemingly insignificant reads, like for file
1674
- // headers/footers, that we currently do not charge to rate limiter.
1675
- //
1676
- // Default: `Env::IO_TOTAL`.
1677
- Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
1678
-
1679
- // Experimental
1680
- //
1681
- // If async_io is enabled, RocksDB will prefetch some of data asynchronously.
1682
- // RocksDB apply it if reads are sequential and its internal automatic
1683
- // prefetching.
1684
- //
1685
- // Default: false
1686
- bool async_io;
1687
-
1688
- // Experimental
1689
- //
1690
- // If async_io is set, then this flag controls whether we read SST files
1691
- // in multiple levels asynchronously. Enabling this flag can help reduce
1692
- // MultiGet latency by maximizing the number of SST files read in
1693
- // parallel if the keys in the MultiGet batch are in different levels. It
1694
- // comes at the expense of slightly higher CPU overhead.
1695
- //
1696
- // Default: true
1697
- bool optimize_multiget_for_io;
1680
+ // ** For RocksDB internal use only **
1681
+ Env::IOActivity io_activity = Env::IOActivity::kUnknown;
1698
1682
 
1699
- ReadOptions();
1700
- ReadOptions(bool cksum, bool cache);
1683
+ ReadOptions() {}
1684
+ ReadOptions(bool _verify_checksums, bool _fill_cache);
1685
+ explicit ReadOptions(Env::IOActivity _io_activity);
1701
1686
  };
1702
1687
 
1703
1688
  // Options that control write operations
@@ -14,13 +14,19 @@
14
14
 
15
15
  namespace ROCKSDB_NAMESPACE {
16
16
 
17
- // A thread local context for gathering performance counter efficiently
18
- // and transparently.
19
- // Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
17
+ /*
18
+ * NOTE:
19
+ * Please do not reorder the fields in this structure. If you plan to do that or
20
+ * add/remove fields to this structure, builds would fail. The way to fix the
21
+ * builds would be to add the appropriate fields to the
22
+ * DEF_PERF_CONTEXT_LEVEL_METRICS() macro in the perf_context.cc file.
23
+ */
20
24
 
21
25
  // Break down performance counters by level and store per-level perf context in
22
26
  // PerfContextByLevel
23
- struct PerfContextByLevel {
27
+ struct PerfContextByLevelBase {
28
+ // These Bloom stats apply to point reads (Get/MultiGet) for whole key and
29
+ // prefix filters.
24
30
  // # of times bloom filter has avoided file reads, i.e., negatives.
25
31
  uint64_t bloom_filter_useful = 0;
26
32
  // # of times bloom FullFilter has not avoided the reads.
@@ -38,37 +44,34 @@ struct PerfContextByLevel {
38
44
 
39
45
  uint64_t block_cache_hit_count = 0; // total number of block cache hits
40
46
  uint64_t block_cache_miss_count = 0; // total number of block cache misses
41
-
42
- void Reset(); // reset all performance counters to zero
43
47
  };
44
48
 
45
- struct PerfContext {
46
- ~PerfContext();
47
-
48
- PerfContext() {}
49
-
50
- PerfContext(const PerfContext&);
51
- PerfContext& operator=(const PerfContext&);
52
- PerfContext(PerfContext&&) noexcept;
49
+ // A thread local context for gathering performance counter efficiently
50
+ // and transparently.
51
+ // Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
53
52
 
53
+ // Break down performance counters by level and store per-level perf context in
54
+ // PerfContextByLevel
55
+ struct PerfContextByLevel : public PerfContextByLevelBase {
54
56
  void Reset(); // reset all performance counters to zero
57
+ };
55
58
 
56
- std::string ToString(bool exclude_zero_counters = false) const;
57
-
58
- // enable per level perf context and allocate storage for PerfContextByLevel
59
- void EnablePerLevelPerfContext();
60
-
61
- // temporarily disable per level perf context by setting the flag to false
62
- void DisablePerLevelPerfContext();
63
-
64
- // free the space for PerfContextByLevel, also disable per level perf context
65
- void ClearPerLevelPerfContext();
59
+ /*
60
+ * NOTE:
61
+ * Please do not reorder the fields in this structure. If you plan to do that or
62
+ * add/remove fields to this structure, builds would fail. The way to fix the
63
+ * builds would be to add the appropriate fields to the
64
+ * DEF_PERF_CONTEXT_METRICS() macro in the perf_context.cc file.
65
+ */
66
66
 
67
+ struct PerfContextBase {
67
68
  uint64_t user_key_comparison_count; // total number of user key comparisons
68
69
  uint64_t block_cache_hit_count; // total number of block cache hits
69
70
  uint64_t block_read_count; // total number of block reads (with IO)
70
71
  uint64_t block_read_byte; // total number of bytes from block reads
71
72
  uint64_t block_read_time; // total nanos spent on block reads
73
+ // total cpu time in nanos spent on block reads
74
+ uint64_t block_read_cpu_time;
72
75
  uint64_t block_cache_index_hit_count; // total number of index block hits
73
76
  // total number of standalone handles lookup from secondary cache
74
77
  uint64_t block_cache_standalone_handle_count;
@@ -216,9 +219,9 @@ struct PerfContext {
216
219
  uint64_t bloom_memtable_hit_count;
217
220
  // total number of mem table bloom misses
218
221
  uint64_t bloom_memtable_miss_count;
219
- // total number of SST table bloom hits
222
+ // total number of SST bloom hits
220
223
  uint64_t bloom_sst_hit_count;
221
- // total number of SST table bloom misses
224
+ // total number of SST bloom misses
222
225
  uint64_t bloom_sst_miss_count;
223
226
 
224
227
  // Time spent waiting on key locks in transaction lock manager.
@@ -254,15 +257,47 @@ struct PerfContext {
254
257
  uint64_t iter_prev_cpu_nanos;
255
258
  uint64_t iter_seek_cpu_nanos;
256
259
 
260
+ // EXPERIMENTAL
261
+ // Total number of db iterator's Next(), Prev(), Seek-related APIs being
262
+ // called
263
+ uint64_t iter_next_count;
264
+ uint64_t iter_prev_count;
265
+ uint64_t iter_seek_count;
266
+
257
267
  // Time spent in encrypting data. Populated when EncryptedEnv is used.
258
268
  uint64_t encrypt_data_nanos;
259
269
  // Time spent in decrypting data. Populated when EncryptedEnv is used.
260
270
  uint64_t decrypt_data_nanos;
261
271
 
262
272
  uint64_t number_async_seek;
273
+ };
274
+
275
+ struct PerfContext : public PerfContextBase {
276
+ ~PerfContext();
277
+
278
+ PerfContext() {}
279
+
280
+ PerfContext(const PerfContext&);
281
+ PerfContext& operator=(const PerfContext&);
282
+ PerfContext(PerfContext&&) noexcept;
283
+
284
+ void Reset(); // reset all performance counters to zero
285
+
286
+ std::string ToString(bool exclude_zero_counters = false) const;
287
+
288
+ // enable per level perf context and allocate storage for PerfContextByLevel
289
+ void EnablePerLevelPerfContext();
290
+
291
+ // temporarily disable per level perf context by setting the flag to false
292
+ void DisablePerLevelPerfContext();
293
+
294
+ // free the space for PerfContextByLevel, also disable per level perf context
295
+ void ClearPerLevelPerfContext();
263
296
 
264
297
  std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
265
298
  bool per_level_perf_context_enabled = false;
299
+
300
+ void copyMetrics(const PerfContext* other) noexcept;
266
301
  };
267
302
 
268
303
  // If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global,
@@ -17,11 +17,23 @@
17
17
 
18
18
  namespace ROCKSDB_NAMESPACE {
19
19
 
20
- // A handle for lookup result. The handle may not be immediately ready or
21
- // have a valid value. The caller must call isReady() to determine if its
22
- // ready, and call Wait() in order to block until it becomes ready.
23
- // The caller must call Value() after it becomes ready to determine if the
24
- // handle successfullly read the item.
20
+ // A handle for lookup result. Immediately after SecondaryCache::Lookup() with
21
+ // wait=false (and depending on the implementation), the handle could be in any
22
+ // of the below states. It must not be destroyed while in the pending state.
23
+ // * Pending state (IsReady() == false): result is not ready. Value() and Size()
24
+ // must not be called.
25
+ // * Ready + not found state (IsReady() == true, Value() == nullptr): the lookup
26
+ // has completed, finding no match. Or an error occurred that prevented
27
+ // normal completion of the Lookup.
28
+ // * Ready + found state (IsReady() == true, Value() != nullptr): the lookup
29
+ // has completed, finding an entry that has been loaded into an object that is
30
+ // now owned by the caller.
31
+ //
32
+ // Wait() or SecondaryCache::WaitAll() may be skipped if IsReady() happens to
33
+ // return true, but (depending on the implementation) IsReady() might never
34
+ // return true without Wait() or SecondaryCache::WaitAll(). After the handle
35
+ // is known ready, calling Value() is required to avoid a memory leak in case
36
+ // of a cache hit.
25
37
  class SecondaryCacheResultHandle {
26
38
  public:
27
39
  virtual ~SecondaryCacheResultHandle() = default;
@@ -36,7 +48,9 @@ class SecondaryCacheResultHandle {
36
48
  // the lookup was unsuccessful.
37
49
  virtual Cache::ObjectPtr Value() = 0;
38
50
 
39
- // Return the size of value
51
+ // Return the out_charge from the helper->create_cb used to construct the
52
+ // object.
53
+ // WART: potentially confusing name
40
54
  virtual size_t Size() = 0;
41
55
  };
42
56
 
@@ -57,24 +71,13 @@ class SecondaryCache : public Customizable {
57
71
  const std::string& id,
58
72
  std::shared_ptr<SecondaryCache>* result);
59
73
 
60
- // Insert the given value into this cache. Ownership of `value` is
61
- // transferred to the callee, who is reponsible for deleting the value
62
- // with helper->del_cb if del_cb is not nullptr. Unlike Cache::Insert(),
63
- // the callee is responsible for such cleanup even in case of non-OK
64
- // Status.
65
- // Typically, the value is not saved directly but the implementation
66
- // uses the SaveToCallback provided by helper to extract value's
67
- // persistable data (typically uncompressed block), which will be written
68
- // to this tier. The implementation may or may not write it to cache
69
- // depending on the admission control policy, even if the return status
70
- // is success (OK).
71
- //
72
- // If the implementation is asynchronous or otherwise uses `value` after
73
- // the call returns, then InsertSaved() must be overridden not to rely on
74
- // Insert(). For example, there could be a "holding area" in memory where
75
- // Lookup() might return the same parsed value back. But more typically, if
76
- // the implementation only uses `value` for getting persistable data during
77
- // the call, then the default implementation of `InsertSaved()` suffices.
74
+ // Suggest inserting an entry into this cache. The caller retains ownership
75
+ // of `obj` (also called the "value"), so is only used directly by the
76
+ // SecondaryCache during Insert(). When the cache chooses to perform the
77
+ // suggested insertion, it uses the size_cb and saveto_cb provided by
78
+ // `helper` to extract the persistable data (typically an uncompressed block)
79
+ // and writes it to this cache tier. OK may be returned even if the insertion
80
+ // is not made.
78
81
  virtual Status Insert(const Slice& key, Cache::ObjectPtr obj,
79
82
  const Cache::CacheItemHelper* helper) = 0;
80
83
 
@@ -84,8 +87,9 @@ class SecondaryCache : public Customizable {
84
87
  // may or may not write it to cache depending on the admission control
85
88
  // policy, even if the return status is success.
86
89
  //
87
- // The default implementation assumes synchronous, non-escaping Insert(),
88
- // wherein `value` is not used after return of Insert(). See Insert().
90
+ // The default implementation only assumes the entry helper's create_cb is
91
+ // called at Lookup() time and not Insert() time, so should work for all
92
+ // foreseeable implementations.
89
93
  virtual Status InsertSaved(const Slice& key, const Slice& saved);
90
94
 
91
95
  // Lookup the data for the given key in this cache. The create_cb