@nxtedition/rocksdb 8.2.0 → 8.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321):
  1. package/binding.cc +3 -3
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +16 -52
  3. package/deps/rocksdb/rocksdb/Makefile +10 -5
  4. package/deps/rocksdb/rocksdb/TARGETS +8 -345
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +92 -0
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +32 -32
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +12 -9
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +6 -43
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +3 -13
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +8 -5
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +21 -47
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.h +3 -8
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +1 -2
  15. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +44 -7
  16. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +13 -14
  17. package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +1 -0
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +2 -2
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +2 -1
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +17 -8
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +40 -21
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +41 -42
  25. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +1 -1
  26. package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.cc +1 -1
  27. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +5 -4
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +2 -2
  29. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +5 -3
  30. package/deps/rocksdb/rocksdb/db/builder.cc +7 -6
  31. package/deps/rocksdb/rocksdb/db/builder.h +2 -2
  32. package/deps/rocksdb/rocksdb/db/c.cc +76 -5
  33. package/deps/rocksdb/rocksdb/db/c_test.c +141 -0
  34. package/deps/rocksdb/rocksdb/db/column_family.cc +32 -0
  35. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +3 -2
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +5 -0
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +8 -5
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +12 -10
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +21 -17
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -7
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +3 -1
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +1 -1
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +77 -50
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +4 -5
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +55 -8
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +142 -56
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +1 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +1 -2
  50. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +21 -20
  51. package/deps/rocksdb/rocksdb/db/convenience.cc +8 -6
  52. package/deps/rocksdb/rocksdb/db/corruption_test.cc +5 -4
  53. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +6 -3
  54. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +260 -220
  55. package/deps/rocksdb/rocksdb/db/db_clip_test.cc +142 -0
  56. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +1 -1
  57. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +333 -27
  58. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +5 -0
  59. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +7 -0
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +189 -27
  61. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +23 -10
  62. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +134 -90
  63. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +2 -2
  64. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +5 -3
  65. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +5 -1
  66. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +124 -16
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +10 -0
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +7 -0
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +15 -0
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +11 -5
  71. package/deps/rocksdb/rocksdb/db/db_iter.cc +7 -8
  72. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +54 -3
  73. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +42 -0
  74. package/deps/rocksdb/rocksdb/db/db_options_test.cc +116 -1
  75. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +3 -2
  76. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +3 -2
  77. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +9 -8
  78. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +142 -63
  79. package/deps/rocksdb/rocksdb/db/db_test.cc +28 -7
  80. package/deps/rocksdb/rocksdb/db/db_test2.cc +71 -131
  81. package/deps/rocksdb/rocksdb/db/db_test_util.cc +18 -0
  82. package/deps/rocksdb/rocksdb/db/db_test_util.h +6 -0
  83. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +10 -10
  84. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +25 -0
  85. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +88 -0
  86. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +67 -0
  87. package/deps/rocksdb/rocksdb/db/db_write_test.cc +5 -0
  88. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +4 -4
  89. package/deps/rocksdb/rocksdb/db/experimental.cc +4 -2
  90. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +86 -1
  91. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +15 -2
  92. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +1 -2
  93. package/deps/rocksdb/rocksdb/db/flush_job.cc +21 -14
  94. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +14 -7
  95. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +31 -8
  96. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +21 -19
  97. package/deps/rocksdb/rocksdb/db/internal_stats.cc +42 -12
  98. package/deps/rocksdb/rocksdb/db/internal_stats.h +1 -0
  99. package/deps/rocksdb/rocksdb/db/kv_checksum.h +92 -6
  100. package/deps/rocksdb/rocksdb/db/listener_test.cc +2 -2
  101. package/deps/rocksdb/rocksdb/db/log_format.h +8 -4
  102. package/deps/rocksdb/rocksdb/db/log_reader.cc +129 -51
  103. package/deps/rocksdb/rocksdb/db/log_reader.h +16 -0
  104. package/deps/rocksdb/rocksdb/db/log_test.cc +125 -4
  105. package/deps/rocksdb/rocksdb/db/log_writer.cc +32 -2
  106. package/deps/rocksdb/rocksdb/db/log_writer.h +16 -0
  107. package/deps/rocksdb/rocksdb/db/memtable.cc +17 -46
  108. package/deps/rocksdb/rocksdb/db/memtable.h +1 -1
  109. package/deps/rocksdb/rocksdb/db/memtable_list.cc +8 -4
  110. package/deps/rocksdb/rocksdb/db/merge_helper.cc +1 -1
  111. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +2 -1
  112. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +5 -4
  113. package/deps/rocksdb/rocksdb/db/repair.cc +38 -11
  114. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +3 -3
  115. package/deps/rocksdb/rocksdb/db/table_cache.cc +68 -51
  116. package/deps/rocksdb/rocksdb/db/table_cache.h +20 -10
  117. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -1
  118. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +6 -3
  119. package/deps/rocksdb/rocksdb/db/version_builder.cc +9 -5
  120. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  121. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +140 -120
  122. package/deps/rocksdb/rocksdb/db/version_edit.cc +14 -0
  123. package/deps/rocksdb/rocksdb/db/version_edit.h +12 -4
  124. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +21 -13
  125. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +26 -16
  126. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +9 -9
  127. package/deps/rocksdb/rocksdb/db/version_set.cc +292 -96
  128. package/deps/rocksdb/rocksdb/db/version_set.h +53 -28
  129. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -0
  130. package/deps/rocksdb/rocksdb/db/version_set_test.cc +62 -22
  131. package/deps/rocksdb/rocksdb/db/version_util.h +5 -4
  132. package/deps/rocksdb/rocksdb/db/write_batch.cc +3 -1
  133. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -0
  134. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +119 -27
  135. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +123 -0
  136. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  137. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +7 -2
  138. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +34 -0
  139. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +13 -0
  140. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +43 -33
  141. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +29 -17
  142. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +5 -0
  143. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +6 -1
  144. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +85 -50
  145. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +96 -54
  146. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.cc +122 -0
  147. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +206 -0
  148. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +9 -1
  149. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +9 -3
  150. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +322 -92
  151. package/deps/rocksdb/rocksdb/env/env_posix.cc +12 -8
  152. package/deps/rocksdb/rocksdb/env/env_test.cc +31 -0
  153. package/deps/rocksdb/rocksdb/env/mock_env.cc +1 -1
  154. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +14 -0
  155. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +1 -1
  156. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +5 -1
  157. package/deps/rocksdb/rocksdb/file/file_util.cc +3 -3
  158. package/deps/rocksdb/rocksdb/file/file_util.h +2 -0
  159. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +89 -0
  160. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +22 -7
  161. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +3 -2
  162. package/deps/rocksdb/rocksdb/file/readahead_raf.cc +1 -1
  163. package/deps/rocksdb/rocksdb/file/sequence_file_reader.cc +1 -1
  164. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +1 -1
  165. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +3 -0
  166. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +154 -74
  167. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +27 -7
  168. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +107 -28
  169. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +19 -0
  170. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +8 -0
  171. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +2 -0
  172. package/deps/rocksdb/rocksdb/include/rocksdb/memory_allocator.h +7 -1
  173. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +137 -152
  174. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +61 -26
  175. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +30 -26
  176. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +33 -16
  177. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +87 -8
  178. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +1 -1
  179. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +5 -0
  180. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
  181. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +1 -0
  182. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -0
  183. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +0 -1
  184. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  185. package/deps/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h +9 -2
  186. package/deps/rocksdb/rocksdb/logging/env_logger.h +2 -0
  187. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +78 -42
  188. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h +14 -9
  189. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +1 -0
  190. package/deps/rocksdb/rocksdb/memtable/skiplist_test.cc +1 -0
  191. package/deps/rocksdb/rocksdb/memtable/write_buffer_manager.cc +4 -9
  192. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +19 -11
  193. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +1 -1
  194. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +211 -555
  195. package/deps/rocksdb/rocksdb/monitoring/perf_step_timer.h +1 -1
  196. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +36 -2
  197. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +17 -7
  198. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.h +10 -7
  199. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +19 -18
  200. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.h +10 -2
  201. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +14 -0
  202. package/deps/rocksdb/rocksdb/options/cf_options.cc +35 -2
  203. package/deps/rocksdb/rocksdb/options/cf_options.h +5 -0
  204. package/deps/rocksdb/rocksdb/options/customizable_test.cc +1 -1
  205. package/deps/rocksdb/rocksdb/options/options.cc +12 -53
  206. package/deps/rocksdb/rocksdb/options/options_helper.cc +4 -0
  207. package/deps/rocksdb/rocksdb/options/options_parser.cc +11 -0
  208. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +32 -4
  209. package/deps/rocksdb/rocksdb/options/options_test.cc +89 -5
  210. package/deps/rocksdb/rocksdb/port/lang.h +27 -0
  211. package/deps/rocksdb/rocksdb/port/stack_trace.cc +67 -24
  212. package/deps/rocksdb/rocksdb/src.mk +2 -0
  213. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -3
  214. package/deps/rocksdb/rocksdb/table/block_based/block.cc +195 -35
  215. package/deps/rocksdb/rocksdb/table/block_based/block.h +197 -24
  216. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +71 -51
  217. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +7 -1
  218. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +4 -6
  219. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +3 -0
  220. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +43 -2
  221. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +36 -6
  222. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +266 -166
  223. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +44 -14
  224. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +1 -1
  225. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +63 -56
  226. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +8 -2
  227. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +4 -2
  228. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +10 -0
  229. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +14 -2
  230. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +918 -2
  231. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -2
  232. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +10 -9
  233. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +6 -8
  234. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.h +2 -2
  235. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +1 -1
  236. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +18 -23
  237. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +8 -8
  238. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +16 -32
  239. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +7 -8
  240. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +4 -5
  241. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +3 -3
  242. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +46 -53
  243. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +12 -12
  244. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +7 -9
  245. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +26 -23
  246. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h +2 -1
  247. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +3 -0
  248. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +4 -2
  249. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +3 -2
  250. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +7 -1
  251. package/deps/rocksdb/rocksdb/table/block_fetcher.h +1 -1
  252. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
  253. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +3 -2
  254. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +5 -2
  255. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h +4 -2
  256. package/deps/rocksdb/rocksdb/table/format.cc +4 -4
  257. package/deps/rocksdb/rocksdb/table/format.h +1 -1
  258. package/deps/rocksdb/rocksdb/table/get_context.cc +1 -1
  259. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +33 -22
  260. package/deps/rocksdb/rocksdb/table/meta_blocks.h +4 -0
  261. package/deps/rocksdb/rocksdb/table/mock_table.cc +4 -2
  262. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.h +1 -1
  263. package/deps/rocksdb/rocksdb/table/persistent_cache_options.h +1 -1
  264. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +18 -10
  265. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.h +4 -3
  266. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +10 -7
  267. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +4 -2
  268. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +11 -0
  269. package/deps/rocksdb/rocksdb/table/table_builder.h +14 -5
  270. package/deps/rocksdb/rocksdb/table/table_properties.cc +2 -0
  271. package/deps/rocksdb/rocksdb/table/table_reader.h +6 -3
  272. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +1 -1
  273. package/deps/rocksdb/rocksdb/table/table_test.cc +291 -34
  274. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +3 -1
  275. package/deps/rocksdb/rocksdb/test_util/testharness.h +5 -0
  276. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -2
  277. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +33 -17
  278. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +3 -1
  279. package/deps/rocksdb/rocksdb/util/bloom_impl.h +2 -2
  280. package/deps/rocksdb/rocksdb/util/compression.h +1 -1
  281. package/deps/rocksdb/rocksdb/util/crc32c.cc +24 -83
  282. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +7 -9
  283. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +4 -1
  284. package/deps/rocksdb/rocksdb/util/filter_bench.cc +1 -1
  285. package/deps/rocksdb/rocksdb/util/gflags_compat.h +9 -10
  286. package/deps/rocksdb/rocksdb/util/math.h +12 -7
  287. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +16 -18
  288. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +46 -2
  289. package/deps/rocksdb/rocksdb/util/ribbon_test.cc +6 -6
  290. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +12 -7
  291. package/deps/rocksdb/rocksdb/util/stop_watch.h +31 -13
  292. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +2 -0
  293. package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -1
  294. package/deps/rocksdb/rocksdb/util/udt_util.h +77 -0
  295. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.cc +2 -2
  296. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge_test.cc +1 -1
  297. package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.cc +1 -1
  298. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +1 -1
  299. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +1 -1
  300. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -1
  301. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +11 -1
  302. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +34 -1
  303. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +15 -0
  304. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +1 -1
  305. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +5 -1
  306. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +29 -1
  307. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +0 -1
  308. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -1
  309. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +6 -1
  310. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +10 -0
  311. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +6 -1
  312. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +5 -0
  313. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -0
  314. package/package.json +1 -1
  315. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  316. package/prebuilds/linux-x64/node.napi.node +0 -0
  317. /package/deps/rocksdb/rocksdb/memory/{memory_allocator.h → memory_allocator_impl.h} +0 -0
  318. /package/deps/rocksdb/rocksdb/monitoring/{statistics.h → statistics_impl.h} +0 -0
  319. /package/deps/rocksdb/rocksdb/table/block_based/{flush_block_policy.h → flush_block_policy_impl.h} +0 -0
  320. /package/deps/rocksdb/rocksdb/util/{rate_limiter.h → rate_limiter_impl.h} +0 -0
  321. /package/deps/rocksdb/rocksdb/utilities/agg_merge/{agg_merge.h → agg_merge_impl.h} +0 -0
@@ -37,6 +37,7 @@
37
37
  #include "db/pinned_iterators_manager.h"
38
38
  #include "db/table_cache.h"
39
39
  #include "db/version_builder.h"
40
+ #include "db/version_edit.h"
40
41
  #include "db/version_edit_handler.h"
41
42
  #include "table/compaction_merging_iterator.h"
42
43
 
@@ -941,7 +942,7 @@ class LevelIterator final : public InternalIterator {
941
942
  const std::shared_ptr<const SliceTransform>& prefix_extractor,
942
943
  bool should_sample, HistogramImpl* file_read_hist,
943
944
  TableReaderCaller caller, bool skip_filters, int level,
944
- RangeDelAggregator* range_del_agg,
945
+ uint8_t block_protection_bytes_per_key, RangeDelAggregator* range_del_agg,
945
946
  const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
946
947
  nullptr,
947
948
  bool allow_unprepared_value = false,
@@ -964,6 +965,7 @@ class LevelIterator final : public InternalIterator {
964
965
  pinned_iters_mgr_(nullptr),
965
966
  compaction_boundaries_(compaction_boundaries),
966
967
  is_next_read_sequential_(false),
968
+ block_protection_bytes_per_key_(block_protection_bytes_per_key),
967
969
  range_tombstone_iter_(nullptr),
968
970
  to_return_sentinel_(false) {
969
971
  // Empty level is not supported.
@@ -1107,7 +1109,8 @@ class LevelIterator final : public InternalIterator {
1107
1109
  nullptr /* don't need reference to table */, file_read_hist_, caller_,
1108
1110
  /*arena=*/nullptr, skip_filters_, level_,
1109
1111
  /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key,
1110
- largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_);
1112
+ largest_compaction_key, allow_unprepared_value_,
1113
+ block_protection_bytes_per_key_, range_tombstone_iter_);
1111
1114
  }
1112
1115
 
1113
1116
  // Check if current file being fully within iterate_lower_bound.
@@ -1154,6 +1157,8 @@ class LevelIterator final : public InternalIterator {
1154
1157
 
1155
1158
  bool is_next_read_sequential_;
1156
1159
 
1160
+ uint8_t block_protection_bytes_per_key_;
1161
+
1157
1162
  // This is set when this level iterator is used under a merging iterator
1158
1163
  // that processes range tombstones. range_tombstone_iter_ points to where the
1159
1164
  // merging iterator stores the range tombstones iterator for this level. When
@@ -1527,13 +1532,15 @@ void LevelIterator::InitFileIterator(size_t new_file_index) {
1527
1532
  }
1528
1533
  } // anonymous namespace
1529
1534
 
1530
- Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
1535
+ Status Version::GetTableProperties(const ReadOptions& read_options,
1536
+ std::shared_ptr<const TableProperties>* tp,
1531
1537
  const FileMetaData* file_meta,
1532
1538
  const std::string* fname) const {
1533
1539
  auto table_cache = cfd_->table_cache();
1534
1540
  auto ioptions = cfd_->ioptions();
1535
1541
  Status s = table_cache->GetTableProperties(
1536
- file_options_, cfd_->internal_comparator(), *file_meta, tp,
1542
+ file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp,
1543
+ mutable_cf_options_.block_protection_bytes_per_key,
1537
1544
  mutable_cf_options_.prefix_extractor, true /* no io */);
1538
1545
  if (s.ok()) {
1539
1546
  return s;
@@ -1565,14 +1572,16 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
1565
1572
  // the magic number check in the footer.
1566
1573
  std::unique_ptr<RandomAccessFileReader> file_reader(
1567
1574
  new RandomAccessFileReader(
1568
- std::move(file), file_name, nullptr /* env */, io_tracer_,
1569
- nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
1570
- nullptr /* rate_limiter */, ioptions->listeners));
1575
+ std::move(file), file_name, ioptions->clock /* clock */, io_tracer_,
1576
+ ioptions->stats /* stats */,
1577
+ Histograms::SST_READ_MICROS /* hist_type */,
1578
+ nullptr /* file_read_hist */, nullptr /* rate_limiter */,
1579
+ ioptions->listeners));
1571
1580
  std::unique_ptr<TableProperties> props;
1572
1581
  s = ReadTableProperties(
1573
1582
  file_reader.get(), file_meta->fd.GetFileSize(),
1574
1583
  Footer::kNullTableMagicNumber /* table's magic number */, *ioptions,
1575
- &props);
1584
+ read_options, &props);
1576
1585
  if (!s.ok()) {
1577
1586
  return s;
1578
1587
  }
@@ -1581,10 +1590,11 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
1581
1590
  return s;
1582
1591
  }
1583
1592
 
1584
- Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
1593
+ Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options,
1594
+ TablePropertiesCollection* props) {
1585
1595
  Status s;
1586
1596
  for (int level = 0; level < storage_info_.num_levels_; level++) {
1587
- s = GetPropertiesOfAllTables(props, level);
1597
+ s = GetPropertiesOfAllTables(read_options, props, level);
1588
1598
  if (!s.ok()) {
1589
1599
  return s;
1590
1600
  }
@@ -1602,6 +1612,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
1602
1612
 
1603
1613
  std::stringstream ss;
1604
1614
 
1615
+ // TODO: plumb Env::IOActivity
1616
+ const ReadOptions read_options;
1605
1617
  for (int level = 0; level < storage_info_.num_levels_; level++) {
1606
1618
  for (const auto& file_meta : storage_info_.files_[level]) {
1607
1619
  auto fname =
@@ -1614,7 +1626,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
1614
1626
  std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
1615
1627
 
1616
1628
  Status s = table_cache->GetRangeTombstoneIterator(
1617
- ReadOptions(), cfd_->internal_comparator(), *file_meta,
1629
+ read_options, cfd_->internal_comparator(), *file_meta,
1630
+ cfd_->GetLatestMutableCFOptions()->block_protection_bytes_per_key,
1618
1631
  &tombstone_iter);
1619
1632
  if (!s.ok()) {
1620
1633
  return s;
@@ -1648,7 +1661,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
1648
1661
  return Status::OK();
1649
1662
  }
1650
1663
 
1651
- Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
1664
+ Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options,
1665
+ TablePropertiesCollection* props,
1652
1666
  int level) {
1653
1667
  for (const auto& file_meta : storage_info_.files_[level]) {
1654
1668
  auto fname =
@@ -1657,7 +1671,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
1657
1671
  // 1. If the table is already present in table cache, load table
1658
1672
  // properties from there.
1659
1673
  std::shared_ptr<const TableProperties> table_properties;
1660
- Status s = GetTableProperties(&table_properties, file_meta, &fname);
1674
+ Status s =
1675
+ GetTableProperties(read_options, &table_properties, file_meta, &fname);
1661
1676
  if (s.ok()) {
1662
1677
  props->insert({fname, table_properties});
1663
1678
  } else {
@@ -1669,7 +1684,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
1669
1684
  }
1670
1685
 
1671
1686
  Status Version::GetPropertiesOfTablesInRange(
1672
- const Range* range, std::size_t n, TablePropertiesCollection* props) const {
1687
+ const ReadOptions& read_options, const Range* range, std::size_t n,
1688
+ TablePropertiesCollection* props) const {
1673
1689
  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
1674
1690
  for (decltype(n) i = 0; i < n; i++) {
1675
1691
  // Convert user_key into a corresponding internal key.
@@ -1686,7 +1702,8 @@ Status Version::GetPropertiesOfTablesInRange(
1686
1702
  // 1. If the table is already present in table cache, load table
1687
1703
  // properties from there.
1688
1704
  std::shared_ptr<const TableProperties> table_properties;
1689
- Status s = GetTableProperties(&table_properties, file_meta, &fname);
1705
+ Status s = GetTableProperties(read_options, &table_properties,
1706
+ file_meta, &fname);
1690
1707
  if (s.ok()) {
1691
1708
  props->insert({fname, table_properties});
1692
1709
  } else {
@@ -1701,13 +1718,14 @@ Status Version::GetPropertiesOfTablesInRange(
1701
1718
  }
1702
1719
 
1703
1720
  Status Version::GetAggregatedTableProperties(
1704
- std::shared_ptr<const TableProperties>* tp, int level) {
1721
+ const ReadOptions& read_options, std::shared_ptr<const TableProperties>* tp,
1722
+ int level) {
1705
1723
  TablePropertiesCollection props;
1706
1724
  Status s;
1707
1725
  if (level < 0) {
1708
- s = GetPropertiesOfAllTables(&props);
1726
+ s = GetPropertiesOfAllTables(read_options, &props);
1709
1727
  } else {
1710
- s = GetPropertiesOfAllTables(&props, level);
1728
+ s = GetPropertiesOfAllTables(read_options, &props, level);
1711
1729
  }
1712
1730
  if (!s.ok()) {
1713
1731
  return s;
@@ -1721,13 +1739,14 @@ Status Version::GetAggregatedTableProperties(
1721
1739
  return Status::OK();
1722
1740
  }
1723
1741
 
1724
- size_t Version::GetMemoryUsageByTableReaders() {
1742
+ size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) {
1725
1743
  size_t total_usage = 0;
1726
1744
  for (auto& file_level : storage_info_.level_files_brief_) {
1727
1745
  for (size_t i = 0; i < file_level.num_files; i++) {
1728
1746
  total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
1729
- file_options_, cfd_->internal_comparator(),
1747
+ file_options_, read_options, cfd_->internal_comparator(),
1730
1748
  *file_level.files[i].file_metadata,
1749
+ mutable_cf_options_.block_protection_bytes_per_key,
1731
1750
  mutable_cf_options_.prefix_extractor);
1732
1751
  }
1733
1752
  }
@@ -1807,6 +1826,49 @@ uint64_t Version::GetSstFilesSize() {
1807
1826
  return sst_files_size;
1808
1827
  }
1809
1828
 
1829
+ void Version::GetSstFilesBoundaryKeys(Slice* smallest_user_key,
1830
+ Slice* largest_user_key) {
1831
+ smallest_user_key->clear();
1832
+ largest_user_key->clear();
1833
+ bool initialized = false;
1834
+ const Comparator* ucmp = storage_info_.user_comparator_;
1835
+ for (int level = 0; level < cfd_->NumberLevels(); level++) {
1836
+ if (storage_info_.LevelFiles(level).size() == 0) {
1837
+ continue;
1838
+ }
1839
+ if (level == 0) {
1840
+ // we need to consider all files on level 0
1841
+ for (const auto& file : storage_info_.LevelFiles(level)) {
1842
+ const Slice& start_user_key = file->smallest.user_key();
1843
+ if (!initialized ||
1844
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
1845
+ *smallest_user_key = start_user_key;
1846
+ }
1847
+ const Slice& end_user_key = file->largest.user_key();
1848
+ if (!initialized ||
1849
+ ucmp->Compare(end_user_key, *largest_user_key) > 0) {
1850
+ *largest_user_key = end_user_key;
1851
+ }
1852
+ initialized = true;
1853
+ }
1854
+ } else {
1855
+ // we only need to consider the first and last file
1856
+ const Slice& start_user_key =
1857
+ storage_info_.LevelFiles(level)[0]->smallest.user_key();
1858
+ if (!initialized ||
1859
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
1860
+ *smallest_user_key = start_user_key;
1861
+ }
1862
+ const Slice& end_user_key =
1863
+ storage_info_.LevelFiles(level).back()->largest.user_key();
1864
+ if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
1865
+ *largest_user_key = end_user_key;
1866
+ }
1867
+ initialized = true;
1868
+ }
1869
+ }
1870
+ }
1871
+
1810
1872
  void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
1811
1873
  uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
1812
1874
  for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
@@ -1837,6 +1899,7 @@ InternalIterator* Version::TEST_GetLevelIterator(
1837
1899
  mutable_cf_options_.prefix_extractor, should_sample_file_read(),
1838
1900
  cfd_->internal_stats()->GetFileReadHist(level),
1839
1901
  TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
1902
+ mutable_cf_options_.block_protection_bytes_per_key,
1840
1903
  nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
1841
1904
  allow_unprepared_value, &tombstone_iter_ptr);
1842
1905
  if (read_options.ignore_range_deletions) {
@@ -1935,7 +1998,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
1935
1998
  /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
1936
1999
  /*smallest_compaction_key=*/nullptr,
1937
2000
  /*largest_compaction_key=*/nullptr, allow_unprepared_value,
1938
- &tombstone_iter);
2001
+ mutable_cf_options_.block_protection_bytes_per_key, &tombstone_iter);
1939
2002
  if (read_options.ignore_range_deletions) {
1940
2003
  merge_iter_builder->AddIterator(table_iter);
1941
2004
  } else {
@@ -1964,8 +2027,10 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
1964
2027
  mutable_cf_options_.prefix_extractor, should_sample_file_read(),
1965
2028
  cfd_->internal_stats()->GetFileReadHist(level),
1966
2029
  TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
1967
- /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr,
1968
- allow_unprepared_value, &tombstone_iter_ptr);
2030
+ mutable_cf_options_.block_protection_bytes_per_key,
2031
+ /*range_del_agg=*/nullptr,
2032
+ /*compaction_boundaries=*/nullptr, allow_unprepared_value,
2033
+ &tombstone_iter_ptr);
1969
2034
  if (read_options.ignore_range_deletions) {
1970
2035
  merge_iter_builder->AddIterator(level_iter);
1971
2036
  } else {
@@ -2008,7 +2073,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
2008
2073
  /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
2009
2074
  /*smallest_compaction_key=*/nullptr,
2010
2075
  /*largest_compaction_key=*/nullptr,
2011
- /*allow_unprepared_value=*/false));
2076
+ /*allow_unprepared_value=*/false,
2077
+ mutable_cf_options_.block_protection_bytes_per_key));
2012
2078
  status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
2013
2079
  iter.get(), overlap);
2014
2080
  if (!status.ok() || *overlap) {
@@ -2023,7 +2089,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
2023
2089
  mutable_cf_options_.prefix_extractor, should_sample_file_read(),
2024
2090
  cfd_->internal_stats()->GetFileReadHist(level),
2025
2091
  TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
2026
- &range_del_agg));
2092
+ mutable_cf_options_.block_protection_bytes_per_key, &range_del_agg,
2093
+ nullptr, false));
2027
2094
  status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
2028
2095
  iter.get(), overlap);
2029
2096
  }
@@ -2050,6 +2117,7 @@ VersionStorageInfo::VersionStorageInfo(
2050
2117
  compaction_style_(compaction_style),
2051
2118
  files_(new std::vector<FileMetaData*>[num_levels_]),
2052
2119
  base_level_(num_levels_ == 1 ? -1 : 1),
2120
+ lowest_unnecessary_level_(-1),
2053
2121
  level_multiplier_(0.0),
2054
2122
  files_by_compaction_pri_(num_levels_),
2055
2123
  level0_non_overlapping_(false),
@@ -2321,7 +2389,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
2321
2389
  StopWatchNano timer(clock_, timer_enabled /* auto_start */);
2322
2390
  *status = table_cache_->Get(
2323
2391
  read_options, *internal_comparator(), *f->file_metadata, ikey,
2324
- &get_context, mutable_cf_options_.prefix_extractor,
2392
+ &get_context, mutable_cf_options_.block_protection_bytes_per_key,
2393
+ mutable_cf_options_.prefix_extractor,
2325
2394
  cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
2326
2395
  IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
2327
2396
  fp.IsHitFileLastInLevel()),
@@ -2566,7 +2635,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2566
2635
  read_options, *internal_comparator(), *f->file_metadata,
2567
2636
  mutable_cf_options_.prefix_extractor,
2568
2637
  cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
2569
- fp.GetHitFileLevel(), &file_range, &table_handle);
2638
+ fp.GetHitFileLevel(), &file_range, &table_handle,
2639
+ mutable_cf_options_.block_protection_bytes_per_key);
2570
2640
  skip_range_deletions = true;
2571
2641
  if (status.ok()) {
2572
2642
  skip_filters = true;
@@ -2756,7 +2826,8 @@ Status Version::ProcessBatch(
2756
2826
  read_options, *internal_comparator(), *f->file_metadata,
2757
2827
  mutable_cf_options_.prefix_extractor,
2758
2828
  cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
2759
- fp.GetHitFileLevel(), &file_range, &table_handle);
2829
+ fp.GetHitFileLevel(), &file_range, &table_handle,
2830
+ mutable_cf_options_.block_protection_bytes_per_key);
2760
2831
  if (status.ok()) {
2761
2832
  skip_filters = true;
2762
2833
  skip_range_deletions = true;
@@ -2983,24 +3054,26 @@ void VersionStorageInfo::PrepareForVersionAppend(
2983
3054
  }
2984
3055
 
2985
3056
  void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options,
3057
+ const ReadOptions& read_options,
2986
3058
  bool update_stats) {
2987
3059
  TEST_SYNC_POINT_CALLBACK(
2988
3060
  "Version::PrepareAppend:forced_check",
2989
3061
  reinterpret_cast<void*>(&storage_info_.force_consistency_checks_));
2990
3062
 
2991
3063
  if (update_stats) {
2992
- UpdateAccumulatedStats();
3064
+ UpdateAccumulatedStats(read_options);
2993
3065
  }
2994
3066
 
2995
3067
  storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options);
2996
3068
  }
2997
3069
 
2998
- bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
3070
+ bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options,
3071
+ FileMetaData* file_meta) {
2999
3072
  if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) {
3000
3073
  return false;
3001
3074
  }
3002
3075
  std::shared_ptr<const TableProperties> tp;
3003
- Status s = GetTableProperties(&tp, file_meta);
3076
+ Status s = GetTableProperties(read_options, &tp, file_meta);
3004
3077
  file_meta->init_stats_from_file = true;
3005
3078
  if (!s.ok()) {
3006
3079
  ROCKS_LOG_ERROR(vset_->db_options_->info_log,
@@ -3045,7 +3118,7 @@ void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
3045
3118
  }
3046
3119
  }
3047
3120
 
3048
- void Version::UpdateAccumulatedStats() {
3121
+ void Version::UpdateAccumulatedStats(const ReadOptions& read_options) {
3049
3122
  // maximum number of table properties loaded from files.
3050
3123
  const int kMaxInitCount = 20;
3051
3124
  int init_count = 0;
@@ -3063,7 +3136,7 @@ void Version::UpdateAccumulatedStats() {
3063
3136
  level < storage_info_.num_levels_ && init_count < kMaxInitCount;
3064
3137
  ++level) {
3065
3138
  for (auto* file_meta : storage_info_.files_[level]) {
3066
- if (MaybeInitializeFileMetaData(file_meta)) {
3139
+ if (MaybeInitializeFileMetaData(read_options, file_meta)) {
3067
3140
  // each FileMeta will be initialized only once.
3068
3141
  storage_info_.UpdateAccumulatedStats(file_meta);
3069
3142
  // when option "max_open_files" is -1, all the file metadata has
@@ -3088,7 +3161,8 @@ void Version::UpdateAccumulatedStats() {
3088
3161
  storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
3089
3162
  for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
3090
3163
  storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
3091
- if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
3164
+ if (MaybeInitializeFileMetaData(read_options,
3165
+ storage_info_.files_[level][i])) {
3092
3166
  storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
3093
3167
  }
3094
3168
  }
@@ -3252,6 +3326,55 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
3252
3326
  }
3253
3327
  return ttl_expired_files_count;
3254
3328
  }
3329
+
3330
+ bool ShouldChangeFileTemperature(const ImmutableOptions& ioptions,
3331
+ const MutableCFOptions& mutable_cf_options,
3332
+ const std::vector<FileMetaData*>& files) {
3333
+ const std::vector<FileTemperatureAge>& ages =
3334
+ mutable_cf_options.compaction_options_fifo
3335
+ .file_temperature_age_thresholds;
3336
+ if (ages.empty()) {
3337
+ return false;
3338
+ }
3339
+ if (files.empty()) {
3340
+ return false;
3341
+ }
3342
+ int64_t _current_time;
3343
+ auto status = ioptions.clock->GetCurrentTime(&_current_time);
3344
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
3345
+ // We use oldest_ancestor_time of a file to be the estimate age of
3346
+ // the file just older than it. This is the same logic used in
3347
+ // FIFOCompactionPicker::PickTemperatureChangeCompaction().
3348
+ if (status.ok() && current_time >= ages[0].age) {
3349
+ uint64_t create_time_threshold = current_time - ages[0].age;
3350
+ Temperature target_temp;
3351
+ assert(files.size() >= 1);
3352
+ for (size_t index = files.size() - 1; index >= 1; --index) {
3353
+ FileMetaData* cur_file = files[index];
3354
+ FileMetaData* prev_file = files[index - 1];
3355
+ if (!cur_file->being_compacted) {
3356
+ uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime();
3357
+ if (oldest_ancestor_time == kUnknownOldestAncesterTime) {
3358
+ return false;
3359
+ }
3360
+ if (oldest_ancestor_time > create_time_threshold) {
3361
+ return false;
3362
+ }
3363
+ target_temp = ages[0].temperature;
3364
+ for (size_t i = 1; i < ages.size(); ++i) {
3365
+ if (current_time >= ages[i].age &&
3366
+ oldest_ancestor_time <= current_time - ages[i].age) {
3367
+ target_temp = ages[i].temperature;
3368
+ }
3369
+ }
3370
+ if (cur_file->temperature != target_temp) {
3371
+ return true;
3372
+ }
3373
+ }
3374
+ }
3375
+ }
3376
+ return false;
3377
+ }
3255
3378
  } // anonymous namespace
3256
3379
 
3257
3380
  void VersionStorageInfo::ComputeCompactionScore(
@@ -3262,7 +3385,7 @@ void VersionStorageInfo::ComputeCompactionScore(
3262
3385
  // the level's target size, and 1.0 is the threshold for triggering
3263
3386
  // compaction. Higher score means higher prioritization.
3264
3387
  // Now we keep the compaction triggering condition, but consider more
3265
- // factors for priorization, while still keeping the 1.0 threshold.
3388
+ // factors for prioritization, while still keeping the 1.0 threshold.
3266
3389
  // In order to provide flexibility for reducing score while still
3267
3390
  // maintaining it to be over 1.0, we scale the original score by 10x
3268
3391
  // if it is larger than 1.0.
@@ -3295,7 +3418,7 @@ void VersionStorageInfo::ComputeCompactionScore(
3295
3418
  // compaction score for the whole DB. Adding other levels as if
3296
3419
  // they are L0 files.
3297
3420
  for (int i = 1; i < num_levels(); i++) {
3298
- // Its possible that a subset of the files in a level may be in a
3421
+ // It's possible that a subset of the files in a level may be in a
3299
3422
  // compaction, due to delete triggered compaction or trivial move.
3300
3423
  // In that case, the below check may not catch a level being
3301
3424
  // compacted as it only checks the first file. The worst that can
@@ -3309,22 +3432,25 @@ void VersionStorageInfo::ComputeCompactionScore(
3309
3432
  if (compaction_style_ == kCompactionStyleFIFO) {
3310
3433
  score = static_cast<double>(total_size) /
3311
3434
  mutable_cf_options.compaction_options_fifo.max_table_files_size;
3312
- if (mutable_cf_options.compaction_options_fifo.allow_compaction ||
3313
- mutable_cf_options.compaction_options_fifo.age_for_warm > 0) {
3314
- // Warm tier move can happen at any time. It's too expensive to
3315
- // check very file's timestamp now. For now, just trigger it
3316
- // slightly more frequently than FIFO compaction so that this
3317
- // happens first.
3435
+ if (score < 1 &&
3436
+ mutable_cf_options.compaction_options_fifo.allow_compaction) {
3318
3437
  score = std::max(
3319
3438
  static_cast<double>(num_sorted_runs) /
3320
3439
  mutable_cf_options.level0_file_num_compaction_trigger,
3321
3440
  score);
3322
3441
  }
3323
- if (mutable_cf_options.ttl > 0) {
3324
- score = std::max(
3325
- static_cast<double>(GetExpiredTtlFilesCount(
3326
- immutable_options, mutable_cf_options, files_[level])),
3327
- score);
3442
+ if (score < 1 && mutable_cf_options.ttl > 0) {
3443
+ score =
3444
+ std::max(static_cast<double>(GetExpiredTtlFilesCount(
3445
+ immutable_options, mutable_cf_options, files_[0])),
3446
+ score);
3447
+ }
3448
+ if (score < 1 &&
3449
+ ShouldChangeFileTemperature(immutable_options, mutable_cf_options,
3450
+ files_[0])) {
3451
+ // For FIFO, just need a large enough score to trigger compaction.
3452
+ const double kScoreForNeedCompaction = 1.1;
3453
+ score = kScoreForNeedCompaction;
3328
3454
  }
3329
3455
  } else {
3330
3456
  score = static_cast<double>(num_sorted_runs) /
@@ -3344,7 +3470,7 @@ void VersionStorageInfo::ComputeCompactionScore(
3344
3470
  // When calculating estimated_compaction_needed_bytes, we assume
3345
3471
  // L0 is qualified as pending compactions. We will need to make
3346
3472
  // sure that it qualifies for compaction.
3347
- // It might be guafanteed by logic below anyway, but we are
3473
+ // It might be guaranteed by logic below anyway, but we are
3348
3474
  // explicit here to make sure we don't stop writes with no
3349
3475
  // compaction scheduled.
3350
3476
  score = std::max(score, 1.01);
@@ -3373,7 +3499,7 @@ void VersionStorageInfo::ComputeCompactionScore(
3373
3499
  }
3374
3500
  }
3375
3501
  }
3376
- } else {
3502
+ } else { // level > 0
3377
3503
  // Compute the ratio of current size to size limit.
3378
3504
  uint64_t level_bytes_no_compacting = 0;
3379
3505
  uint64_t level_total_bytes = 0;
@@ -3383,21 +3509,36 @@ void VersionStorageInfo::ComputeCompactionScore(
3383
3509
  level_bytes_no_compacting += f->compensated_file_size;
3384
3510
  }
3385
3511
  }
3386
- if (!immutable_options.level_compaction_dynamic_level_bytes ||
3387
- level_bytes_no_compacting < MaxBytesForLevel(level)) {
3512
+ if (!immutable_options.level_compaction_dynamic_level_bytes) {
3388
3513
  score = static_cast<double>(level_bytes_no_compacting) /
3389
3514
  MaxBytesForLevel(level);
3390
3515
  } else {
3391
- // If there are a large mount of data being compacted down to the
3392
- // current level soon, we would de-prioritize compaction from
3393
- // a level where the incoming data would be a large ratio. We do
3394
- // it by dividing level size not by target level size, but
3395
- // the target size and the incoming compaction bytes.
3396
- score = static_cast<double>(level_bytes_no_compacting) /
3397
- (MaxBytesForLevel(level) + total_downcompact_bytes) *
3398
- kScoreScale;
3516
+ if (level_bytes_no_compacting < MaxBytesForLevel(level)) {
3517
+ score = static_cast<double>(level_bytes_no_compacting) /
3518
+ MaxBytesForLevel(level);
3519
+ } else {
3520
+ // If there are a large mount of data being compacted down to the
3521
+ // current level soon, we would de-prioritize compaction from
3522
+ // a level where the incoming data would be a large ratio. We do
3523
+ // it by dividing level size not by target level size, but
3524
+ // the target size and the incoming compaction bytes.
3525
+ score = static_cast<double>(level_bytes_no_compacting) /
3526
+ (MaxBytesForLevel(level) + total_downcompact_bytes) *
3527
+ kScoreScale;
3528
+ }
3529
+ // Drain unnecessary levels, but with lower priority compared to
3530
+ // when L0 is eligible. Only non-empty levels can be unnecessary.
3531
+ // If there is no unnecessary levels, lowest_unnecessary_level_ = -1.
3532
+ if (level_bytes_no_compacting > 0 &&
3533
+ level <= lowest_unnecessary_level_) {
3534
+ score = std::max(
3535
+ score, kScoreScale *
3536
+ (1.001 + 0.001 * (lowest_unnecessary_level_ - level)));
3537
+ }
3399
3538
  }
3400
- if (level_total_bytes > MaxBytesForLevel(level)) {
3539
+ if (level <= lowest_unnecessary_level_) {
3540
+ total_downcompact_bytes += level_total_bytes;
3541
+ } else if (level_total_bytes > MaxBytesForLevel(level)) {
3401
3542
  total_downcompact_bytes +=
3402
3543
  static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
3403
3544
  }
@@ -4470,6 +4611,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
4470
4611
  }
4471
4612
  }
4472
4613
  } else {
4614
+ assert(ioptions.compaction_style == kCompactionStyleLevel);
4473
4615
  uint64_t max_level_size = 0;
4474
4616
 
4475
4617
  int first_non_empty_level = -1;
@@ -4494,11 +4636,13 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
4494
4636
  level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
4495
4637
  }
4496
4638
 
4639
+ lowest_unnecessary_level_ = -1;
4497
4640
  if (max_level_size == 0) {
4498
4641
  // No data for L1 and up. L0 compacts to last level directly.
4499
4642
  // No compaction from L1+ needs to be scheduled.
4500
4643
  base_level_ = num_levels_ - 1;
4501
4644
  } else {
4645
+ assert(first_non_empty_level >= 1);
4502
4646
  uint64_t base_bytes_max = options.max_bytes_for_level_base;
4503
4647
  uint64_t base_bytes_min = static_cast<uint64_t>(
4504
4648
  base_bytes_max / options.max_bytes_for_level_multiplier);
@@ -4509,20 +4653,41 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
4509
4653
  // Round up after dividing
4510
4654
  cur_level_size = static_cast<uint64_t>(
4511
4655
  cur_level_size / options.max_bytes_for_level_multiplier);
4656
+ if (lowest_unnecessary_level_ == -1 &&
4657
+ cur_level_size <= base_bytes_min &&
4658
+ (ioptions.preclude_last_level_data_seconds == 0 ||
4659
+ i < num_levels_ - 2)) {
4660
+ // When per_key_placement is enabled, the penultimate level is
4661
+ // necessary.
4662
+ lowest_unnecessary_level_ = i;
4663
+ }
4512
4664
  }
4513
4665
 
4514
4666
  // Calculate base level and its size.
4515
4667
  uint64_t base_level_size;
4516
4668
  if (cur_level_size <= base_bytes_min) {
4669
+ // If per_key_placement is not enabled,
4670
+ // either there is only one non-empty level after level 0,
4671
+ // which can less than base_bytes_min AND necessary,
4672
+ // or there is some unnecessary level.
4673
+ assert(first_non_empty_level == num_levels_ - 1 ||
4674
+ ioptions.preclude_last_level_data_seconds > 0 ||
4675
+ lowest_unnecessary_level_ != -1);
4517
4676
  // Case 1. If we make target size of last level to be max_level_size,
4518
4677
  // target size of the first non-empty level would be smaller than
4519
4678
  // base_bytes_min. We set it be base_bytes_min.
4520
4679
  base_level_size = base_bytes_min + 1U;
4521
4680
  base_level_ = first_non_empty_level;
4522
- ROCKS_LOG_INFO(ioptions.logger,
4523
- "More existing levels in DB than needed. "
4524
- "max_bytes_for_level_multiplier may not be guaranteed.");
4681
+ if (base_level_ < num_levels_ - 1) {
4682
+ ROCKS_LOG_INFO(
4683
+ ioptions.logger,
4684
+ "More existing levels in DB than needed: all non-zero "
4685
+ "levels <= level %d are unnecessary. "
4686
+ "max_bytes_for_level_multiplier may not be guaranteed.",
4687
+ lowest_unnecessary_level_);
4688
+ }
4525
4689
  } else {
4690
+ assert(lowest_unnecessary_level_ == -1);
4526
4691
  // Find base level (where L0 data is compacted to).
4527
4692
  base_level_ = first_non_empty_level;
4528
4693
  while (base_level_ > 1 && cur_level_size > base_bytes_max) {
@@ -4931,7 +5096,8 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
4931
5096
  Status VersionSet::ProcessManifestWrites(
4932
5097
  std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
4933
5098
  FSDirectory* dir_contains_current_file, bool new_descriptor_log,
4934
- const ColumnFamilyOptions* new_cf_options) {
5099
+ const ColumnFamilyOptions* new_cf_options,
5100
+ const ReadOptions& read_options) {
4935
5101
  mu->AssertHeld();
4936
5102
  assert(!writers.empty());
4937
5103
  ManifestWriter& first_writer = writers.front();
@@ -5162,7 +5328,8 @@ Status VersionSet::ProcessManifestWrites(
5162
5328
  true /* prefetch_index_and_filter_in_cache */,
5163
5329
  false /* is_initial_load */,
5164
5330
  mutable_cf_options_ptrs[i]->prefix_extractor,
5165
- MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]));
5331
+ MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options,
5332
+ mutable_cf_options_ptrs[i]->block_protection_bytes_per_key);
5166
5333
  if (!s.ok()) {
5167
5334
  if (db_options_->paranoid_checks) {
5168
5335
  break;
@@ -5207,7 +5374,8 @@ Status VersionSet::ProcessManifestWrites(
5207
5374
  constexpr bool update_stats = true;
5208
5375
 
5209
5376
  for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
5210
- versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats);
5377
+ versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], read_options,
5378
+ update_stats);
5211
5379
  }
5212
5380
  }
5213
5381
 
@@ -5319,7 +5487,8 @@ Status VersionSet::ProcessManifestWrites(
5319
5487
  assert(batch_edits.size() == 1);
5320
5488
  assert(new_cf_options != nullptr);
5321
5489
  assert(max_last_sequence == descriptor_last_sequence_);
5322
- CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
5490
+ CreateColumnFamily(*new_cf_options, read_options,
5491
+ first_writer.edit_list.front());
5323
5492
  } else if (first_writer.edit_list.front()->is_column_family_drop_) {
5324
5493
  assert(batch_edits.size() == 1);
5325
5494
  assert(max_last_sequence == descriptor_last_sequence_);
@@ -5488,6 +5657,7 @@ void VersionSet::WakeUpWaitingManifestWriters() {
5488
5657
  Status VersionSet::LogAndApply(
5489
5658
  const autovector<ColumnFamilyData*>& column_family_datas,
5490
5659
  const autovector<const MutableCFOptions*>& mutable_cf_options_list,
5660
+ const ReadOptions& read_options,
5491
5661
  const autovector<autovector<VersionEdit*>>& edit_lists,
5492
5662
  InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
5493
5663
  bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options,
@@ -5565,7 +5735,8 @@ Status VersionSet::LogAndApply(
5565
5735
  return Status::ColumnFamilyDropped();
5566
5736
  }
5567
5737
  return ProcessManifestWrites(writers, mu, dir_contains_current_file,
5568
- new_descriptor_log, new_cf_options);
5738
+ new_descriptor_log, new_cf_options,
5739
+ read_options);
5569
5740
  }
5570
5741
 
5571
5742
  void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
@@ -5649,6 +5820,7 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
5649
5820
  Status VersionSet::Recover(
5650
5821
  const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
5651
5822
  std::string* db_id, bool no_error_if_files_missing) {
5823
+ const ReadOptions read_options(Env::IOActivity::kDBOpen);
5652
5824
  // Read "CURRENT" file, which contains a pointer to the current manifest
5653
5825
  // file
5654
5826
  std::string manifest_path;
@@ -5685,7 +5857,7 @@ Status VersionSet::Recover(
5685
5857
  VersionEditHandler handler(
5686
5858
  read_only, column_families, const_cast<VersionSet*>(this),
5687
5859
  /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_,
5688
- EpochNumberRequirement::kMightMissing);
5860
+ read_options, EpochNumberRequirement::kMightMissing);
5689
5861
  handler.Iterate(reader, &log_read_status);
5690
5862
  s = handler.status();
5691
5863
  if (s.ok()) {
@@ -5833,6 +6005,7 @@ Status VersionSet::TryRecoverFromOneManifest(
5833
6005
  const std::string& manifest_path,
5834
6006
  const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
5835
6007
  std::string* db_id, bool* has_missing_table_file) {
6008
+ const ReadOptions read_options(Env::IOActivity::kDBOpen);
5836
6009
  ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n",
5837
6010
  manifest_path.c_str());
5838
6011
  std::unique_ptr<SequentialFileReader> manifest_file_reader;
@@ -5857,7 +6030,7 @@ Status VersionSet::TryRecoverFromOneManifest(
5857
6030
  /*checksum=*/true, /*log_num=*/0);
5858
6031
  VersionEditHandlerPointInTime handler_pit(
5859
6032
  read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
5860
- EpochNumberRequirement::kMightMissing);
6033
+ read_options, EpochNumberRequirement::kMightMissing);
5861
6034
 
5862
6035
  handler_pit.Iterate(reader, &s);
5863
6036
 
@@ -5900,6 +6073,8 @@ Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
5900
6073
  Status VersionSet::ListColumnFamiliesFromManifest(
5901
6074
  const std::string& manifest_path, FileSystem* fs,
5902
6075
  std::vector<std::string>* column_families) {
6076
+ // TODO: plumb Env::IOActivity
6077
+ const ReadOptions read_options;
5903
6078
  std::unique_ptr<SequentialFileReader> file_reader;
5904
6079
  Status s;
5905
6080
  {
@@ -5919,7 +6094,7 @@ Status VersionSet::ListColumnFamiliesFromManifest(
5919
6094
  log::Reader reader(nullptr, std::move(file_reader), &reporter,
5920
6095
  true /* checksum */, 0 /* log_number */);
5921
6096
 
5922
- ListColumnFamiliesHandler handler;
6097
+ ListColumnFamiliesHandler handler(read_options);
5923
6098
  handler.Iterate(reader, &s);
5924
6099
 
5925
6100
  assert(column_families);
@@ -5942,6 +6117,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
5942
6117
  "Number of levels needs to be bigger than 1");
5943
6118
  }
5944
6119
 
6120
+ // TODO: plumb Env::IOActivity
6121
+ const ReadOptions read_options;
6122
+
5945
6123
  ImmutableDBOptions db_options(*options);
5946
6124
  ColumnFamilyOptions cf_options(*options);
5947
6125
  std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
@@ -6029,8 +6207,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
6029
6207
  InstrumentedMutex dummy_mutex;
6030
6208
  InstrumentedMutexLock l(&dummy_mutex);
6031
6209
  return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(),
6032
- mutable_cf_options, &ve, &dummy_mutex, nullptr,
6033
- true);
6210
+ mutable_cf_options, read_options, &ve,
6211
+ &dummy_mutex, nullptr, true);
6034
6212
  }
6035
6213
 
6036
6214
  // Get the checksum information including the checksum and checksum function
@@ -6103,6 +6281,9 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
6103
6281
  Status VersionSet::DumpManifest(Options& options, std::string& dscname,
6104
6282
  bool verbose, bool hex, bool json) {
6105
6283
  assert(options.env);
6284
+ // TODO: plumb Env::IOActivity
6285
+ const ReadOptions read_options;
6286
+
6106
6287
  std::vector<std::string> column_families;
6107
6288
  Status s = ListColumnFamiliesFromManifest(
6108
6289
  dscname, options.env->GetFileSystem().get(), &column_families);
@@ -6129,7 +6310,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
6129
6310
  cf_descs.emplace_back(cf, options);
6130
6311
  }
6131
6312
 
6132
- DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json);
6313
+ DumpManifestHandler handler(cf_descs, this, io_tracer_, read_options, verbose,
6314
+ hex, json);
6133
6315
  {
6134
6316
  VersionSet::LogReporter reporter;
6135
6317
  reporter.status = &s;
@@ -6267,7 +6449,7 @@ Status VersionSet::WriteCurrentStateToManifest(
6267
6449
  f->oldest_blob_file_number, f->oldest_ancester_time,
6268
6450
  f->file_creation_time, f->epoch_number, f->file_checksum,
6269
6451
  f->file_checksum_func_name, f->unique_id,
6270
- f->compensated_range_deletion_size);
6452
+ f->compensated_range_deletion_size, f->tail_size);
6271
6453
  }
6272
6454
  }
6273
6455
 
@@ -6332,6 +6514,7 @@ Status VersionSet::WriteCurrentStateToManifest(
6332
6514
  // we avoid doing binary search for the keys b and c twice and instead somehow
6333
6515
  // maintain state of where they first appear in the files.
6334
6516
  uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
6517
+ const ReadOptions& read_options,
6335
6518
  Version* v, const Slice& start,
6336
6519
  const Slice& end, int start_level,
6337
6520
  int end_level, TableReaderCaller caller) {
@@ -6411,8 +6594,8 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
6411
6594
  for (int i = idx_start + 1; i < idx_end; ++i) {
6412
6595
  uint64_t file_size = files_brief.files[i].fd.GetFileSize();
6413
6596
  // The entire file falls into the range, so we can just take its size.
6414
- assert(file_size ==
6415
- ApproximateSize(v, files_brief.files[i], start, end, caller));
6597
+ assert(file_size == ApproximateSize(read_options, v, files_brief.files[i],
6598
+ start, end, caller));
6416
6599
  total_full_size += file_size;
6417
6600
  }
6418
6601
 
@@ -6447,21 +6630,24 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
6447
6630
  // Estimate for all the first files (might also be last files), at each
6448
6631
  // level
6449
6632
  for (const auto file_ptr : first_files) {
6450
- total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
6633
+ total_full_size +=
6634
+ ApproximateSize(read_options, v, *file_ptr, start, end, caller);
6451
6635
  }
6452
6636
 
6453
6637
  // Estimate for all the last files, at each level
6454
6638
  for (const auto file_ptr : last_files) {
6455
6639
  // We could use ApproximateSize here, but calling ApproximateOffsetOf
6456
6640
  // directly is just more efficient.
6457
- total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller);
6641
+ total_full_size +=
6642
+ ApproximateOffsetOf(read_options, v, *file_ptr, end, caller);
6458
6643
  }
6459
6644
  }
6460
6645
 
6461
6646
  return total_full_size;
6462
6647
  }
6463
6648
 
6464
- uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
6649
+ uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options,
6650
+ Version* v, const FdWithKeyRange& f,
6465
6651
  const Slice& key,
6466
6652
  TableReaderCaller caller) {
6467
6653
  // pre-condition
@@ -6479,16 +6665,18 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
6479
6665
  // "key" falls in the range for this table. Add the
6480
6666
  // approximate offset of "key" within the table.
6481
6667
  TableCache* table_cache = v->cfd_->table_cache();
6668
+ const MutableCFOptions& cf_opts = v->GetMutableCFOptions();
6482
6669
  if (table_cache != nullptr) {
6483
6670
  result = table_cache->ApproximateOffsetOf(
6484
- key, *f.file_metadata, caller, icmp,
6485
- v->GetMutableCFOptions().prefix_extractor);
6671
+ read_options, key, *f.file_metadata, caller, icmp,
6672
+ cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor);
6486
6673
  }
6487
6674
  }
6488
6675
  return result;
6489
6676
  }
6490
6677
 
6491
- uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
6678
+ uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options,
6679
+ Version* v, const FdWithKeyRange& f,
6492
6680
  const Slice& start, const Slice& end,
6493
6681
  TableReaderCaller caller) {
6494
6682
  // pre-condition
@@ -6504,13 +6692,14 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
6504
6692
 
6505
6693
  if (icmp.Compare(f.smallest_key, start) >= 0) {
6506
6694
  // Start of the range is before the file start - approximate by end offset
6507
- return ApproximateOffsetOf(v, f, end, caller);
6695
+ return ApproximateOffsetOf(read_options, v, f, end, caller);
6508
6696
  }
6509
6697
 
6510
6698
  if (icmp.Compare(f.largest_key, end) < 0) {
6511
6699
  // End of the range is after the file end - approximate by subtracting
6512
6700
  // start offset from the file size
6513
- uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller);
6701
+ uint64_t start_offset =
6702
+ ApproximateOffsetOf(read_options, v, f, start, caller);
6514
6703
  assert(f.fd.GetFileSize() >= start_offset);
6515
6704
  return f.fd.GetFileSize() - start_offset;
6516
6705
  }
@@ -6520,9 +6709,10 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
6520
6709
  if (table_cache == nullptr) {
6521
6710
  return 0;
6522
6711
  }
6712
+ const MutableCFOptions& cf_opts = v->GetMutableCFOptions();
6523
6713
  return table_cache->ApproximateSize(
6524
- start, end, *f.file_metadata, caller, icmp,
6525
- v->GetMutableCFOptions().prefix_extractor);
6714
+ read_options, start, end, *f.file_metadata, caller, icmp,
6715
+ cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor);
6526
6716
  }
6527
6717
 
6528
6718
  void VersionSet::RemoveLiveFiles(
@@ -6681,6 +6871,7 @@ InternalIterator* VersionSet::MakeInputIterator(
6681
6871
  /*smallest_compaction_key=*/nullptr,
6682
6872
  /*largest_compaction_key=*/nullptr,
6683
6873
  /*allow_unprepared_value=*/false,
6874
+ c->mutable_cf_options()->block_protection_bytes_per_key,
6684
6875
  /*range_del_iter=*/&range_tombstone_iter);
6685
6876
  range_tombstones.emplace_back(range_tombstone_iter, nullptr);
6686
6877
  }
@@ -6694,8 +6885,9 @@ InternalIterator* VersionSet::MakeInputIterator(
6694
6885
  /*should_sample=*/false,
6695
6886
  /*no per level latency histogram=*/nullptr,
6696
6887
  TableReaderCaller::kCompaction, /*skip_filters=*/false,
6697
- /*level=*/static_cast<int>(c->level(which)), range_del_agg,
6698
- c->boundaries(which), false, &tombstone_iter_ptr);
6888
+ /*level=*/static_cast<int>(c->level(which)),
6889
+ c->mutable_cf_options()->block_protection_bytes_per_key,
6890
+ range_del_agg, c->boundaries(which), false, &tombstone_iter_ptr);
6699
6891
  range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
6700
6892
  }
6701
6893
  }
@@ -6812,7 +7004,8 @@ void VersionSet::GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
6812
7004
  }
6813
7005
 
6814
7006
  ColumnFamilyData* VersionSet::CreateColumnFamily(
6815
- const ColumnFamilyOptions& cf_options, const VersionEdit* edit) {
7007
+ const ColumnFamilyOptions& cf_options, const ReadOptions& read_options,
7008
+ const VersionEdit* edit) {
6816
7009
  assert(edit->is_column_family_add_);
6817
7010
 
6818
7011
  MutableCFOptions dummy_cf_options;
@@ -6831,7 +7024,8 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
6831
7024
 
6832
7025
  constexpr bool update_stats = false;
6833
7026
 
6834
- v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats);
7027
+ v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), read_options,
7028
+ update_stats);
6835
7029
 
6836
7030
  AppendVersion(new_cfd, v);
6837
7031
  // GetLatestMutableCFOptions() is safe here without mutex since the
@@ -6896,7 +7090,8 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
6896
7090
  return all_versions_blob_file_size;
6897
7091
  }
6898
7092
 
6899
- Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
7093
+ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options,
7094
+ ColumnFamilyData* cfd,
6900
7095
  const std::string& fpath, int level,
6901
7096
  const FileMetaData& meta) {
6902
7097
  uint64_t fsize = 0;
@@ -6929,7 +7124,8 @@ Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
6929
7124
  TableCache::TypedHandle* handle = nullptr;
6930
7125
  FileMetaData meta_copy = meta;
6931
7126
  status = table_cache->FindTable(
6932
- ReadOptions(), file_opts, *icmp, meta_copy, &handle, pe,
7127
+ read_options, file_opts, *icmp, meta_copy, &handle,
7128
+ cf_opts->block_protection_bytes_per_key, pe,
6933
7129
  /*no_io=*/false, /*record_read_stats=*/true,
6934
7130
  internal_stats->GetFileReadHist(level), false, level,
6935
7131
  /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
@@ -6973,9 +7169,9 @@ Status ReactiveVersionSet::Recover(
6973
7169
  log::Reader* reader = manifest_reader->get();
6974
7170
  assert(reader);
6975
7171
 
6976
- manifest_tailer_.reset(
6977
- new ManifestTailer(column_families, const_cast<ReactiveVersionSet*>(this),
6978
- io_tracer_, EpochNumberRequirement::kMightMissing));
7172
+ manifest_tailer_.reset(new ManifestTailer(
7173
+ column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_,
7174
+ read_options_, EpochNumberRequirement::kMightMissing));
6979
7175
 
6980
7176
  manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
6981
7177