@nxtedition/rocksdb 10.1.5 → 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255)
  1. package/binding.cc +19 -11
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +16 -5
  3. package/deps/rocksdb/rocksdb/Makefile +38 -15
  4. package/deps/rocksdb/rocksdb/TARGETS +10 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +58 -0
  6. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +4 -4
  7. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.h +4 -2
  8. package/deps/rocksdb/rocksdb/db/builder.cc +2 -2
  9. package/deps/rocksdb/rocksdb/db/builder.h +1 -1
  10. package/deps/rocksdb/rocksdb/db/c.cc +205 -6
  11. package/deps/rocksdb/rocksdb/db/c_test.c +189 -1
  12. package/deps/rocksdb/rocksdb/db/column_family.cc +28 -0
  13. package/deps/rocksdb/rocksdb/db/column_family.h +17 -0
  14. package/deps/rocksdb/rocksdb/db/column_family_test.cc +234 -60
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +8 -1
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +11 -9
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +4 -4
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -0
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +1 -0
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +22 -25
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +2 -0
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +112 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +72 -21
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +2 -0
  25. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +77 -0
  26. package/deps/rocksdb/rocksdb/db/convenience.cc +3 -0
  27. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +269 -112
  28. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +107 -43
  29. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +93 -24
  30. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +5 -5
  31. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +157 -68
  32. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +56 -15
  33. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +78 -105
  34. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +39 -9
  35. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  36. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +21 -14
  37. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +107 -63
  38. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +43 -2
  39. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +4 -0
  40. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +6 -0
  41. package/deps/rocksdb/rocksdb/db/db_test.cc +10 -2
  42. package/deps/rocksdb/rocksdb/db/db_test2.cc +1 -1
  43. package/deps/rocksdb/rocksdb/db/db_test_util.cc +5 -0
  44. package/deps/rocksdb/rocksdb/db/db_test_util.h +7 -6
  45. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +92 -2
  46. package/deps/rocksdb/rocksdb/db/error_handler.cc +34 -39
  47. package/deps/rocksdb/rocksdb/db/error_handler.h +3 -4
  48. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +8 -4
  49. package/deps/rocksdb/rocksdb/db/event_helpers.cc +6 -3
  50. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +71 -15
  51. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h +11 -0
  52. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +383 -4
  53. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +88 -72
  54. package/deps/rocksdb/rocksdb/db/flush_job.cc +30 -3
  55. package/deps/rocksdb/rocksdb/db/flush_job.h +14 -0
  56. package/deps/rocksdb/rocksdb/db/internal_stats.cc +60 -1
  57. package/deps/rocksdb/rocksdb/db/internal_stats.h +20 -1
  58. package/deps/rocksdb/rocksdb/db/log_writer.cc +24 -0
  59. package/deps/rocksdb/rocksdb/db/log_writer.h +5 -0
  60. package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
  61. package/deps/rocksdb/rocksdb/db/memtable.h +10 -10
  62. package/deps/rocksdb/rocksdb/db/memtable_list.cc +4 -4
  63. package/deps/rocksdb/rocksdb/db/multi_cf_iterator_impl.h +10 -3
  64. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +8 -10
  65. package/deps/rocksdb/rocksdb/db/repair.cc +4 -3
  66. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +30 -0
  67. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +9 -0
  68. package/deps/rocksdb/rocksdb/db/table_cache.cc +17 -2
  69. package/deps/rocksdb/rocksdb/db/table_cache.h +9 -1
  70. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +9 -2
  71. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +3 -1
  72. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +3 -3
  73. package/deps/rocksdb/rocksdb/db/transaction_log_impl.h +7 -7
  74. package/deps/rocksdb/rocksdb/db/version_edit.cc +0 -1
  75. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -6
  76. package/deps/rocksdb/rocksdb/db/version_set.cc +54 -31
  77. package/deps/rocksdb/rocksdb/db/version_set.h +14 -7
  78. package/deps/rocksdb/rocksdb/db/wal_manager.cc +37 -29
  79. package/deps/rocksdb/rocksdb/db/wal_manager.h +6 -5
  80. package/deps/rocksdb/rocksdb/db/wide/wide_columns_helper.cc +6 -0
  81. package/deps/rocksdb/rocksdb/db/write_batch.cc +54 -23
  82. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +46 -5
  83. package/deps/rocksdb/rocksdb/db/write_thread.cc +53 -5
  84. package/deps/rocksdb/rocksdb/db/write_thread.h +36 -4
  85. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -0
  86. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +5 -0
  87. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +57 -17
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +11 -3
  89. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +8 -4
  90. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +10 -25
  91. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +25 -88
  92. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_filters.cc +93 -0
  93. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_filters.h +16 -0
  94. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +43 -0
  95. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +109 -21
  96. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +8 -0
  97. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +666 -205
  98. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +55 -10
  99. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +18 -16
  100. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +19 -0
  101. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +5 -0
  102. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +782 -494
  103. package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +21 -0
  104. package/deps/rocksdb/rocksdb/env/env.cc +6 -0
  105. package/deps/rocksdb/rocksdb/env/io_posix.cc +0 -1
  106. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  107. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +34 -19
  108. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +29 -32
  109. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +41 -15
  110. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +4 -2
  111. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +63 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +16 -5
  113. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +5 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/iterator.h +0 -16
  115. package/deps/rocksdb/rocksdb/include/rocksdb/iterator_base.h +16 -0
  116. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +21 -0
  117. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +76 -3
  118. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +17 -0
  119. package/deps/rocksdb/rocksdb/include/rocksdb/transaction_log.h +12 -6
  120. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +31 -0
  121. package/deps/rocksdb/rocksdb/include/rocksdb/user_write_callback.h +29 -0
  122. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h +4 -2
  123. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h +0 -1
  124. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +17 -8
  125. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +2 -2
  126. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +46 -0
  127. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +7 -0
  128. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  129. package/deps/rocksdb/rocksdb/options/cf_options.cc +13 -2
  130. package/deps/rocksdb/rocksdb/options/cf_options.h +6 -2
  131. package/deps/rocksdb/rocksdb/options/db_options.cc +8 -0
  132. package/deps/rocksdb/rocksdb/options/db_options.h +9 -5
  133. package/deps/rocksdb/rocksdb/options/options.cc +3 -0
  134. package/deps/rocksdb/rocksdb/options/options_helper.cc +1 -0
  135. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +3 -1
  136. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +2 -2
  137. package/deps/rocksdb/rocksdb/port/stack_trace.cc +1 -0
  138. package/deps/rocksdb/rocksdb/port/win/port_win.cc +3 -2
  139. package/deps/rocksdb/rocksdb/src.mk +4 -0
  140. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +1 -2
  141. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +4 -2
  142. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +15 -0
  143. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +102 -41
  144. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +15 -7
  145. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +1 -3
  146. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -6
  147. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +31 -0
  148. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +6 -0
  149. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +10 -5
  150. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +11 -15
  151. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +17 -11
  152. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.h +5 -2
  153. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +28 -21
  154. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +9 -11
  155. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +16 -16
  156. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +1 -2
  157. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +14 -9
  158. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +4 -1
  159. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +82 -41
  160. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +13 -14
  161. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +18 -22
  162. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +51 -13
  163. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h +2 -0
  164. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +3 -11
  165. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +2 -3
  166. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +9 -10
  167. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +3 -2
  168. package/deps/rocksdb/rocksdb/table/format.cc +1 -2
  169. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +18 -13
  170. package/deps/rocksdb/rocksdb/table/merging_iterator.h +5 -3
  171. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +2 -2
  172. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +1 -1
  173. package/deps/rocksdb/rocksdb/table/sst_file_writer_collectors.h +3 -1
  174. package/deps/rocksdb/rocksdb/table/table_builder.h +8 -7
  175. package/deps/rocksdb/rocksdb/table/table_reader.h +9 -0
  176. package/deps/rocksdb/rocksdb/test_util/testutil.cc +1 -0
  177. package/deps/rocksdb/rocksdb/test_util/testutil.h +6 -0
  178. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +19 -0
  179. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +434 -110
  180. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +3 -1
  181. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +3 -0
  182. package/deps/rocksdb/rocksdb/util/aligned_storage.h +24 -0
  183. package/deps/rocksdb/rocksdb/util/filter_bench.cc +1 -1
  184. package/deps/rocksdb/rocksdb/util/random.cc +2 -1
  185. package/deps/rocksdb/rocksdb/util/stderr_logger.h +1 -1
  186. package/deps/rocksdb/rocksdb/util/udt_util.cc +33 -0
  187. package/deps/rocksdb/rocksdb/util/udt_util.h +7 -0
  188. package/deps/rocksdb/rocksdb/util/udt_util_test.cc +33 -0
  189. package/deps/rocksdb/rocksdb/util/write_batch_util.h +5 -0
  190. package/deps/rocksdb/rocksdb/util/xxhash.h +10 -3
  191. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +13 -13
  192. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +104 -48
  193. package/deps/rocksdb/rocksdb/utilities/debug.cc +16 -4
  194. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +647 -235
  195. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -157
  196. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +144 -0
  197. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +45 -0
  198. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector_test.cc +139 -0
  199. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +12 -0
  200. package/deps/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +3 -0
  201. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +105 -6
  202. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +64 -8
  203. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -0
  204. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +43 -5
  205. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +5 -0
  206. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +154 -6
  207. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +1 -1
  208. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +158 -2
  209. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +16 -11
  210. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +4 -4
  211. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +9 -8
  212. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +2 -1
  213. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +43 -7
  214. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +2 -0
  215. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +1 -1
  216. package/package.json +1 -1
  217. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  218. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  219. package/.tap/test-results/node_modules/abstract-level/test/chained-batch-test.js.tap +0 -0
  220. package/.tap/test-results/node_modules/abstract-level/test/get-test.js.tap +0 -0
  221. package/.tap/test-results/test/abstract-level-test.js.tap +0 -1077
  222. package/.tap/test-results/test/batch-test.js.tap +0 -12
  223. package/.tap/test-results/test/chained-batch-gc-test.js.tap +0 -11
  224. package/.tap/test-results/test/cleanup-hanging-iterators-test.js.tap +0 -135
  225. package/.tap/test-results/test/clear-gc-test.js.tap +0 -13
  226. package/.tap/test-results/test/column-test.js.tap +0 -55
  227. package/.tap/test-results/test/common.js.tap +0 -0
  228. package/.tap/test-results/test/compression-test.js.tap +0 -30
  229. package/.tap/test-results/test/db-identity.js.tap +0 -12
  230. package/.tap/test-results/test/electron.js.tap +0 -0
  231. package/.tap/test-results/test/env-cleanup-hook-test.js.tap +0 -40
  232. package/.tap/test-results/test/env-cleanup-hook.js.tap +0 -0
  233. package/.tap/test-results/test/gc.js.tap +0 -0
  234. package/.tap/test-results/test/getproperty-test.js.tap +0 -29
  235. package/.tap/test-results/test/iterator-gc-test.js.tap +0 -15
  236. package/.tap/test-results/test/iterator-hwm-test.js.tap +0 -131
  237. package/.tap/test-results/test/iterator-recursion-test.js.tap +0 -12
  238. package/.tap/test-results/test/iterator-starvation-test.js.tap +0 -73
  239. package/.tap/test-results/test/iterator-test.js.tap +0 -6
  240. package/.tap/test-results/test/leak-tester-batch.js.tap +0 -0
  241. package/.tap/test-results/test/leak-tester-iterator.js.tap +0 -0
  242. package/.tap/test-results/test/leak-tester.js.tap +0 -0
  243. package/.tap/test-results/test/lock-test.js.tap +0 -18
  244. package/.tap/test-results/test/lock.js.tap +0 -0
  245. package/.tap/test-results/test/make.js.tap +0 -0
  246. package/.tap/test-results/test/max-rev-merge.js.tap +0 -0
  247. package/.tap/test-results/test/merge-operator-test.js.tap +0 -12
  248. package/.tap/test-results/test/mkdir-test.js.tap +0 -15
  249. package/.tap/test-results/test/segfault-test.js.tap +0 -76
  250. package/.tap/test-results/test/stack-blower.js.tap +0 -0
  251. package/deps/rocksdb/rocksdb/README.md +0 -29
  252. package/deps/rocksdb/rocksdb/microbench/README.md +0 -60
  253. package/deps/rocksdb/rocksdb/plugin/README.md +0 -43
  254. package/deps/rocksdb/rocksdb/port/README +0 -10
  255. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README +0 -13
@@ -173,7 +173,8 @@ struct MutableCFOptions {
173
173
  compression_per_level(options.compression_per_level),
174
174
  memtable_max_range_deletions(options.memtable_max_range_deletions),
175
175
  bottommost_file_compaction_delay(
176
- options.bottommost_file_compaction_delay) {
176
+ options.bottommost_file_compaction_delay),
177
+ uncache_aggressiveness(options.uncache_aggressiveness) {
177
178
  RefreshDerivedOptions(options.num_levels, options.compaction_style);
178
179
  }
179
180
 
@@ -223,7 +224,9 @@ struct MutableCFOptions {
223
224
  memtable_protection_bytes_per_key(0),
224
225
  block_protection_bytes_per_key(0),
225
226
  sample_for_compression(0),
226
- memtable_max_range_deletions(0) {}
227
+ memtable_max_range_deletions(0),
228
+ bottommost_file_compaction_delay(0),
229
+ uncache_aggressiveness(0) {}
227
230
 
228
231
  explicit MutableCFOptions(const Options& options);
229
232
 
@@ -319,6 +322,7 @@ struct MutableCFOptions {
319
322
  std::vector<CompressionType> compression_per_level;
320
323
  uint32_t memtable_max_range_deletions;
321
324
  uint32_t bottommost_file_compaction_delay;
325
+ uint32_t uncache_aggressiveness;
322
326
 
323
327
  // Derived options
324
328
  // Per-level target file size.
@@ -388,6 +388,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
388
388
  {offsetof(struct ImmutableDBOptions, wal_compression),
389
389
  OptionType::kCompressionType, OptionVerificationType::kNormal,
390
390
  OptionTypeFlags::kNone}},
391
+ {"background_close_inactive_wals",
392
+ {offsetof(struct ImmutableDBOptions, background_close_inactive_wals),
393
+ OptionType::kBoolean, OptionVerificationType::kNormal,
394
+ OptionTypeFlags::kNone}},
391
395
  {"seq_per_batch",
392
396
  {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
393
397
  OptionTypeFlags::kNone}},
@@ -755,6 +759,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
755
759
  two_write_queues(options.two_write_queues),
756
760
  manual_wal_flush(options.manual_wal_flush),
757
761
  wal_compression(options.wal_compression),
762
+ background_close_inactive_wals(options.background_close_inactive_wals),
758
763
  atomic_flush(options.atomic_flush),
759
764
  avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io),
760
765
  persist_stats_to_disk(options.persist_stats_to_disk),
@@ -921,6 +926,9 @@ void ImmutableDBOptions::Dump(Logger* log) const {
921
926
  manual_wal_flush);
922
927
  ROCKS_LOG_HEADER(log, " Options.wal_compression: %d",
923
928
  wal_compression);
929
+ ROCKS_LOG_HEADER(log,
930
+ " Options.background_close_inactive_wals: %d",
931
+ background_close_inactive_wals);
924
932
  ROCKS_LOG_HEADER(log, " Options.atomic_flush: %d", atomic_flush);
925
933
  ROCKS_LOG_HEADER(log,
926
934
  " Options.avoid_unnecessary_blocking_io: %d",
@@ -84,6 +84,7 @@ struct ImmutableDBOptions {
84
84
  bool two_write_queues;
85
85
  bool manual_wal_flush;
86
86
  CompressionType wal_compression;
87
+ bool background_close_inactive_wals;
87
88
  bool atomic_flush;
88
89
  bool avoid_unnecessary_blocking_io;
89
90
  bool persist_stats_to_disk;
@@ -97,17 +98,20 @@ struct ImmutableDBOptions {
97
98
  std::string db_host_id;
98
99
  FileTypeSet checksum_handoff_file_types;
99
100
  CacheTier lowest_used_cache_tier;
100
- // Convenience/Helper objects that are not part of the base DBOptions
101
- std::shared_ptr<FileSystem> fs;
102
- SystemClock* clock;
103
- Statistics* stats;
104
- Logger* logger;
105
101
  std::shared_ptr<CompactionService> compaction_service;
106
102
  bool enforce_single_del_contracts;
107
103
  uint64_t follower_refresh_catchup_period_ms;
108
104
  uint64_t follower_catchup_retry_count;
109
105
  uint64_t follower_catchup_retry_wait_ms;
110
106
 
107
+ // Beginning of convenience/helper objects that are not part of the base
108
+ // DBOptions
109
+ std::shared_ptr<FileSystem> fs;
110
+ SystemClock* clock;
111
+ Statistics* stats;
112
+ Logger* logger;
113
+ // End of convenience/helper objects.
114
+
111
115
  bool IsWalDirSameAsDBPath() const;
112
116
  bool IsWalDirSameAsDBPath(const std::string& path) const;
113
117
  const std::string& GetWalDir() const;
@@ -360,6 +360,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
360
360
  ROCKS_LOG_HEADER(log,
361
361
  "Options.compaction_options_universal.stop_style: %s",
362
362
  str_compaction_stop_style.c_str());
363
+ ROCKS_LOG_HEADER(log,
364
+ "Options.compaction_options_universal.max_read_amp: %d",
365
+ compaction_options_universal.max_read_amp);
363
366
  ROCKS_LOG_HEADER(
364
367
  log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64,
365
368
  compaction_options_fifo.max_table_files_size);
@@ -274,6 +274,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
274
274
  cf_opts->last_level_temperature = moptions.last_level_temperature;
275
275
  cf_opts->default_write_temperature = moptions.default_write_temperature;
276
276
  cf_opts->memtable_max_range_deletions = moptions.memtable_max_range_deletions;
277
+ cf_opts->uncache_aggressiveness = moptions.uncache_aggressiveness;
277
278
  }
278
279
 
279
280
  void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
@@ -353,6 +353,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
353
353
  "two_write_queues=false;"
354
354
  "manual_wal_flush=false;"
355
355
  "wal_compression=kZSTD;"
356
+ "background_close_inactive_wals=true;"
356
357
  "seq_per_batch=false;"
357
358
  "atomic_flush=false;"
358
359
  "avoid_unnecessary_blocking_io=false;"
@@ -565,7 +566,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
565
566
  "persist_user_defined_timestamps=true;"
566
567
  "block_protection_bytes_per_key=1;"
567
568
  "memtable_max_range_deletions=999999;"
568
- "bottommost_file_compaction_delay=7200;",
569
+ "bottommost_file_compaction_delay=7200;"
570
+ "uncache_aggressiveness=1234;",
569
571
  new_options));
570
572
 
571
573
  ASSERT_NE(new_options->blob_cache.get(), nullptr);
@@ -6,7 +6,7 @@
6
6
  #pragma once
7
7
 
8
8
  #if defined(__clang__) && defined(__GLIBC__)
9
- // glibc's `posix_memalign()` declaration specifies `throw()` while clang's
9
+ // glibc's `posix_memalign()` declaration specifies `noexcept` while clang's
10
10
  // declaration does not. There is a hack in clang to make its re-declaration
11
11
  // compatible with glibc's if they are declared consecutively. That hack breaks
12
12
  // if yet another `posix_memalign()` declaration comes between glibc's and
@@ -14,7 +14,7 @@
14
14
  // declarations both come before "jemalloc.h"'s `posix_memalign()` declaration.
15
15
  //
16
16
  // This problem could also be avoided if "jemalloc.h"'s `posix_memalign()`
17
- // declaration did not specify `throw()` when built with clang.
17
+ // declaration did not specify `noexcept` when built with clang.
18
18
  #include <mm_malloc.h>
19
19
  #endif
20
20
 
@@ -39,6 +39,7 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) {
39
39
  #endif // OS_OPENBSD
40
40
  #ifdef OS_FREEBSD
41
41
  #include <sys/sysctl.h>
42
+ #include <sys/wait.h>
42
43
  #endif // OS_FREEBSD
43
44
  #ifdef OS_LINUX
44
45
  #include <sys/prctl.h>
@@ -101,8 +101,9 @@ bool CondVar::TimedWait(uint64_t abs_time_us) {
101
101
  std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
102
102
 
103
103
  // Work around https://github.com/microsoft/STL/issues/369
104
- #if defined(_MSC_VER) && \
105
- (!defined(_MSVC_STL_UPDATE) || _MSVC_STL_UPDATE < 202008L)
104
+ // std::condition_variable_any::wait_for had a fix, but
105
+ // std::condition_variable still doesn't have a fix in STL yet
106
+ #if defined(_MSC_VER)
106
107
  if (relTimeUs == std::chrono::microseconds::zero()) {
107
108
  lk.unlock();
108
109
  lk.lock();
@@ -304,6 +304,7 @@ LIB_SOURCES = \
304
304
  utilities/persistent_cache/volatile_tier_impl.cc \
305
305
  utilities/simulator_cache/cache_simulator.cc \
306
306
  utilities/simulator_cache/sim_cache.cc \
307
+ utilities/table_properties_collectors/compact_for_tiering_collector.cc \
307
308
  utilities/table_properties_collectors/compact_on_deletion_collector.cc \
308
309
  utilities/trace/file_trace_reader_writer.cc \
309
310
  utilities/trace/replayer_impl.cc \
@@ -380,6 +381,7 @@ STRESS_LIB_SOURCES = \
380
381
  db_stress_tool/cf_consistency_stress.cc \
381
382
  db_stress_tool/db_stress_common.cc \
382
383
  db_stress_tool/db_stress_driver.cc \
384
+ db_stress_tool/db_stress_filters.cc \
383
385
  db_stress_tool/db_stress_gflags.cc \
384
386
  db_stress_tool/db_stress_listener.cc \
385
387
  db_stress_tool/db_stress_shared_state.cc \
@@ -405,6 +407,7 @@ TEST_LIB_SOURCES = \
405
407
  FOLLY_SOURCES = \
406
408
  $(FOLLY_DIR)/folly/container/detail/F14Table.cpp \
407
409
  $(FOLLY_DIR)/folly/detail/Futex.cpp \
410
+ $(FOLLY_DIR)/folly/lang/Exception.cpp \
408
411
  $(FOLLY_DIR)/folly/lang/SafeAssert.cpp \
409
412
  $(FOLLY_DIR)/folly/lang/ToAscii.cpp \
410
413
  $(FOLLY_DIR)/folly/ScopeGuard.cpp \
@@ -628,6 +631,7 @@ TEST_MAIN_SOURCES = \
628
631
  utilities/persistent_cache/persistent_cache_test.cc \
629
632
  utilities/simulator_cache/cache_simulator_test.cc \
630
633
  utilities/simulator_cache/sim_cache_test.cc \
634
+ utilities/table_properties_collectors/compact_for_tiering_collector_test.cc \
631
635
  utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \
632
636
  utilities/transactions/optimistic_transaction_test.cc \
633
637
  utilities/transactions/lock/range/range_locking_test.cc \
@@ -44,9 +44,8 @@ InternalIteratorBase<IndexValue>* BinarySearchIndexReader::NewIterator(
44
44
  IndexBlockIter* iter, GetContext* get_context,
45
45
  BlockCacheLookupContext* lookup_context) {
46
46
  const BlockBasedTable::Rep* rep = table()->get_rep();
47
- const bool no_io = (read_options.read_tier == kBlockCacheTier);
48
47
  CachableEntry<Block> index_block;
49
- const Status s = GetOrReadIndexBlock(no_io, get_context, lookup_context,
48
+ const Status s = GetOrReadIndexBlock(get_context, lookup_context,
50
49
  &index_block, read_options);
51
50
  if (!s.ok()) {
52
51
  if (iter != nullptr) {
@@ -582,8 +582,10 @@ struct BlockBasedTableBuilder::Rep {
582
582
  assert(factory);
583
583
 
584
584
  std::unique_ptr<InternalTblPropColl> collector{
585
- factory->CreateInternalTblPropColl(tbo.column_family_id,
586
- tbo.level_at_creation)};
585
+ factory->CreateInternalTblPropColl(
586
+ tbo.column_family_id, tbo.level_at_creation,
587
+ tbo.ioptions.num_levels,
588
+ tbo.last_level_inclusive_max_seqno_threshold)};
587
589
  if (collector) {
588
590
  table_properties_collectors.emplace_back(std::move(collector));
589
591
  }
@@ -823,6 +823,12 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
823
823
  read_options_, block_handle,
824
824
  &(block_handle_info.cachable_entry_).As<Block_kData>());
825
825
  if (!s.ok()) {
826
+ #ifndef NDEBUG
827
+ // To allow fault injection verification to pass since non-okay status in
828
+ // `BlockCacheLookupForReadAheadSize()` won't fail the read but only leads to
829
+ // less or no readahead
830
+ IGNORE_STATUS_IF_ERROR(s);
831
+ #endif
826
832
  break;
827
833
  }
828
834
 
@@ -852,6 +858,15 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
852
858
  is_index_at_curr_block_ = false;
853
859
  }
854
860
 
861
+ #ifndef NDEBUG
862
+ // To allow fault injection verification to pass since non-okay status in
863
+ // `BlockCacheLookupForReadAheadSize()` won't fail the read but only leads to less
864
+ // or no readahead
865
+ if (!index_iter_->status().ok()) {
866
+ IGNORE_STATUS_IF_ERROR(index_iter_->status());
867
+ }
868
+ #endif
869
+
855
870
  if (found_first_miss_block) {
856
871
  // Iterate cache hit block handles from the end till a Miss is there, to
857
872
  // truncate and update the end offset till that Miss.
@@ -135,7 +135,46 @@ extern const uint64_t kBlockBasedTableMagicNumber;
135
135
  extern const std::string kHashIndexPrefixesBlock;
136
136
  extern const std::string kHashIndexPrefixesMetadataBlock;
137
137
 
138
- BlockBasedTable::~BlockBasedTable() { delete rep_; }
138
+ BlockBasedTable::~BlockBasedTable() {
139
+ auto ua = rep_->uncache_aggressiveness.LoadRelaxed();
140
+ if (ua > 0 && rep_->table_options.block_cache) {
141
+ if (rep_->filter) {
142
+ rep_->filter->EraseFromCacheBeforeDestruction(ua);
143
+ }
144
+ if (rep_->index_reader) {
145
+ {
146
+ // TODO: Also uncache data blocks known after any gaps in partitioned
147
+ // index. Right now the iterator errors out as soon as there's an
148
+ // index partition not in cache.
149
+ IndexBlockIter iiter_on_stack;
150
+ ReadOptions ropts;
151
+ ropts.read_tier = kBlockCacheTier; // No I/O
152
+ auto iiter = NewIndexIterator(
153
+ ropts, /*disable_prefix_seek=*/false, &iiter_on_stack,
154
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr);
155
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
156
+ if (iiter != &iiter_on_stack) {
157
+ iiter_unique_ptr.reset(iiter);
158
+ }
159
+ // Un-cache the data blocks the index iterator will tell us about
160
+ // without I/O. (NOTE: It's extremely unlikely that a data block
161
+ // will be in block cache without the index block pointing to it
162
+ // also in block cache.)
163
+ UncacheAggressivenessAdvisor advisor(ua);
164
+ for (iiter->SeekToFirst(); iiter->Valid() && advisor.ShouldContinue();
165
+ iiter->Next()) {
166
+ bool erased = EraseFromCache(iiter->value().handle);
167
+ advisor.Report(erased);
168
+ }
169
+ iiter->status().PermitUncheckedError();
170
+ }
171
+
172
+ // Un-cache the index block(s)
173
+ rep_->index_reader->EraseFromCacheBeforeDestruction(ua);
174
+ }
175
+ }
176
+ delete rep_;
177
+ }
139
178
 
140
179
  namespace {
141
180
  // Read the block identified by "handle" from "file".
@@ -439,6 +478,7 @@ bool IsFeatureSupported(const TableProperties& table_properties,
439
478
  }
440
479
 
441
480
  // Caller has to ensure seqno is not nullptr.
481
+ // Set *seqno to the global sequence number for reading this file.
442
482
  Status GetGlobalSequenceNumber(const TableProperties& table_properties,
443
483
  SequenceNumber largest_seqno,
444
484
  SequenceNumber* seqno) {
@@ -461,12 +501,17 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties,
461
501
  }
462
502
 
463
503
  uint32_t version = DecodeFixed32(version_pos->second.c_str());
464
- if (version < 2) {
465
- if (seqno_pos != props.end() || version != 1) {
466
- std::array<char, 200> msg_buf;
504
+ if (version != 2) {
505
+ std::array<char, 200> msg_buf;
506
+ if (version != 1) {
507
+ snprintf(msg_buf.data(), msg_buf.max_size(),
508
+ "An external sst file has corrupted version %u.", version);
509
+ return Status::Corruption(msg_buf.data());
510
+ }
511
+ if (seqno_pos != props.end()) {
467
512
  // This is a v1 external sst file, global_seqno is not supported.
468
513
  snprintf(msg_buf.data(), msg_buf.max_size(),
469
- "An external sst file with version %u have global seqno "
514
+ "An external sst file with version %u has global seqno "
470
515
  "property with value %s",
471
516
  version, seqno_pos->second.c_str());
472
517
  return Status::Corruption(msg_buf.data());
@@ -594,6 +639,8 @@ Status BlockBasedTable::Open(
594
639
 
595
640
  // From read_options, retain deadline, io_timeout, rate_limiter_priority, and
596
641
  // verify_checksums. In future, we may retain more options.
642
+ // TODO: audit more ReadOptions and do this in a way that brings attention
643
+ // to new ReadOptions?
597
644
  ReadOptions ro;
598
645
  ro.deadline = read_options.deadline;
599
646
  ro.io_timeout = read_options.io_timeout;
@@ -844,6 +891,10 @@ Status BlockBasedTable::PrefetchTail(
844
891
  if (tail_size != 0) {
845
892
  tail_prefetch_size = tail_size;
846
893
  } else {
894
+ // Fallback for SST files, for which tail size is not recorded in the
895
+ // manifest. Eventually, this fallback might be removed, so it's
896
+ // better to make sure that such SST files get compacted.
897
+ // See https://github.com/facebook/rocksdb/issues/12664
847
898
  if (tail_prefetch_stats != nullptr) {
848
899
  // Multiple threads may get a 0 (no history) when running in parallel,
849
900
  // but it will get cleared after the first of them finishes.
@@ -858,14 +909,15 @@ Status BlockBasedTable::PrefetchTail(
858
909
  // properties, at which point we don't yet know the index type.
859
910
  tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
860
911
 
861
- ROCKS_LOG_WARN(logger,
862
- "Tail prefetch size %zu is calculated based on heuristics",
863
- tail_prefetch_size);
864
- } else {
865
912
  ROCKS_LOG_WARN(
866
913
  logger,
867
- "Tail prefetch size %zu is calculated based on TailPrefetchStats",
868
- tail_prefetch_size);
914
+ "[%s] Tail prefetch size %zu is calculated based on heuristics.",
915
+ file->file_name().c_str(), tail_prefetch_size);
916
+ } else {
917
+ ROCKS_LOG_WARN(logger,
918
+ "[%s] Tail prefetch size %zu is calculated based on "
919
+ "TailPrefetchStats.",
920
+ file->file_name().c_str(), tail_prefetch_size);
869
921
  }
870
922
  }
871
923
  size_t prefetch_off;
@@ -1521,9 +1573,8 @@ Status BlockBasedTable::LookupAndPinBlocksInCache(
1521
1573
  Status s;
1522
1574
  CachableEntry<UncompressionDict> uncompression_dict;
1523
1575
  if (rep_->uncompression_dict_reader) {
1524
- const bool no_io = (ro.read_tier == kBlockCacheTier);
1525
1576
  s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
1526
- /* prefetch_buffer= */ nullptr, ro, no_io, ro.verify_checksums,
1577
+ /* prefetch_buffer= */ nullptr, ro,
1527
1578
  /* get_context= */ nullptr, /* lookup_context= */ nullptr,
1528
1579
  &uncompression_dict);
1529
1580
  if (!s.ok()) {
@@ -1978,14 +2029,11 @@ bool BlockBasedTable::PrefixRangeMayMatch(
1978
2029
  FilterBlockReader* const filter = rep_->filter.get();
1979
2030
  *filter_checked = false;
1980
2031
  if (filter != nullptr) {
1981
- const bool no_io = read_options.read_tier == kBlockCacheTier;
1982
-
1983
2032
  const Slice* const const_ikey_ptr = &internal_key;
1984
2033
  may_match = filter->RangeMayExist(
1985
2034
  read_options.iterate_upper_bound, user_key_without_ts, prefix_extractor,
1986
2035
  rep_->internal_comparator.user_comparator(), const_ikey_ptr,
1987
- filter_checked, need_upper_bound_check, no_io, lookup_context,
1988
- read_options);
2036
+ filter_checked, need_upper_bound_check, lookup_context, read_options);
1989
2037
  }
1990
2038
 
1991
2039
  return may_match;
@@ -2065,7 +2113,7 @@ FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
2065
2113
  }
2066
2114
 
2067
2115
  bool BlockBasedTable::FullFilterKeyMayMatch(
2068
- FilterBlockReader* filter, const Slice& internal_key, const bool no_io,
2116
+ FilterBlockReader* filter, const Slice& internal_key,
2069
2117
  const SliceTransform* prefix_extractor, GetContext* get_context,
2070
2118
  BlockCacheLookupContext* lookup_context,
2071
2119
  const ReadOptions& read_options) const {
@@ -2078,7 +2126,7 @@ bool BlockBasedTable::FullFilterKeyMayMatch(
2078
2126
  size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size();
2079
2127
  Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
2080
2128
  if (rep_->whole_key_filtering) {
2081
- may_match = filter->KeyMayMatch(user_key_without_ts, no_io, const_ikey_ptr,
2129
+ may_match = filter->KeyMayMatch(user_key_without_ts, const_ikey_ptr,
2082
2130
  get_context, lookup_context, read_options);
2083
2131
  if (may_match) {
2084
2132
  RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE);
@@ -2092,7 +2140,7 @@ bool BlockBasedTable::FullFilterKeyMayMatch(
2092
2140
  // FIXME ^^^: there should be no reason for Get() to depend on current
2093
2141
  // prefix_extractor at all. It should always use table_prefix_extractor.
2094
2142
  may_match = filter->PrefixMayMatch(
2095
- prefix_extractor->Transform(user_key_without_ts), no_io, const_ikey_ptr,
2143
+ prefix_extractor->Transform(user_key_without_ts), const_ikey_ptr,
2096
2144
  get_context, lookup_context, read_options);
2097
2145
  RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED);
2098
2146
  if (may_match) {
@@ -2108,7 +2156,7 @@ bool BlockBasedTable::FullFilterKeyMayMatch(
2108
2156
  }
2109
2157
 
2110
2158
  void BlockBasedTable::FullFilterKeysMayMatch(
2111
- FilterBlockReader* filter, MultiGetRange* range, const bool no_io,
2159
+ FilterBlockReader* filter, MultiGetRange* range,
2112
2160
  const SliceTransform* prefix_extractor,
2113
2161
  BlockCacheLookupContext* lookup_context,
2114
2162
  const ReadOptions& read_options) const {
@@ -2118,7 +2166,7 @@ void BlockBasedTable::FullFilterKeysMayMatch(
2118
2166
  uint64_t before_keys = range->KeysLeft();
2119
2167
  assert(before_keys > 0); // Caller should ensure
2120
2168
  if (rep_->whole_key_filtering) {
2121
- filter->KeysMayMatch(range, no_io, lookup_context, read_options);
2169
+ filter->KeysMayMatch(range, lookup_context, read_options);
2122
2170
  uint64_t after_keys = range->KeysLeft();
2123
2171
  if (after_keys) {
2124
2172
  RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys);
@@ -2134,7 +2182,7 @@ void BlockBasedTable::FullFilterKeysMayMatch(
2134
2182
  } else if (!PrefixExtractorChanged(prefix_extractor)) {
2135
2183
  // FIXME ^^^: there should be no reason for MultiGet() to depend on current
2136
2184
  // prefix_extractor at all. It should always use table_prefix_extractor.
2137
- filter->PrefixesMayMatch(range, prefix_extractor, false, lookup_context,
2185
+ filter->PrefixesMayMatch(range, prefix_extractor, lookup_context,
2138
2186
  read_options);
2139
2187
  RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys);
2140
2188
  uint64_t after_keys = range->KeysLeft();
@@ -2240,7 +2288,6 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
2240
2288
  assert(key.size() >= 8); // key must be internal key
2241
2289
  assert(get_context != nullptr);
2242
2290
  Status s;
2243
- const bool no_io = read_options.read_tier == kBlockCacheTier;
2244
2291
 
2245
2292
  FilterBlockReader* const filter =
2246
2293
  !skip_filters ? rep_->filter.get() : nullptr;
@@ -2259,7 +2306,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
2259
2306
  }
2260
2307
  TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch");
2261
2308
  const bool may_match =
2262
- FullFilterKeyMayMatch(filter, key, no_io, prefix_extractor, get_context,
2309
+ FullFilterKeyMayMatch(filter, key, prefix_extractor, get_context,
2263
2310
  &lookup_context, read_options);
2264
2311
  TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch");
2265
2312
  if (may_match) {
@@ -2309,7 +2356,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
2309
2356
  /*for_compaction=*/false, /*async_read=*/false, tmp_status,
2310
2357
  /*use_block_cache_for_lookup=*/true);
2311
2358
 
2312
- if (no_io && biter.status().IsIncomplete()) {
2359
+ if (read_options.read_tier == kBlockCacheTier &&
2360
+ biter.status().IsIncomplete()) {
2313
2361
  // couldn't get block from block_cache
2314
2362
  // Update Saver.state to Found because we are only looking for
2315
2363
  // whether we can guarantee the key is not there when read_tier is kBlockCacheTier
@@ -2421,7 +2469,6 @@ Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options,
2421
2469
 
2422
2470
  // First check the full filter
2423
2471
  // If full filter not useful, Then go into each block
2424
- const bool no_io = read_options.read_tier == kBlockCacheTier;
2425
2472
  uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
2426
2473
  if (mget_range->begin()->get_context) {
2427
2474
  tracing_mget_id = mget_range->begin()->get_context->get_tracing_get_id();
@@ -2429,8 +2476,8 @@ Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options,
2429
2476
  BlockCacheLookupContext lookup_context{
2430
2477
  TableReaderCaller::kUserMultiGet, tracing_mget_id,
2431
2478
  /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
2432
- FullFilterKeysMayMatch(filter, mget_range, no_io, prefix_extractor,
2433
- &lookup_context, read_options);
2479
+ FullFilterKeysMayMatch(filter, mget_range, prefix_extractor, &lookup_context,
2480
+ read_options);
2434
2481
 
2435
2482
  return Status::OK();
2436
2483
  }
@@ -2663,6 +2710,24 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks(
2663
2710
  return s;
2664
2711
  }
2665
2712
 
2713
+ bool BlockBasedTable::EraseFromCache(const BlockHandle& handle) const {
2714
+ assert(rep_ != nullptr);
2715
+
2716
+ Cache* const cache = rep_->table_options.block_cache.get();
2717
+ if (cache == nullptr) {
2718
+ return false;
2719
+ }
2720
+
2721
+ CacheKey key = GetCacheKey(rep_->base_cache_key, handle);
2722
+
2723
+ Cache::Handle* const cache_handle = cache->Lookup(key.AsSlice());
2724
+ if (cache_handle == nullptr) {
2725
+ return false;
2726
+ }
2727
+
2728
+ return cache->Release(cache_handle, /*erase_if_last_ref=*/true);
2729
+ }
2730
+
2666
2731
  bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
2667
2732
  assert(rep_ != nullptr);
2668
2733
 
@@ -2796,11 +2861,8 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const ReadOptions& read_options,
2796
2861
 
2797
2862
  BlockCacheLookupContext context(caller);
2798
2863
  IndexBlockIter iiter_on_stack;
2799
- ReadOptions ro;
2800
- ro.total_order_seek = true;
2801
- ro.io_activity = read_options.io_activity;
2802
2864
  auto index_iter =
2803
- NewIndexIterator(ro, /*disable_prefix_seek=*/true,
2865
+ NewIndexIterator(read_options, /*disable_prefix_seek=*/true,
2804
2866
  /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
2805
2867
  /*lookup_context=*/&context);
2806
2868
  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
@@ -2843,11 +2905,8 @@ uint64_t BlockBasedTable::ApproximateSize(const ReadOptions& read_options,
2843
2905
 
2844
2906
  BlockCacheLookupContext context(caller);
2845
2907
  IndexBlockIter iiter_on_stack;
2846
- ReadOptions ro;
2847
- ro.total_order_seek = true;
2848
- ro.io_activity = read_options.io_activity;
2849
2908
  auto index_iter =
2850
- NewIndexIterator(ro, /*disable_prefix_seek=*/true,
2909
+ NewIndexIterator(read_options, /*disable_prefix_seek=*/true,
2851
2910
  /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
2852
2911
  /*lookup_context=*/&context);
2853
2912
  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
@@ -3023,10 +3082,8 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
3023
3082
  if (rep_->uncompression_dict_reader) {
3024
3083
  CachableEntry<UncompressionDict> uncompression_dict;
3025
3084
  s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
3026
- nullptr /* prefetch_buffer */, ro, false /* no_io */,
3027
- false, /* verify_checksums */
3028
- nullptr /* get_context */, nullptr /* lookup_context */,
3029
- &uncompression_dict);
3085
+ nullptr /* prefetch_buffer */, ro, nullptr /* get_context */,
3086
+ nullptr /* lookup_context */, &uncompression_dict);
3030
3087
  if (!s.ok()) {
3031
3088
  return s;
3032
3089
  }
@@ -3232,4 +3289,8 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
3232
3289
  out_stream << " ------\n";
3233
3290
  }
3234
3291
 
3292
+ void BlockBasedTable::MarkObsolete(uint32_t uncache_aggressiveness) {
3293
+ rep_->uncache_aggressiveness.StoreRelaxed(uncache_aggressiveness);
3294
+ }
3295
+
3235
3296
  } // namespace ROCKSDB_NAMESPACE
@@ -33,6 +33,7 @@
33
33
  #include "table/table_reader.h"
34
34
  #include "table/two_level_iterator.h"
35
35
  #include "trace_replay/block_cache_tracer.h"
36
+ #include "util/atomic.h"
36
37
  #include "util/coro_utils.h"
37
38
  #include "util/hash_containers.h"
38
39
 
@@ -183,6 +184,8 @@ class BlockBasedTable : public TableReader {
183
184
  Status ApproximateKeyAnchors(const ReadOptions& read_options,
184
185
  std::vector<Anchor>& anchors) override;
185
186
 
187
+ bool EraseFromCache(const BlockHandle& handle) const;
188
+
186
189
  bool TEST_BlockInCache(const BlockHandle& handle) const;
187
190
 
188
191
  // Returns true if the block for the specified key is in cache.
@@ -208,6 +211,8 @@ class BlockBasedTable : public TableReader {
208
211
  Status VerifyChecksum(const ReadOptions& readOptions,
209
212
  TableReaderCaller caller) override;
210
213
 
214
+ void MarkObsolete(uint32_t uncache_aggressiveness) override;
215
+
211
216
  ~BlockBasedTable();
212
217
 
213
218
  bool TEST_FilterBlockInCache() const;
@@ -241,6 +246,8 @@ class BlockBasedTable : public TableReader {
241
246
  FilePrefetchBuffer* /* tail_prefetch_buffer */) {
242
247
  return Status::OK();
243
248
  }
249
+ virtual void EraseFromCacheBeforeDestruction(
250
+ uint32_t /*uncache_aggressiveness*/) {}
244
251
  };
245
252
 
246
253
  class IndexReaderCommon;
@@ -462,14 +469,12 @@ class BlockBasedTable : public TableReader {
462
469
  std::unique_ptr<IndexReader>* index_reader);
463
470
 
464
471
  bool FullFilterKeyMayMatch(FilterBlockReader* filter, const Slice& user_key,
465
- const bool no_io,
466
472
  const SliceTransform* prefix_extractor,
467
473
  GetContext* get_context,
468
474
  BlockCacheLookupContext* lookup_context,
469
475
  const ReadOptions& read_options) const;
470
476
 
471
477
  void FullFilterKeysMayMatch(FilterBlockReader* filter, MultiGetRange* range,
472
- const bool no_io,
473
478
  const SliceTransform* prefix_extractor,
474
479
  BlockCacheLookupContext* lookup_context,
475
480
  const ReadOptions& read_options) const;
@@ -619,11 +624,7 @@ struct BlockBasedTable::Rep {
619
624
 
620
625
  std::shared_ptr<FragmentedRangeTombstoneList> fragmented_range_dels;
621
626
 
622
- // FIXME
623
- // If true, data blocks in this file are definitely ZSTD compressed. If false
624
- // they might not be. When false we skip creating a ZSTD digested
625
- // uncompression dictionary. Even if we get a false negative, things should
626
- // still work, just not as quickly.
627
+ // Context for block cache CreateCallback
627
628
  BlockCreateContext create_context;
628
629
 
629
630
  // If global_seqno is used, all Keys in this file will have the same
@@ -672,6 +673,13 @@ struct BlockBasedTable::Rep {
672
673
  // `end_key` for range deletion entries.
673
674
  const bool user_defined_timestamps_persisted;
674
675
 
676
+ // Set to >0 when the file is known to be obsolete and should have its block
677
+ // cache entries evicted on close. NOTE: when the file becomes obsolete,
678
+ // there could be multiple table cache references that all mark this file as
679
+ // obsolete. An atomic resolves the race quite reasonably. Even in the rare
680
+ // case of such a race, they will most likely be storing the same value.
681
+ RelaxedAtomic<uint32_t> uncache_aggressiveness{0};
682
+
675
683
  std::unique_ptr<CacheReservationManager::CacheReservationHandle>
676
684
  table_reader_cache_res_handle = nullptr;
677
685