@nxtedition/rocksdb 8.2.0 → 8.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. package/binding.cc +3 -3
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +16 -52
  3. package/deps/rocksdb/rocksdb/Makefile +10 -5
  4. package/deps/rocksdb/rocksdb/TARGETS +8 -345
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +92 -0
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +32 -32
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +12 -9
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +6 -43
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +3 -13
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +8 -5
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +21 -47
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.h +3 -8
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -1
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +1 -2
  15. package/deps/rocksdb/rocksdb/cache/sharded_cache.cc +44 -7
  16. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +13 -14
  17. package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +1 -0
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +2 -2
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +2 -1
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +17 -8
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +40 -21
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  24. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +41 -42
  25. package/deps/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc +1 -1
  26. package/deps/rocksdb/rocksdb/db/blob/blob_log_writer.cc +1 -1
  27. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +5 -4
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +2 -2
  29. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +5 -3
  30. package/deps/rocksdb/rocksdb/db/builder.cc +7 -6
  31. package/deps/rocksdb/rocksdb/db/builder.h +2 -2
  32. package/deps/rocksdb/rocksdb/db/c.cc +76 -5
  33. package/deps/rocksdb/rocksdb/db/c_test.c +141 -0
  34. package/deps/rocksdb/rocksdb/db/column_family.cc +32 -0
  35. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +3 -2
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +5 -0
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +8 -5
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +12 -10
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +21 -17
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +2 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -7
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +3 -1
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +1 -1
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +77 -50
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +4 -5
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +55 -8
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +142 -56
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +1 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +1 -2
  50. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +21 -20
  51. package/deps/rocksdb/rocksdb/db/convenience.cc +8 -6
  52. package/deps/rocksdb/rocksdb/db/corruption_test.cc +5 -4
  53. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +6 -3
  54. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +260 -220
  55. package/deps/rocksdb/rocksdb/db/db_clip_test.cc +142 -0
  56. package/deps/rocksdb/rocksdb/db/db_compaction_filter_test.cc +1 -1
  57. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +333 -27
  58. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +5 -0
  59. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +7 -0
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +189 -27
  61. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +23 -10
  62. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +134 -90
  63. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +2 -2
  64. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +5 -3
  65. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +5 -1
  66. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +124 -16
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +10 -0
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +7 -0
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +15 -0
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +11 -5
  71. package/deps/rocksdb/rocksdb/db/db_iter.cc +7 -8
  72. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +54 -3
  73. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +42 -0
  74. package/deps/rocksdb/rocksdb/db/db_options_test.cc +116 -1
  75. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +3 -2
  76. package/deps/rocksdb/rocksdb/db/db_rate_limiter_test.cc +3 -2
  77. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +9 -8
  78. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +142 -63
  79. package/deps/rocksdb/rocksdb/db/db_test.cc +28 -7
  80. package/deps/rocksdb/rocksdb/db/db_test2.cc +71 -131
  81. package/deps/rocksdb/rocksdb/db/db_test_util.cc +18 -0
  82. package/deps/rocksdb/rocksdb/db/db_test_util.h +6 -0
  83. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +10 -10
  84. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +25 -0
  85. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +88 -0
  86. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +67 -0
  87. package/deps/rocksdb/rocksdb/db/db_write_test.cc +5 -0
  88. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +4 -4
  89. package/deps/rocksdb/rocksdb/db/experimental.cc +4 -2
  90. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +86 -1
  91. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +15 -2
  92. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +1 -2
  93. package/deps/rocksdb/rocksdb/db/flush_job.cc +21 -14
  94. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +14 -7
  95. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +31 -8
  96. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +21 -19
  97. package/deps/rocksdb/rocksdb/db/internal_stats.cc +42 -12
  98. package/deps/rocksdb/rocksdb/db/internal_stats.h +1 -0
  99. package/deps/rocksdb/rocksdb/db/kv_checksum.h +92 -6
  100. package/deps/rocksdb/rocksdb/db/listener_test.cc +2 -2
  101. package/deps/rocksdb/rocksdb/db/log_format.h +8 -4
  102. package/deps/rocksdb/rocksdb/db/log_reader.cc +129 -51
  103. package/deps/rocksdb/rocksdb/db/log_reader.h +16 -0
  104. package/deps/rocksdb/rocksdb/db/log_test.cc +125 -4
  105. package/deps/rocksdb/rocksdb/db/log_writer.cc +32 -2
  106. package/deps/rocksdb/rocksdb/db/log_writer.h +16 -0
  107. package/deps/rocksdb/rocksdb/db/memtable.cc +17 -46
  108. package/deps/rocksdb/rocksdb/db/memtable.h +1 -1
  109. package/deps/rocksdb/rocksdb/db/memtable_list.cc +8 -4
  110. package/deps/rocksdb/rocksdb/db/merge_helper.cc +1 -1
  111. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +2 -1
  112. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +5 -4
  113. package/deps/rocksdb/rocksdb/db/repair.cc +38 -11
  114. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +3 -3
  115. package/deps/rocksdb/rocksdb/db/table_cache.cc +68 -51
  116. package/deps/rocksdb/rocksdb/db/table_cache.h +20 -10
  117. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +2 -1
  118. package/deps/rocksdb/rocksdb/db/table_properties_collector_test.cc +6 -3
  119. package/deps/rocksdb/rocksdb/db/version_builder.cc +9 -5
  120. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  121. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +140 -120
  122. package/deps/rocksdb/rocksdb/db/version_edit.cc +14 -0
  123. package/deps/rocksdb/rocksdb/db/version_edit.h +12 -4
  124. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +21 -13
  125. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +26 -16
  126. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +9 -9
  127. package/deps/rocksdb/rocksdb/db/version_set.cc +292 -96
  128. package/deps/rocksdb/rocksdb/db/version_set.h +53 -28
  129. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -0
  130. package/deps/rocksdb/rocksdb/db/version_set_test.cc +62 -22
  131. package/deps/rocksdb/rocksdb/db/version_util.h +5 -4
  132. package/deps/rocksdb/rocksdb/db/write_batch.cc +3 -1
  133. package/deps/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt +1 -0
  134. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +119 -27
  135. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +123 -0
  136. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +4 -0
  137. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +7 -2
  138. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +34 -0
  139. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +13 -0
  140. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +43 -33
  141. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +29 -17
  142. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +5 -0
  143. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +6 -1
  144. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +85 -50
  145. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +96 -54
  146. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.cc +122 -0
  147. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +206 -0
  148. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +9 -1
  149. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +9 -3
  150. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +322 -92
  151. package/deps/rocksdb/rocksdb/env/env_posix.cc +12 -8
  152. package/deps/rocksdb/rocksdb/env/env_test.cc +31 -0
  153. package/deps/rocksdb/rocksdb/env/mock_env.cc +1 -1
  154. package/deps/rocksdb/rocksdb/env/unique_id_gen.h +14 -0
  155. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +1 -1
  156. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +5 -1
  157. package/deps/rocksdb/rocksdb/file/file_util.cc +3 -3
  158. package/deps/rocksdb/rocksdb/file/file_util.h +2 -0
  159. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +89 -0
  160. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +22 -7
  161. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +3 -2
  162. package/deps/rocksdb/rocksdb/file/readahead_raf.cc +1 -1
  163. package/deps/rocksdb/rocksdb/file/sequence_file_reader.cc +1 -1
  164. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +1 -1
  165. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +3 -0
  166. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +154 -74
  167. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +27 -7
  168. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +107 -28
  169. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +19 -0
  170. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +8 -0
  171. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +2 -0
  172. package/deps/rocksdb/rocksdb/include/rocksdb/memory_allocator.h +7 -1
  173. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +137 -152
  174. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +61 -26
  175. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +30 -26
  176. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +33 -16
  177. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +87 -8
  178. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +1 -1
  179. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +5 -0
  180. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +1 -0
  181. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h +1 -0
  182. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -0
  183. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +0 -1
  184. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  185. package/deps/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h +9 -2
  186. package/deps/rocksdb/rocksdb/logging/env_logger.h +2 -0
  187. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc +78 -42
  188. package/deps/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h +14 -9
  189. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +1 -0
  190. package/deps/rocksdb/rocksdb/memtable/skiplist_test.cc +1 -0
  191. package/deps/rocksdb/rocksdb/memtable/write_buffer_manager.cc +4 -9
  192. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +19 -11
  193. package/deps/rocksdb/rocksdb/monitoring/instrumented_mutex.h +1 -1
  194. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +211 -555
  195. package/deps/rocksdb/rocksdb/monitoring/perf_step_timer.h +1 -1
  196. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +36 -2
  197. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +17 -7
  198. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.h +10 -7
  199. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +19 -18
  200. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.h +10 -2
  201. package/deps/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc +14 -0
  202. package/deps/rocksdb/rocksdb/options/cf_options.cc +35 -2
  203. package/deps/rocksdb/rocksdb/options/cf_options.h +5 -0
  204. package/deps/rocksdb/rocksdb/options/customizable_test.cc +1 -1
  205. package/deps/rocksdb/rocksdb/options/options.cc +12 -53
  206. package/deps/rocksdb/rocksdb/options/options_helper.cc +4 -0
  207. package/deps/rocksdb/rocksdb/options/options_parser.cc +11 -0
  208. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +32 -4
  209. package/deps/rocksdb/rocksdb/options/options_test.cc +89 -5
  210. package/deps/rocksdb/rocksdb/port/lang.h +27 -0
  211. package/deps/rocksdb/rocksdb/port/stack_trace.cc +67 -24
  212. package/deps/rocksdb/rocksdb/src.mk +2 -0
  213. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -3
  214. package/deps/rocksdb/rocksdb/table/block_based/block.cc +195 -35
  215. package/deps/rocksdb/rocksdb/table/block_based/block.h +197 -24
  216. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +71 -51
  217. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +7 -1
  218. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +4 -6
  219. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +3 -0
  220. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +43 -2
  221. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +36 -6
  222. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +266 -166
  223. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +44 -14
  224. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +1 -1
  225. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +63 -56
  226. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +8 -2
  227. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +4 -2
  228. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +10 -0
  229. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +14 -2
  230. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +918 -2
  231. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +3 -2
  232. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +10 -9
  233. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +6 -8
  234. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.h +2 -2
  235. package/deps/rocksdb/rocksdb/table/block_based/flush_block_policy.cc +1 -1
  236. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +18 -23
  237. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +8 -8
  238. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +16 -32
  239. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +7 -8
  240. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +4 -5
  241. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.h +3 -3
  242. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +46 -53
  243. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +12 -12
  244. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +7 -9
  245. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +26 -23
  246. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h +2 -1
  247. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +3 -0
  248. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +4 -2
  249. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +3 -2
  250. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +7 -1
  251. package/deps/rocksdb/rocksdb/table/block_fetcher.h +1 -1
  252. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +2 -1
  253. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc +3 -2
  254. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc +5 -2
  255. package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h +4 -2
  256. package/deps/rocksdb/rocksdb/table/format.cc +4 -4
  257. package/deps/rocksdb/rocksdb/table/format.h +1 -1
  258. package/deps/rocksdb/rocksdb/table/get_context.cc +1 -1
  259. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +33 -22
  260. package/deps/rocksdb/rocksdb/table/meta_blocks.h +4 -0
  261. package/deps/rocksdb/rocksdb/table/mock_table.cc +4 -2
  262. package/deps/rocksdb/rocksdb/table/persistent_cache_helper.h +1 -1
  263. package/deps/rocksdb/rocksdb/table/persistent_cache_options.h +1 -1
  264. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +18 -10
  265. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.h +4 -3
  266. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +10 -7
  267. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +4 -2
  268. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +11 -0
  269. package/deps/rocksdb/rocksdb/table/table_builder.h +14 -5
  270. package/deps/rocksdb/rocksdb/table/table_properties.cc +2 -0
  271. package/deps/rocksdb/rocksdb/table/table_reader.h +6 -3
  272. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +1 -1
  273. package/deps/rocksdb/rocksdb/table/table_test.cc +291 -34
  274. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +3 -1
  275. package/deps/rocksdb/rocksdb/test_util/testharness.h +5 -0
  276. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -2
  277. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +33 -17
  278. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +3 -1
  279. package/deps/rocksdb/rocksdb/util/bloom_impl.h +2 -2
  280. package/deps/rocksdb/rocksdb/util/compression.h +1 -1
  281. package/deps/rocksdb/rocksdb/util/crc32c.cc +24 -83
  282. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +7 -9
  283. package/deps/rocksdb/rocksdb/util/file_checksum_helper.cc +4 -1
  284. package/deps/rocksdb/rocksdb/util/filter_bench.cc +1 -1
  285. package/deps/rocksdb/rocksdb/util/gflags_compat.h +9 -10
  286. package/deps/rocksdb/rocksdb/util/math.h +12 -7
  287. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +16 -18
  288. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +46 -2
  289. package/deps/rocksdb/rocksdb/util/ribbon_test.cc +6 -6
  290. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +12 -7
  291. package/deps/rocksdb/rocksdb/util/stop_watch.h +31 -13
  292. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +2 -0
  293. package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -1
  294. package/deps/rocksdb/rocksdb/util/udt_util.h +77 -0
  295. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge.cc +2 -2
  296. package/deps/rocksdb/rocksdb/utilities/agg_merge/agg_merge_test.cc +1 -1
  297. package/deps/rocksdb/rocksdb/utilities/agg_merge/test_agg_merge.cc +1 -1
  298. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +1 -1
  299. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +1 -1
  300. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -1
  301. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +11 -1
  302. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +34 -1
  303. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +15 -0
  304. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +1 -1
  305. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +5 -1
  306. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +29 -1
  307. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +0 -1
  308. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -1
  309. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +6 -1
  310. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc +10 -0
  311. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +6 -1
  312. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +5 -0
  313. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -0
  314. package/package.json +1 -1
  315. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  316. package/prebuilds/linux-x64/node.napi.node +0 -0
  317. /package/deps/rocksdb/rocksdb/memory/{memory_allocator.h → memory_allocator_impl.h} +0 -0
  318. /package/deps/rocksdb/rocksdb/monitoring/{statistics.h → statistics_impl.h} +0 -0
  319. /package/deps/rocksdb/rocksdb/table/block_based/{flush_block_policy.h → flush_block_policy_impl.h} +0 -0
  320. /package/deps/rocksdb/rocksdb/util/{rate_limiter.h → rate_limiter_impl.h} +0 -0
  321. /package/deps/rocksdb/rocksdb/utilities/agg_merge/{agg_merge.h → agg_merge_impl.h} +0 -0
@@ -14,6 +14,7 @@
14
14
  #include <string>
15
15
  #include <vector>
16
16
 
17
+ #include "db/kv_checksum.h"
17
18
  #include "db/pinned_iterators_manager.h"
18
19
  #include "port/malloc.h"
19
20
  #include "rocksdb/advanced_cache.h"
@@ -240,6 +241,34 @@ class Block {
240
241
  // For TypedCacheInterface
241
242
  const Slice& ContentSlice() const { return contents_.data; }
242
243
 
244
+ // Initializes per key-value checksum protection.
245
+ // After this method is called, each DataBlockIterator returned
246
+ // by NewDataIterator will verify per key-value checksum for any key it reads.
247
+ void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
248
+ const Comparator* raw_ucmp);
249
+
250
+ // Initializes per key-value checksum protection.
251
+ // After this method is called, each IndexBlockIterator returned
252
+ // by NewIndexIterator will verify per key-value checksum for any key it reads.
253
+ // value_is_full and index_has_first_key are needed to be able to parse
254
+ // the index block content and construct checksums.
255
+ void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
256
+ const Comparator* raw_ucmp,
257
+ bool value_is_full,
258
+ bool index_has_first_key);
259
+
260
+ // Initializes per key-value checksum protection.
261
+ // After this method is called, each MetaBlockIter returned
262
+ // by NewMetaIterator will verify per key-value checksum for any key it reads.
263
+ void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key);
264
+
265
+ static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len,
266
+ const Slice& key, const Slice& value) {
267
+ ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr);
268
+ }
269
+
270
+ const char* TEST_GetKVChecksum() const { return kv_checksum_; }
271
+
243
272
  private:
244
273
  BlockContents contents_;
245
274
  const char* data_; // contents_.data.data()
@@ -247,6 +276,11 @@ class Block {
247
276
  uint32_t restart_offset_; // Offset in data_ of restart array
248
277
  uint32_t num_restarts_;
249
278
  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
279
+ char* kv_checksum_{nullptr};
280
+ uint32_t checksum_size_{0};
281
+ // Used by block iterators to calculate current key index within a block
282
+ uint32_t block_restart_interval_{0};
283
+ uint8_t protection_bytes_per_key_{0};
250
284
  DataBlockHashIndex data_block_hash_index_;
251
285
  };
252
286
 
@@ -269,6 +303,14 @@ class Block {
269
303
  // `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These
270
304
  // "Impl" functions are responsible for positioning `raw_key_` but not
271
305
  // invoking `UpdateKey()`.
306
+ //
307
+ // Per key-value checksum is enabled if relevant states are passed in during
308
+ // `InitializeBase()`. The checksum verification is done in each call to
309
+ // UpdateKey() for the current key. Each subclass is responsible for keeping
310
+ // track of cur_entry_idx_, the index of the current key within the block.
311
+ // BlockIter uses this index to get the corresponding checksum for current key.
312
+ // Additional checksum verification may be done in subclasses if they read keys
313
+ // other than the key being processed in UpdateKey().
272
314
  template <class TValue>
273
315
  class BlockIter : public InternalIteratorBase<TValue> {
274
316
  public:
@@ -286,9 +328,16 @@ class BlockIter : public InternalIteratorBase<TValue> {
286
328
  Cleanable::Reset();
287
329
  }
288
330
 
289
- bool Valid() const override { return current_ < restarts_; }
331
+ bool Valid() const override {
332
+ // When status_ is not ok, iter should be invalid.
333
+ assert(status_.ok() || current_ >= restarts_);
334
+ return current_ < restarts_;
335
+ }
290
336
 
291
337
  virtual void SeekToFirst() override final {
338
+ #ifndef NDEBUG
339
+ if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return;
340
+ #endif
292
341
  SeekToFirstImpl();
293
342
  UpdateKey();
294
343
  }
@@ -325,6 +374,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
325
374
  }
326
375
 
327
376
  Status status() const override { return status_; }
377
+
328
378
  Slice key() const override {
329
379
  assert(Valid());
330
380
  return key_;
@@ -337,10 +387,22 @@ class BlockIter : public InternalIteratorBase<TValue> {
337
387
  (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
338
388
  status_.PermitUncheckedError();
339
389
  }
390
+
340
391
  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
341
392
  pinned_iters_mgr_ = pinned_iters_mgr;
342
393
  }
394
+
343
395
  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
396
+
397
+ bool TEST_Corrupt_Callback(const std::string& sync_point) {
398
+ bool corrupt = false;
399
+ TEST_SYNC_POINT_CALLBACK(sync_point, static_cast<void*>(&corrupt));
400
+
401
+ if (corrupt) {
402
+ CorruptionError();
403
+ }
404
+ return corrupt;
405
+ }
344
406
  #endif
345
407
 
346
408
  bool IsKeyPinned() const override {
@@ -377,27 +439,74 @@ class BlockIter : public InternalIteratorBase<TValue> {
377
439
  Status status_;
378
440
  // Key to be exposed to users.
379
441
  Slice key_;
442
+ SequenceNumber global_seqno_;
443
+
444
+ // Per key-value checksum related states
445
+ const char* kv_checksum_;
446
+ int32_t cur_entry_idx_;
447
+ uint32_t block_restart_interval_;
448
+ uint8_t protection_bytes_per_key_;
449
+
380
450
  bool key_pinned_;
381
451
  // Whether the block data is guaranteed to outlive this iterator, and
382
452
  // as long as the cleanup functions are transferred to another class,
383
453
  // e.g. PinnableSlice, the pointer to the bytes will still be valid.
384
454
  bool block_contents_pinned_;
385
- SequenceNumber global_seqno_;
386
455
 
387
456
  virtual void SeekToFirstImpl() = 0;
388
457
  virtual void SeekToLastImpl() = 0;
389
458
  virtual void SeekImpl(const Slice& target) = 0;
390
459
  virtual void SeekForPrevImpl(const Slice& target) = 0;
391
460
  virtual void NextImpl() = 0;
392
-
393
461
  virtual void PrevImpl() = 0;
394
462
 
463
+ // Returns the restart interval of this block.
464
+ // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized.
465
+ virtual uint32_t GetRestartInterval() {
466
+ if (num_restarts_ <= 1 || data_ == nullptr) {
467
+ return 0;
468
+ }
469
+ SeekToFirstImpl();
470
+ uint32_t end_index = GetRestartPoint(1);
471
+ uint32_t count = 1;
472
+ while (NextEntryOffset() < end_index && status_.ok()) {
473
+ assert(Valid());
474
+ NextImpl();
475
+ ++count;
476
+ }
477
+ return count;
478
+ }
479
+
480
+ // Returns the number of keys in this block.
481
+ virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) {
482
+ if (num_restarts_ == 0 || data_ == nullptr) {
483
+ return 0;
484
+ }
485
+ uint32_t count = (num_restarts_ - 1) * block_restart_interval;
486
+ // Add number of keys from the last restart interval
487
+ SeekToRestartPoint(num_restarts_ - 1);
488
+ while (NextEntryOffset() < restarts_ && status_.ok()) {
489
+ NextImpl();
490
+ ++count;
491
+ }
492
+ return count;
493
+ }
494
+
495
+ // Stores whether the current key shares bytes with the previous key in
496
+ // *is_shared.
497
+ // Sets raw_key_, value_ to the current parsed key and value.
498
+ // Sets restart_index_ to point to the restart interval that contains
499
+ // the current key.
395
500
  template <typename DecodeEntryFunc>
396
501
  inline bool ParseNextKey(bool* is_shared);
397
502
 
503
+ // protection_bytes_per_key, kv_checksum, and block_restart_interval
504
+ // are needed only for per kv checksum verification.
398
505
  void InitializeBase(const Comparator* raw_ucmp, const char* data,
399
506
  uint32_t restarts, uint32_t num_restarts,
400
- SequenceNumber global_seqno, bool block_contents_pinned) {
507
+ SequenceNumber global_seqno, bool block_contents_pinned,
508
+ uint8_t protection_bytes_per_key, const char* kv_checksum,
509
+ uint32_t block_restart_interval) {
401
510
  assert(data_ == nullptr); // Ensure it is called only once
402
511
  assert(num_restarts > 0); // Ensure the param is valid
403
512
 
@@ -410,11 +519,41 @@ class BlockIter : public InternalIteratorBase<TValue> {
410
519
  global_seqno_ = global_seqno;
411
520
  block_contents_pinned_ = block_contents_pinned;
412
521
  cache_handle_ = nullptr;
522
+ cur_entry_idx_ = -1;
523
+ protection_bytes_per_key_ = protection_bytes_per_key;
524
+ kv_checksum_ = kv_checksum;
525
+ block_restart_interval_ = block_restart_interval;
526
+ // Checksum related states are either all 0/nullptr or all non-zero.
527
+ // One exception is when num_restarts == 0, block_restart_interval can be 0
528
+ // since we are not able to compute it.
529
+ assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) ||
530
+ (protection_bytes_per_key > 0 && kv_checksum != nullptr &&
531
+ (block_restart_interval > 0 || num_restarts == 1)));
532
+ }
533
+
534
+ void CorruptionError(const std::string& error_msg = "bad entry in block") {
535
+ current_ = restarts_;
536
+ restart_index_ = num_restarts_;
537
+ status_ = Status::Corruption(error_msg);
538
+ raw_key_.Clear();
539
+ value_.clear();
540
+ }
541
+
542
+ void PerKVChecksumCorruptionError() {
543
+ std::string error_msg{
544
+ "Corrupted block entry: per key-value checksum verification "
545
+ "failed."};
546
+ error_msg.append(" Offset: " + std::to_string(current_) + ".");
547
+ error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + ".");
548
+ CorruptionError(error_msg);
413
549
  }
414
550
 
415
551
  // Must be called every time a key is found that needs to be returned to user,
416
552
  // and may be called when no key is found (as a no-op). Updates `key_`,
417
553
  // `key_buf_`, and `key_pinned_` with info about the found key.
554
+ // Per key-value checksum verification is done if available for the key to be
555
+ // returned. Iterator is invalidated with corruption status if checksum
556
+ // verification fails.
418
557
  void UpdateKey() {
419
558
  key_buf_.Clear();
420
559
  if (!Valid()) {
@@ -433,6 +572,19 @@ class BlockIter : public InternalIteratorBase<TValue> {
433
572
  key_ = key_buf_.GetInternalKey();
434
573
  key_pinned_ = false;
435
574
  }
575
+ TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value",
576
+ (void*)value_.data());
577
+ TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len",
578
+ &protection_bytes_per_key_);
579
+ if (protection_bytes_per_key_ > 0) {
580
+ if (!ProtectionInfo64()
581
+ .ProtectKV(raw_key_.GetKey(), value_)
582
+ .Verify(
583
+ protection_bytes_per_key_,
584
+ kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) {
585
+ PerKVChecksumCorruptionError();
586
+ }
587
+ }
436
588
  }
437
589
 
438
590
  // Returns the result of `Comparator::Compare()`, where the appropriate
@@ -464,7 +616,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
464
616
  return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
465
617
  }
466
618
 
467
- uint32_t GetRestartPoint(uint32_t index) {
619
+ uint32_t GetRestartPoint(uint32_t index) const {
468
620
  assert(index < num_restarts_);
469
621
  return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
470
622
  }
@@ -479,13 +631,20 @@ class BlockIter : public InternalIteratorBase<TValue> {
479
631
  value_ = Slice(data_ + offset, 0);
480
632
  }
481
633
 
482
- void CorruptionError();
483
-
484
634
  protected:
485
635
  template <typename DecodeKeyFunc>
486
636
  inline bool BinarySeek(const Slice& target, uint32_t* index,
487
637
  bool* is_index_key_result);
488
638
 
639
+ // Find the first key in restart interval `index` that is >= `target`.
640
+ // If there is no such key, iterator is positioned at the first key in
641
+ // restart interval `index + 1`.
642
+ // If is_index_key_result is true, it positions the iterator at the first key
643
+ // in this restart interval.
644
+ // Per key-value checksum verification is done for all keys scanned
645
+ // up to but not including the last key (the key that current_ points to
646
+ // when this function returns). This key's checksum is verified in
647
+ // UpdateKey().
489
648
  void FindKeyAfterBinarySeek(const Slice& target, uint32_t index,
490
649
  bool is_index_key_result);
491
650
  };
@@ -494,22 +653,17 @@ class DataBlockIter final : public BlockIter<Slice> {
494
653
  public:
495
654
  DataBlockIter()
496
655
  : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
497
- DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts,
498
- uint32_t num_restarts, SequenceNumber global_seqno,
499
- BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
500
- DataBlockHashIndex* data_block_hash_index)
501
- : DataBlockIter() {
502
- Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno,
503
- read_amp_bitmap, block_contents_pinned, data_block_hash_index);
504
- }
505
656
  void Initialize(const Comparator* raw_ucmp, const char* data,
506
657
  uint32_t restarts, uint32_t num_restarts,
507
658
  SequenceNumber global_seqno,
508
659
  BlockReadAmpBitmap* read_amp_bitmap,
509
660
  bool block_contents_pinned,
510
- DataBlockHashIndex* data_block_hash_index) {
661
+ DataBlockHashIndex* data_block_hash_index,
662
+ uint8_t protection_bytes_per_key, const char* kv_checksum,
663
+ uint32_t block_restart_interval) {
511
664
  InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno,
512
- block_contents_pinned);
665
+ block_contents_pinned, protection_bytes_per_key, kv_checksum,
666
+ block_restart_interval);
513
667
  raw_key_.SetIsUserKey(false);
514
668
  read_amp_bitmap_ = read_amp_bitmap;
515
669
  last_bitmap_offset_ = current_ + 1;
@@ -527,7 +681,11 @@ class DataBlockIter final : public BlockIter<Slice> {
527
681
  return value_;
528
682
  }
529
683
 
684
+ // Returns if `target` may exist.
530
685
  inline bool SeekForGet(const Slice& target) {
686
+ #ifndef NDEBUG
687
+ if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true;
688
+ #endif
531
689
  if (!data_block_hash_index_) {
532
690
  SeekImpl(target);
533
691
  UpdateKey();
@@ -599,11 +757,14 @@ class MetaBlockIter final : public BlockIter<Slice> {
599
757
  public:
600
758
  MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
601
759
  void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
602
- bool block_contents_pinned) {
760
+ bool block_contents_pinned, uint8_t protection_bytes_per_key,
761
+ const char* kv_checksum, uint32_t block_restart_interval) {
603
762
  // Initializes the iterator with a BytewiseComparator and
604
763
  // the raw key being a user key.
605
764
  InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
606
- kDisableGlobalSequenceNumber, block_contents_pinned);
765
+ kDisableGlobalSequenceNumber, block_contents_pinned,
766
+ protection_bytes_per_key, kv_checksum,
767
+ block_restart_interval);
607
768
  raw_key_.SetIsUserKey(true);
608
769
  }
609
770
 
@@ -613,12 +774,17 @@ class MetaBlockIter final : public BlockIter<Slice> {
613
774
  }
614
775
 
615
776
  protected:
777
+ friend Block;
616
778
  void SeekToFirstImpl() override;
617
779
  void SeekToLastImpl() override;
618
780
  void SeekImpl(const Slice& target) override;
619
781
  void SeekForPrevImpl(const Slice& target) override;
620
782
  void NextImpl() override;
621
783
  void PrevImpl() override;
784
+ // Meta index block's restart interval is always 1. See
785
+ // MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval.
786
+ uint32_t GetRestartInterval() override { return 1; }
787
+ uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; }
622
788
  };
623
789
 
624
790
  class IndexBlockIter final : public BlockIter<IndexValue> {
@@ -633,9 +799,13 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
633
799
  uint32_t restarts, uint32_t num_restarts,
634
800
  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
635
801
  bool have_first_key, bool key_includes_seq,
636
- bool value_is_full, bool block_contents_pinned) {
802
+ bool value_is_full, bool block_contents_pinned,
803
+ uint8_t protection_bytes_per_key, const char* kv_checksum,
804
+ uint32_t block_restart_interval) {
637
805
  InitializeBase(raw_ucmp, data, restarts, num_restarts,
638
- kDisableGlobalSequenceNumber, block_contents_pinned);
806
+ kDisableGlobalSequenceNumber, block_contents_pinned,
807
+ protection_bytes_per_key, kv_checksum,
808
+ block_restart_interval);
639
809
  raw_key_.SetIsUserKey(!key_includes_seq);
640
810
  prefix_index_ = prefix_index;
641
811
  value_delta_encoded_ = !value_is_full;
@@ -666,11 +836,17 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
666
836
  }
667
837
  }
668
838
 
839
+ Slice raw_value() const {
840
+ assert(Valid());
841
+ return value_;
842
+ }
843
+
669
844
  bool IsValuePinned() const override {
670
845
  return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
671
846
  }
672
847
 
673
848
  protected:
849
+ friend Block;
674
850
  // IndexBlockIter follows a different contract for prefix iterator
675
851
  // from data iterators.
676
852
  // If prefix of the seek key `target` exists in the file, it must
@@ -692,11 +868,8 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
692
868
  }
693
869
 
694
870
  void PrevImpl() override;
695
-
696
871
  void NextImpl() override;
697
-
698
872
  void SeekToFirstImpl() override;
699
-
700
873
  void SeekToLastImpl() override;
701
874
 
702
875
  private:
@@ -29,7 +29,7 @@
29
29
  #include "db/dbformat.h"
30
30
  #include "index_builder.h"
31
31
  #include "logging/logging.h"
32
- #include "memory/memory_allocator.h"
32
+ #include "memory/memory_allocator_impl.h"
33
33
  #include "rocksdb/cache.h"
34
34
  #include "rocksdb/comparator.h"
35
35
  #include "rocksdb/env.h"
@@ -104,9 +104,12 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
104
104
  }
105
105
  }
106
106
 
107
- bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size) {
108
- // Check to see if compressed less than 12.5%
109
- return compressed_size < uncomp_size - (uncomp_size / 8u);
107
+ bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size,
108
+ int max_compressed_bytes_per_kb) {
109
+ // For efficiency, avoid floating point and division
110
+ return compressed_size <=
111
+ (static_cast<uint64_t>(max_compressed_bytes_per_kb) * uncomp_size) >>
112
+ 10;
110
113
  }
111
114
 
112
115
  } // namespace
@@ -114,7 +117,7 @@ bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size) {
114
117
  // format_version is the block format as defined in include/rocksdb/table.h
115
118
  Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
116
119
  CompressionType* type, uint32_t format_version,
117
- bool do_sample, std::string* compressed_output,
120
+ bool allow_sample, std::string* compressed_output,
118
121
  std::string* sampled_output_fast,
119
122
  std::string* sampled_output_slow) {
120
123
  assert(type);
@@ -126,7 +129,7 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
126
129
  // The users can use these stats to decide if it is worthwhile
127
130
  // enabling compression and they also get a hint about which
128
131
  // compression algorithm wil be beneficial.
129
- if (do_sample && info.SampleForCompression() &&
132
+ if (allow_sample && info.SampleForCompression() &&
130
133
  Random::GetTLSInstance()->OneIn(
131
134
  static_cast<int>(info.SampleForCompression()))) {
132
135
  // Sampling with a fast compression algorithm
@@ -159,7 +162,8 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
159
162
  }
160
163
  }
161
164
 
162
- if (info.type() == kNoCompression) {
165
+ int max_compressed_bytes_per_kb = info.options().max_compressed_bytes_per_kb;
166
+ if (info.type() == kNoCompression || max_compressed_bytes_per_kb <= 0) {
163
167
  *type = kNoCompression;
164
168
  return uncompressed_data;
165
169
  }
@@ -175,8 +179,8 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
175
179
 
176
180
  // Check the compression ratio; if it's not good enough, just fall back to
177
181
  // uncompressed
178
- if (!GoodCompressionRatio(compressed_output->size(),
179
- uncompressed_data.size())) {
182
+ if (!GoodCompressionRatio(compressed_output->size(), uncompressed_data.size(),
183
+ max_compressed_bytes_per_kb)) {
180
184
  *type = kNoCompression;
181
185
  return uncompressed_data;
182
186
  }
@@ -337,6 +341,10 @@ struct BlockBasedTableBuilder::Rep {
337
341
  std::unique_ptr<ParallelCompressionRep> pc_rep;
338
342
  BlockCreateContext create_context;
339
343
 
344
+ // The size of the "tail" part of a SST file. "Tail" refers to
345
+ // all blocks after data blocks till the end of the SST file.
346
+ uint64_t tail_size;
347
+
340
348
  uint64_t get_offset() { return offset.load(std::memory_order_relaxed); }
341
349
  void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }
342
350
 
@@ -446,7 +454,13 @@ struct BlockBasedTableBuilder::Rep {
446
454
  table_options, data_block)),
447
455
  create_context(&table_options, ioptions.stats,
448
456
  compression_type == kZSTD ||
449
- compression_type == kZSTDNotFinalCompression),
457
+ compression_type == kZSTDNotFinalCompression,
458
+ tbo.moptions.block_protection_bytes_per_key,
459
+ tbo.internal_comparator.user_comparator(),
460
+ !use_delta_encoding_for_index_values,
461
+ table_opt.index_type ==
462
+ BlockBasedTableOptions::kBinarySearchWithFirstKey),
463
+ tail_size(0),
450
464
  status_ok(true),
451
465
  io_status_ok(true) {
452
466
  if (tbo.target_file_size == 0) {
@@ -1108,25 +1122,17 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
1108
1122
  const CompressionContext& compression_ctx, UncompressionContext* verify_ctx,
1109
1123
  std::string* compressed_output, Slice* block_contents,
1110
1124
  CompressionType* type, Status* out_status) {
1111
- // File format contains a sequence of blocks where each block has:
1112
- // block_data: uint8[n]
1113
- // type: uint8
1114
- // crc: uint32
1115
1125
  Rep* r = rep_;
1116
1126
  bool is_status_ok = ok();
1117
1127
  if (!r->IsParallelCompressionEnabled()) {
1118
1128
  assert(is_status_ok);
1119
1129
  }
1120
1130
 
1121
- *type = r->compression_type;
1122
- uint64_t sample_for_compression = r->sample_for_compression;
1123
- bool abort_compression = false;
1124
-
1125
- StopWatchNano timer(
1126
- r->ioptions.clock,
1127
- ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
1128
-
1129
1131
  if (is_status_ok && uncompressed_block_data.size() < kCompressionSizeLimit) {
1132
+ StopWatchNano timer(
1133
+ r->ioptions.clock,
1134
+ ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
1135
+
1130
1136
  if (is_data_block) {
1131
1137
  r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(),
1132
1138
  std::memory_order_relaxed);
@@ -1139,14 +1145,14 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
1139
1145
  }
1140
1146
  assert(compression_dict != nullptr);
1141
1147
  CompressionInfo compression_info(r->compression_opts, compression_ctx,
1142
- *compression_dict, *type,
1143
- sample_for_compression);
1148
+ *compression_dict, r->compression_type,
1149
+ r->sample_for_compression);
1144
1150
 
1145
1151
  std::string sampled_output_fast;
1146
1152
  std::string sampled_output_slow;
1147
1153
  *block_contents = CompressBlock(
1148
1154
  uncompressed_block_data, compression_info, type,
1149
- r->table_options.format_version, is_data_block /* do_sample */,
1155
+ r->table_options.format_version, is_data_block /* allow_sample */,
1150
1156
  compressed_output, &sampled_output_fast, &sampled_output_slow);
1151
1157
 
1152
1158
  if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
@@ -1179,35 +1185,38 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
1179
1185
  BlockContents contents;
1180
1186
  UncompressionInfo uncompression_info(*verify_ctx, *verify_dict,
1181
1187
  r->compression_type);
1182
- Status stat = UncompressBlockData(
1188
+ Status uncompress_status = UncompressBlockData(
1183
1189
  uncompression_info, block_contents->data(), block_contents->size(),
1184
1190
  &contents, r->table_options.format_version, r->ioptions);
1185
1191
 
1186
- if (stat.ok()) {
1187
- bool compressed_ok =
1188
- contents.data.compare(uncompressed_block_data) == 0;
1189
- if (!compressed_ok) {
1192
+ if (uncompress_status.ok()) {
1193
+ bool data_match = contents.data.compare(uncompressed_block_data) == 0;
1194
+ if (!data_match) {
1190
1195
  // The result of the compression was invalid. abort.
1191
- abort_compression = true;
1192
1196
  const char* const msg =
1193
1197
  "Decompressed block did not match pre-compression block";
1194
1198
  ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg);
1195
1199
  *out_status = Status::Corruption(msg);
1200
+ *type = kNoCompression;
1196
1201
  }
1197
1202
  } else {
1198
1203
  // Decompression reported an error. abort.
1199
1204
  *out_status = Status::Corruption(std::string("Could not decompress: ") +
1200
- stat.getState());
1201
- abort_compression = true;
1205
+ uncompress_status.getState());
1206
+ *type = kNoCompression;
1202
1207
  }
1203
1208
  }
1209
+ if (timer.IsStarted()) {
1210
+ RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS,
1211
+ timer.ElapsedNanos());
1212
+ }
1204
1213
  } else {
1205
- // Block is too big to be compressed.
1214
+ // Status is not OK, or block is too big to be compressed.
1206
1215
  if (is_data_block) {
1207
1216
  r->uncompressible_input_data_bytes.fetch_add(
1208
1217
  uncompressed_block_data.size(), std::memory_order_relaxed);
1209
1218
  }
1210
- abort_compression = true;
1219
+ *type = kNoCompression;
1211
1220
  }
1212
1221
  if (is_data_block) {
1213
1222
  r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize,
@@ -1216,26 +1225,32 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
1216
1225
 
1217
1226
  // Abort compression if the block is too big, or did not pass
1218
1227
  // verification.
1219
- if (abort_compression) {
1220
- RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED);
1221
- *type = kNoCompression;
1228
+ if (*type == kNoCompression) {
1222
1229
  *block_contents = uncompressed_block_data;
1223
- } else if (*type != kNoCompression) {
1224
- if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)) {
1225
- RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS,
1226
- timer.ElapsedNanos());
1227
- }
1228
- RecordInHistogram(r->ioptions.stats, BYTES_COMPRESSED,
1229
- uncompressed_block_data.size());
1230
+ bool compression_attempted = !compressed_output->empty();
1231
+ RecordTick(r->ioptions.stats, compression_attempted
1232
+ ? NUMBER_BLOCK_COMPRESSION_REJECTED
1233
+ : NUMBER_BLOCK_COMPRESSION_BYPASSED);
1234
+ RecordTick(r->ioptions.stats,
1235
+ compression_attempted ? BYTES_COMPRESSION_REJECTED
1236
+ : BYTES_COMPRESSION_BYPASSED,
1237
+ uncompressed_block_data.size());
1238
+ } else {
1230
1239
  RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED);
1231
- } else if (*type != r->compression_type) {
1232
- RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED);
1240
+ RecordTick(r->ioptions.stats, BYTES_COMPRESSED_FROM,
1241
+ uncompressed_block_data.size());
1242
+ RecordTick(r->ioptions.stats, BYTES_COMPRESSED_TO,
1243
+ compressed_output->size());
1233
1244
  }
1234
1245
  }
1235
1246
 
1236
1247
  void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
1237
- const Slice& block_contents, CompressionType type, BlockHandle* handle,
1248
+ const Slice& block_contents, CompressionType comp_type, BlockHandle* handle,
1238
1249
  BlockType block_type, const Slice* uncompressed_block_data) {
1250
+ // File format contains a sequence of blocks where each block has:
1251
+ // block_data: uint8[n]
1252
+ // compression_type: uint8
1253
+ // checksum: uint32
1239
1254
  Rep* r = rep_;
1240
1255
  bool is_data_block = block_type == BlockType::kData;
1241
1256
  // Old, misleading name of this function: WriteRawBlock
@@ -1246,7 +1261,7 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
1246
1261
  assert(io_status().ok());
1247
1262
  if (uncompressed_block_data == nullptr) {
1248
1263
  uncompressed_block_data = &block_contents;
1249
- assert(type == kNoCompression);
1264
+ assert(comp_type == kNoCompression);
1250
1265
  }
1251
1266
 
1252
1267
  {
@@ -1258,10 +1273,10 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
1258
1273
  }
1259
1274
 
1260
1275
  std::array<char, kBlockTrailerSize> trailer;
1261
- trailer[0] = type;
1276
+ trailer[0] = comp_type;
1262
1277
  uint32_t checksum = ComputeBuiltinChecksumWithLastByte(
1263
1278
  r->table_options.checksum, block_contents.data(), block_contents.size(),
1264
- /*last_byte*/ type);
1279
+ /*last_byte*/ comp_type);
1265
1280
 
1266
1281
  if (block_type == BlockType::kFilter) {
1267
1282
  Status s = r->filter_builder->MaybePostVerifyFilter(block_contents);
@@ -1898,6 +1913,8 @@ Status BlockBasedTableBuilder::Finish() {
1898
1913
  }
1899
1914
  }
1900
1915
 
1916
+ r->props.tail_start_offset = r->offset;
1917
+
1901
1918
  // Write meta blocks, metaindex block and footer in the following order.
1902
1919
  // 1. [meta block: filter]
1903
1920
  // 2. [meta block: index]
@@ -1925,6 +1942,7 @@ Status BlockBasedTableBuilder::Finish() {
1925
1942
  r->SetStatus(r->CopyIOStatus());
1926
1943
  Status ret_status = r->CopyStatus();
1927
1944
  assert(!ret_status.ok() || io_status().ok());
1945
+ r->tail_size = r->offset - r->props.tail_start_offset;
1928
1946
  return ret_status;
1929
1947
  }
1930
1948
 
@@ -1958,6 +1976,8 @@ uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
1958
1976
  }
1959
1977
  }
1960
1978
 
1979
+ uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; }
1980
+
1961
1981
  bool BlockBasedTableBuilder::NeedCompact() const {
1962
1982
  for (const auto& collector : rep_->table_properties_collectors) {
1963
1983
  if (collector->NeedCompact()) {