@nxtedition/rocksdb 15.4.1 → 16.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (401)
  1. package/binding.cc +70 -23
  2. package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
  3. package/deps/rocksdb/rocksdb/BUCK +42 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
  5. package/deps/rocksdb/rocksdb/Makefile +59 -32
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
  8. package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
  9. package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
  10. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
  11. package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
  12. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
  14. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
  17. package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
  24. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
  25. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
  26. package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
  27. package/deps/rocksdb/rocksdb/db/builder.h +7 -0
  28. package/deps/rocksdb/rocksdb/db/c.cc +373 -57
  29. package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
  30. package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
  31. package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
  32. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
  51. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  52. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
  53. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
  54. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
  55. package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
  56. package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
  57. package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
  58. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
  59. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
  60. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
  61. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
  62. package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
  63. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
  64. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
  65. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
  66. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
  79. package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
  80. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
  81. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
  82. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
  83. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
  84. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
  85. package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
  86. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
  87. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
  88. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
  89. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
  90. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
  91. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
  92. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
  93. package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
  94. package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
  95. package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
  96. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
  97. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
  98. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
  99. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
  100. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
  101. package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
  102. package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
  103. package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
  104. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
  105. package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
  106. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
  107. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
  108. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
  109. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
  110. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
  111. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
  112. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
  113. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
  114. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
  115. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  116. package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
  117. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
  118. package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
  119. package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
  120. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
  121. package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
  122. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
  123. package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
  124. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
  125. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
  126. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
  127. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
  128. package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
  129. package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
  130. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
  131. package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
  132. package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
  133. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
  134. package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
  135. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  136. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
  137. package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
  138. package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
  139. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
  140. package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
  141. package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
  142. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
  143. package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
  144. package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
  145. package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
  146. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
  147. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
  148. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
  149. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
  150. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
  151. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
  152. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
  153. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
  159. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
  160. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
  161. package/deps/rocksdb/rocksdb/env/env.cc +1 -0
  162. package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
  163. package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
  164. package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
  165. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  166. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  167. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
  168. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
  169. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
  170. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
  171. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  172. package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
  173. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
  174. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
  175. package/deps/rocksdb/rocksdb/folly.mk +22 -5
  176. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
  177. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
  178. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
  179. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
  180. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
  181. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
  182. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
  183. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
  184. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
  185. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
  186. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
  187. package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
  188. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
  189. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
  190. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
  191. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
  192. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
  193. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
  194. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
  195. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
  196. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
  197. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
  198. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
  199. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
  200. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
  202. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
  203. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
  204. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
  205. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
  206. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
  207. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
  208. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
  209. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
  210. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  211. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
  212. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
  213. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
  214. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
  215. package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
  216. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
  217. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
  218. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
  219. package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
  220. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
  221. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
  222. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
  223. package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
  224. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  225. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
  226. package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
  227. package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
  228. package/deps/rocksdb/rocksdb/options/options.cc +5 -1
  229. package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
  230. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
  231. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
  232. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
  233. package/deps/rocksdb/rocksdb/port/lang.h +4 -0
  234. package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
  235. package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
  236. package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
  237. package/deps/rocksdb/rocksdb/src.mk +12 -0
  238. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
  239. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  240. package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
  241. package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
  242. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
  243. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
  244. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
  245. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
  246. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
  247. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
  248. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
  249. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
  250. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
  251. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
  252. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
  253. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
  254. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
  255. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
  256. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
  257. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
  258. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
  259. package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
  260. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
  261. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
  262. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
  263. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
  264. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
  265. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
  266. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
  267. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
  268. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
  269. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  270. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
  271. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
  272. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
  273. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
  274. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
  275. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
  276. package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
  277. package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
  278. package/deps/rocksdb/rocksdb/table/format.cc +27 -15
  279. package/deps/rocksdb/rocksdb/table/format.h +41 -15
  280. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
  281. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
  282. package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
  283. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
  284. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  285. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
  286. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
  287. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
  288. package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
  289. package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
  290. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
  291. package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
  292. package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
  293. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
  294. package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
  295. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
  296. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
  297. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
  298. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
  299. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
  300. package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
  301. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
  302. package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
  303. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
  304. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
  305. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
  306. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
  307. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
  308. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
  309. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
  310. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
  311. package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
  312. package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
  313. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
  314. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
  315. package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
  316. package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
  317. package/deps/rocksdb/rocksdb/util/coding.h +14 -27
  318. package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
  319. package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
  320. package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
  321. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
  322. package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
  323. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
  324. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
  325. package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
  326. package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
  327. package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
  328. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
  329. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
  330. package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
  331. package/deps/rocksdb/rocksdb/util/math.h +3 -1
  332. package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
  333. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
  334. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
  335. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
  336. package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
  337. package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
  338. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
  339. package/deps/rocksdb/rocksdb/util/status.cc +3 -1
  340. package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
  341. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
  342. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
  343. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
  344. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
  345. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
  346. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
  347. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
  348. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
  349. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
  350. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
  351. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
  352. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
  353. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
  354. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
  355. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
  356. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
  357. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
  358. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  359. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
  360. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
  361. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
  362. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
  363. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
  364. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
  365. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
  366. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
  367. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
  368. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
  369. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
  370. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
  371. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
  372. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
  373. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
  374. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
  375. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
  376. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
  377. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
  378. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
  379. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
  380. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
  381. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
  382. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
  383. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
  384. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
  385. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
  386. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
  387. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
  388. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
  389. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  390. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
  391. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
  392. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
  393. package/deps/rocksdb/rocksdb.gyp +7 -0
  394. package/index.js +70 -10
  395. package/iterator.js +25 -3
  396. package/max_rev_operator.h +9 -5
  397. package/package.json +1 -1
  398. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  399. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  400. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
  401. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
@@ -21,137 +21,14 @@
21
21
  #include "port/stack_trace.h"
22
22
  #include "rocksdb/comparator.h"
23
23
  #include "table/block_based/block_prefix_index.h"
24
+ #include "table/block_based/block_util.h"
24
25
  #include "table/block_based/data_block_footer.h"
25
26
  #include "table/format.h"
26
27
  #include "util/coding.h"
28
+ #include "util/math.h"
27
29
 
28
30
  namespace ROCKSDB_NAMESPACE {
29
31
 
30
- // Helper routine: decode the next block entry starting at "p",
31
- // storing the number of shared key bytes, non_shared key bytes,
32
- // and the length of the value in "*shared", "*non_shared", and
33
- // "*value_length", respectively. Will not dereference past "limit".
34
- //
35
- // If any errors are detected, returns nullptr. Otherwise, returns a
36
- // pointer to the key delta (just past the three decoded values).
37
- struct DecodeEntry {
38
- inline const char* operator()(const char* p, const char* limit,
39
- uint32_t* shared, uint32_t* non_shared,
40
- uint32_t* value_length) {
41
- // We need 2 bytes for shared and non_shared size. We also need one more
42
- // byte either for value size or the actual value in case of value delta
43
- // encoding.
44
- assert(limit - p >= 3);
45
- *shared = reinterpret_cast<const unsigned char*>(p)[0];
46
- *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
47
- *value_length = reinterpret_cast<const unsigned char*>(p)[2];
48
- if ((*shared | *non_shared | *value_length) < 128) {
49
- // Fast path: all three values are encoded in one byte each
50
- p += 3;
51
- } else {
52
- if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
53
- return nullptr;
54
- }
55
- if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
56
- return nullptr;
57
- }
58
- if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
59
- return nullptr;
60
- }
61
- }
62
-
63
- // Using an assert in place of "return null" since we should not pay the
64
- // cost of checking for corruption on every single key decoding
65
- assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
66
- return p;
67
- }
68
- };
69
-
70
- // Helper routine: similar to DecodeEntry but does not have assertions.
71
- // Instead, returns nullptr so that caller can detect and report failure.
72
- struct CheckAndDecodeEntry {
73
- inline const char* operator()(const char* p, const char* limit,
74
- uint32_t* shared, uint32_t* non_shared,
75
- uint32_t* value_length) {
76
- // We need 2 bytes for shared and non_shared size. We also need one more
77
- // byte either for value size or the actual value in case of value delta
78
- // encoding.
79
- if (limit - p < 3) {
80
- return nullptr;
81
- }
82
- *shared = reinterpret_cast<const unsigned char*>(p)[0];
83
- *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
84
- *value_length = reinterpret_cast<const unsigned char*>(p)[2];
85
- if ((*shared | *non_shared | *value_length) < 128) {
86
- // Fast path: all three values are encoded in one byte each
87
- p += 3;
88
- } else {
89
- if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
90
- return nullptr;
91
- }
92
- if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
93
- return nullptr;
94
- }
95
- if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
96
- return nullptr;
97
- }
98
- }
99
-
100
- if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
101
- return nullptr;
102
- }
103
- return p;
104
- }
105
- };
106
-
107
- struct DecodeKey {
108
- inline const char* operator()(const char* p, const char* limit,
109
- uint32_t* shared, uint32_t* non_shared) {
110
- uint32_t value_length;
111
- return DecodeEntry()(p, limit, shared, non_shared, &value_length);
112
- }
113
- };
114
-
115
- // In format_version 4, which is used by index blocks, the value size is not
116
- // encoded before the entry, as the value is known to be the handle with the
117
- // known size.
118
- struct DecodeKeyV4 {
119
- inline const char* operator()(const char* p, const char* limit,
120
- uint32_t* shared, uint32_t* non_shared) {
121
- // We need 2 bytes for shared and non_shared size. We also need one more
122
- // byte either for value size or the actual value in case of value delta
123
- // encoding.
124
- if (limit - p < 3) {
125
- return nullptr;
126
- }
127
- *shared = reinterpret_cast<const unsigned char*>(p)[0];
128
- *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
129
- if ((*shared | *non_shared) < 128) {
130
- // Fast path: all three values are encoded in one byte each
131
- p += 2;
132
- } else {
133
- if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
134
- return nullptr;
135
- }
136
- if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
137
- return nullptr;
138
- }
139
- }
140
- return p;
141
- }
142
- };
143
-
144
- struct DecodeEntryV4 {
145
- inline const char* operator()(const char* p, const char* limit,
146
- uint32_t* shared, uint32_t* non_shared,
147
- uint32_t* value_length) {
148
- assert(value_length);
149
-
150
- *value_length = 0;
151
- return DecodeKeyV4()(p, limit, shared, non_shared);
152
- }
153
- };
154
-
155
32
  void DataBlockIter::NextImpl() {
156
33
  #ifndef NDEBUG
157
34
  if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) {
@@ -160,28 +37,24 @@ void DataBlockIter::NextImpl() {
160
37
  #endif
161
38
  bool is_shared = false;
162
39
  ParseNextDataKey(&is_shared);
163
- ++cur_entry_idx_;
164
40
  }
165
41
 
166
42
  void MetaBlockIter::NextImpl() {
167
43
  bool is_shared = false;
168
- ParseNextKey<CheckAndDecodeEntry>(&is_shared);
169
- ++cur_entry_idx_;
44
+ ParseNextKey<DecodeEntry, true>(&is_shared);
170
45
  }
171
46
 
172
- void IndexBlockIter::NextImpl() {
173
- ParseNextIndexKey();
174
- ++cur_entry_idx_;
175
- }
47
+ void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }
176
48
 
177
49
  void IndexBlockIter::PrevImpl() {
178
50
  assert(Valid());
179
51
  // Scan backwards to a restart point before current_
180
52
  const uint32_t original = current_;
53
+ const auto prev_entry_idx = cur_entry_idx_ - 1;
181
54
  while (GetRestartPoint(restart_index_) >= original) {
182
55
  if (restart_index_ == 0) {
183
56
  // No more entries
184
- current_ = restarts_;
57
+ current_ = GetKeysEndOffset();
185
58
  restart_index_ = num_restarts_;
186
59
  return;
187
60
  }
@@ -191,17 +64,18 @@ void IndexBlockIter::PrevImpl() {
191
64
  // Loop until end of current entry hits the start of original entry
192
65
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
193
66
  }
194
- --cur_entry_idx_;
67
+ cur_entry_idx_ = prev_entry_idx;
195
68
  }
196
69
 
197
70
  void MetaBlockIter::PrevImpl() {
198
71
  assert(Valid());
199
72
  // Scan backwards to a restart point before current_
200
73
  const uint32_t original = current_;
74
+ const auto prev_entry_idx = cur_entry_idx_ - 1;
201
75
  while (GetRestartPoint(restart_index_) >= original) {
202
76
  if (restart_index_ == 0) {
203
77
  // No more entries
204
- current_ = restarts_;
78
+ current_ = GetKeysEndOffset();
205
79
  restart_index_ = num_restarts_;
206
80
  return;
207
81
  }
@@ -210,19 +84,19 @@ void MetaBlockIter::PrevImpl() {
210
84
  SeekToRestartPoint(restart_index_);
211
85
  bool is_shared = false;
212
86
  // Loop until end of current entry hits the start of original entry
213
- while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
87
+ while (ParseNextKey<DecodeEntry, true>(&is_shared) &&
214
88
  NextEntryOffset() < original) {
215
89
  }
216
- --cur_entry_idx_;
90
+ cur_entry_idx_ = prev_entry_idx;
217
91
  }
218
92
 
219
93
  // Similar to IndexBlockIter::PrevImpl but also caches the prev entries
220
94
  void DataBlockIter::PrevImpl() {
221
95
  assert(Valid());
222
96
 
97
+ const auto prev_entry_idx = cur_entry_idx_ - 1;
223
98
  assert(prev_entries_idx_ == -1 ||
224
99
  static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
225
- --cur_entry_idx_;
226
100
  // Check if we can use cached prev_entries_
227
101
  if (prev_entries_idx_ > 0 &&
228
102
  prev_entries_[prev_entries_idx_].offset == current_) {
@@ -252,7 +126,9 @@ void DataBlockIter::PrevImpl() {
252
126
  // (i.e., keys in it are not actually pinned).
253
127
  raw_key_.SetKey(current_key, raw_key_cached /* copy */);
254
128
  value_ = current_prev_entry.value;
255
-
129
+ // Set entry_ using stored entry_size for NextEntryOffset() to work
130
+ entry_ = Slice(data_ + current_, current_prev_entry.entry_size);
131
+ cur_entry_idx_ = prev_entry_idx;
256
132
  return;
257
133
  }
258
134
 
@@ -266,15 +142,15 @@ void DataBlockIter::PrevImpl() {
266
142
  while (GetRestartPoint(restart_index_) >= original) {
267
143
  if (restart_index_ == 0) {
268
144
  // No more entries
269
- current_ = restarts_;
145
+ current_ = GetKeysEndOffset();
270
146
  restart_index_ = num_restarts_;
147
+ cur_entry_idx_ = prev_entry_idx;
271
148
  return;
272
149
  }
273
150
  restart_index_--;
274
151
  }
275
152
 
276
153
  SeekToRestartPoint(restart_index_);
277
-
278
154
  do {
279
155
  bool is_shared = false;
280
156
  if (!ParseNextDataKey(&is_shared)) {
@@ -284,19 +160,22 @@ void DataBlockIter::PrevImpl() {
284
160
 
285
161
  if (raw_key_.IsKeyPinned()) {
286
162
  // The key is not delta encoded
287
- prev_entries_.emplace_back(current_, current_key.data(), 0,
288
- current_key.size(), value());
163
+ prev_entries_.emplace_back(current_, static_cast<uint32_t>(entry_.size()),
164
+ current_key.data(), 0, current_key.size(),
165
+ value());
289
166
  } else {
290
167
  // The key is delta encoded, cache decoded key in buffer
291
168
  size_t new_key_offset = prev_entries_keys_buff_.size();
292
169
  prev_entries_keys_buff_.append(current_key.data(), current_key.size());
293
170
 
294
- prev_entries_.emplace_back(current_, nullptr, new_key_offset,
295
- current_key.size(), value());
171
+ prev_entries_.emplace_back(current_, static_cast<uint32_t>(entry_.size()),
172
+ nullptr, new_key_offset, current_key.size(),
173
+ value());
296
174
  }
297
175
  // Loop until end of current entry hits the start of original entry
298
176
  } while (NextEntryOffset() < original);
299
177
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
178
+ cur_entry_idx_ = prev_entry_idx;
300
179
  }
301
180
 
302
181
  void DataBlockIter::SeekImpl(const Slice& target) {
@@ -307,7 +186,8 @@ void DataBlockIter::SeekImpl(const Slice& target) {
307
186
  }
308
187
  uint32_t index = 0;
309
188
  bool skip_linear_scan = false;
310
- bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
189
+ bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
190
+ &skip_linear_scan);
311
191
 
312
192
  if (!ok) {
313
193
  return;
@@ -323,7 +203,8 @@ void MetaBlockIter::SeekImpl(const Slice& target) {
323
203
  }
324
204
  uint32_t index = 0;
325
205
  bool skip_linear_scan = false;
326
- bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
206
+ bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
207
+ &skip_linear_scan);
327
208
 
328
209
  if (!ok) {
329
210
  return;
@@ -393,15 +274,12 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
393
274
  assert(restart_index < num_restarts_);
394
275
  SeekToRestartPoint(restart_index);
395
276
  current_ = GetRestartPoint(restart_index);
396
- cur_entry_idx_ =
397
- static_cast<int32_t>(restart_index * block_restart_interval_) - 1;
398
277
 
399
- uint32_t limit = restarts_;
278
+ uint32_t limit = GetKeysEndOffset();
400
279
  if (restart_index + 1 < num_restarts_) {
401
280
  limit = GetRestartPoint(restart_index + 1);
402
281
  }
403
282
  while (current_ < limit) {
404
- ++cur_entry_idx_;
405
283
  bool shared;
406
284
  // Here we only linear seek the target key inside the restart interval.
407
285
  // If a key does not exist inside a restart interval, we avoid
@@ -440,8 +318,8 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
440
318
  return true;
441
319
  }
442
320
 
443
- if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(),
444
- target_user_key) != 0) {
321
+ if (icmp_.user_comparator()->Compare(raw_key_.GetUserKey(),
322
+ target_user_key) != 0) {
445
323
  // the key is not in this block and cannot be at the next block either.
446
324
  return false;
447
325
  }
@@ -488,16 +366,20 @@ void IndexBlockIter::SeekImpl(const Slice& target) {
488
366
  // This is to let the caller distinguish between non-existing prefix,
489
367
  // and when key is larger than the last key, which both set Valid() to
490
368
  // false.
491
- current_ = restarts_;
369
+ current_ = GetKeysEndOffset();
492
370
  status_ = Status::NotFound();
493
371
  }
494
372
  // restart interval must be one when hash search is enabled so the binary
495
373
  // search simply lands at the right place.
496
374
  skip_linear_scan = true;
497
- } else if (value_delta_encoded_) {
498
- ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
499
375
  } else {
500
- ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
376
+ if (value_delta_encoded_) {
377
+ ok = FindRestartPointForSeek<DecodeKeyV4>(seek_key, &index,
378
+ &skip_linear_scan);
379
+ } else {
380
+ ok = FindRestartPointForSeek<DecodeKey>(seek_key, &index,
381
+ &skip_linear_scan);
382
+ }
501
383
  }
502
384
 
503
385
  if (!ok) {
@@ -506,6 +388,18 @@ void IndexBlockIter::SeekImpl(const Slice& target) {
506
388
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
507
389
  }
508
390
 
391
+ template <typename DecodeKeyFunc>
392
+ bool IndexBlockIter::FindRestartPointForSeek(const Slice& seek_key,
393
+ uint32_t* index,
394
+ bool* skip_linear_scan) {
395
+ if (index_search_type_ == BlockBasedTableOptions::kBinary) {
396
+ return BinarySeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
397
+ skip_linear_scan);
398
+ }
399
+ return InterpolationSeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
400
+ skip_linear_scan);
401
+ }
402
+
509
403
  void DataBlockIter::SeekForPrevImpl(const Slice& target) {
510
404
  PERF_TIMER_GUARD(block_seek_nanos);
511
405
  Slice seek_key = target;
@@ -514,13 +408,13 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) {
514
408
  }
515
409
  uint32_t index = 0;
516
410
  bool skip_linear_scan = false;
517
- bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
411
+ bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
412
+ &skip_linear_scan);
518
413
 
519
414
  if (!ok) {
520
415
  return;
521
416
  }
522
417
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
523
-
524
418
  if (!Valid()) {
525
419
  if (status_.ok()) {
526
420
  SeekToLastImpl();
@@ -540,7 +434,8 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
540
434
  }
541
435
  uint32_t index = 0;
542
436
  bool skip_linear_scan = false;
543
- bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
437
+ bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
438
+ &skip_linear_scan);
544
439
 
545
440
  if (!ok) {
546
441
  return;
@@ -565,7 +460,6 @@ void DataBlockIter::SeekToFirstImpl() {
565
460
  SeekToRestartPoint(0);
566
461
  bool is_shared = false;
567
462
  ParseNextDataKey(&is_shared);
568
- cur_entry_idx_ = 0;
569
463
  }
570
464
 
571
465
  void MetaBlockIter::SeekToFirstImpl() {
@@ -574,8 +468,7 @@ void MetaBlockIter::SeekToFirstImpl() {
574
468
  }
575
469
  SeekToRestartPoint(0);
576
470
  bool is_shared = false;
577
- ParseNextKey<CheckAndDecodeEntry>(&is_shared);
578
- cur_entry_idx_ = 0;
471
+ ParseNextKey<DecodeEntry, true>(&is_shared);
579
472
  }
580
473
 
581
474
  void IndexBlockIter::SeekToFirstImpl() {
@@ -590,7 +483,6 @@ void IndexBlockIter::SeekToFirstImpl() {
590
483
  status_ = Status::OK();
591
484
  SeekToRestartPoint(0);
592
485
  ParseNextIndexKey();
593
- cur_entry_idx_ = 0;
594
486
  }
595
487
 
596
488
  void DataBlockIter::SeekToLastImpl() {
@@ -599,10 +491,10 @@ void DataBlockIter::SeekToLastImpl() {
599
491
  }
600
492
  SeekToRestartPoint(num_restarts_ - 1);
601
493
  bool is_shared = false;
602
- cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
603
- while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
494
+
495
+ while (ParseNextDataKey(&is_shared) &&
496
+ NextEntryOffset() < GetKeysEndOffset()) {
604
497
  // Keep skipping
605
- ++cur_entry_idx_;
606
498
  }
607
499
  }
608
500
 
@@ -613,12 +505,9 @@ void MetaBlockIter::SeekToLastImpl() {
613
505
  SeekToRestartPoint(num_restarts_ - 1);
614
506
  bool is_shared = false;
615
507
  assert(num_restarts_ >= 1);
616
- cur_entry_idx_ =
617
- static_cast<int32_t>((num_restarts_ - 1) * block_restart_interval_);
618
- while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
619
- NextEntryOffset() < restarts_) {
508
+ while (ParseNextKey<DecodeEntry, true>(&is_shared) &&
509
+ NextEntryOffset() < GetKeysEndOffset()) {
620
510
  // Will probably never reach here since restart_interval is always 1
621
- ++cur_entry_idx_;
622
511
  }
623
512
  }
624
513
 
@@ -628,32 +517,54 @@ void IndexBlockIter::SeekToLastImpl() {
628
517
  }
629
518
  status_ = Status::OK();
630
519
  SeekToRestartPoint(num_restarts_ - 1);
631
- cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
632
- while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
633
- ++cur_entry_idx_;
520
+ while (ParseNextIndexKey() && NextEntryOffset() < GetKeysEndOffset()) {
634
521
  }
635
522
  }
636
523
 
637
524
  template <class TValue>
638
- template <typename DecodeEntryFunc>
525
+ template <typename DecodeEntryFunc, bool StrictCheck>
639
526
  bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
640
527
  current_ = NextEntryOffset();
528
+ ++cur_entry_idx_;
641
529
  const char* p = data_ + current_;
642
- const char* limit = data_ + restarts_; // Restarts come right after data
530
+ const char* key_limit = data_ + GetKeysEndOffset();
643
531
 
644
- if (p >= limit) {
532
+ if (p >= key_limit) {
645
533
  // No more entries to return. Mark as invalid.
646
- current_ = restarts_;
534
+ current_ = GetKeysEndOffset();
647
535
  restart_index_ = num_restarts_;
648
536
  return false;
649
537
  }
538
+
650
539
  // Decode next entry
651
540
  uint32_t shared, non_shared, value_length;
652
- p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
541
+ uint32_t value_offset = 0;
542
+
543
+ assert(cur_entry_idx_ >= 0);
544
+ assert(values_section_ == nullptr || block_restart_interval_ > 0);
545
+ bool value_offset_encoded =
546
+ values_section_ && cur_entry_idx_ % block_restart_interval_ == 0;
547
+
548
+ auto p_old = p;
549
+ p = DecodeEntryFunc()(p, key_limit, &shared, &non_shared, &value_length,
550
+ value_offset_encoded ? &value_offset : nullptr);
551
+
653
552
  if (p == nullptr || raw_key_.Size() < shared) {
654
553
  CorruptionError();
655
554
  return false;
656
555
  } else {
556
+ if constexpr (StrictCheck) {
557
+ auto entry_length =
558
+ non_shared + (values_section_ == nullptr ? value_length : 0);
559
+ if (static_cast<uint32_t>(key_limit - p) < entry_length) {
560
+ CorruptionError();
561
+ return false;
562
+ }
563
+ }
564
+
565
+ assert(values_section_ == nullptr ||
566
+ cur_entry_idx_ % block_restart_interval_ != 0 || shared == 0);
567
+ entry_ = Slice(p_old, p - p_old + non_shared);
657
568
  if (shared == 0) {
658
569
  *is_shared = false;
659
570
  // If this key doesn't share any bytes with prev key, and no min timestamp
@@ -673,15 +584,36 @@ bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
673
584
  raw_key_.TrimAppend(shared, p, non_shared);
674
585
  }
675
586
  }
676
- value_ = Slice(p + non_shared, value_length);
587
+
677
588
  if (shared == 0) {
678
589
  while (restart_index_ + 1 < num_restarts_ &&
679
590
  GetRestartPoint(restart_index_ + 1) < current_) {
680
591
  ++restart_index_;
681
592
  }
682
593
  }
683
- // else we are in the middle of a restart interval and the restart_index_
684
- // thus has not changed
594
+
595
+ if (values_section_) {
596
+ if (value_offset_encoded) {
597
+ // Restart point, derive from offset
598
+ value_ = Slice(values_section_ + value_offset, value_length);
599
+ } else {
600
+ // Non-restart point, derive from previous value
601
+ assert(value_.data() >= values_section_);
602
+ value_ = Slice(value_.data() + value_.size(), value_length);
603
+ }
604
+
605
+ if constexpr (StrictCheck) {
606
+ if ((value_.data() + value_.size()) > data_ + restarts_) {
607
+ CorruptionError();
608
+ return false;
609
+ }
610
+ }
611
+ } else {
612
+ value_ = Slice(entry_.data() + entry_.size(), value_length);
613
+ // extend entry slice to contain value as well
614
+ entry_ = Slice(entry_.data(), entry_.size() + value_.size());
615
+ }
616
+ assert((value_.data() + value_.size()) <= data_ + restarts_);
685
617
  return true;
686
618
  }
687
619
  }
@@ -741,11 +673,17 @@ bool IndexBlockIter::ParseNextIndexKey() {
741
673
  void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
742
674
  Slice v(value_.data(), data_ + restarts_ - value_.data());
743
675
  // Delta encoding is used if `shared` != 0.
676
+ assert(!value_delta_encoded_ || value_.size() == 0);
744
677
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
745
678
  &v, have_first_key_,
746
679
  (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr);
747
680
  assert(decode_s.ok());
748
681
  value_ = Slice(value_.data(), v.data() - value_.data());
682
+ if (!values_section_ && value_delta_encoded_) {
683
+ assert(entry_.data() + entry_.size() == value_.data());
684
+ // values are inlined in the entry, so need to set next offset accordingly
685
+ entry_ = Slice(entry_.data(), entry_.size() + value_.size());
686
+ }
749
687
 
750
688
  if (global_seqno_state_ != nullptr) {
751
689
  // Overwrite sequence number the same way as in DataBlockIter.
@@ -783,8 +721,8 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
783
721
  // to follow it up with NextImpl() to position the iterator at the restart
784
722
  // key.
785
723
  SeekToRestartPoint(index);
786
- cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
787
724
  NextImpl();
725
+ assert(cur_entry_idx_ >= 0);
788
726
 
789
727
  if (!skip_linear_scan) {
790
728
  // Linear search (within restart block) for first key >= target
@@ -816,9 +754,28 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
816
754
  }
817
755
  }
818
756
 
819
- // Binary searches in restart array to find the starting restart point for the
820
- // linear scan, and stores it in `*index`. Assumes restart array does not
821
- // contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
757
+ // Get the key slice at a given restart point index.
758
+ template <class TValue>
759
+ template <typename DecodeKeyFunc>
760
+ bool BlockIter<TValue>::GetRestartKey(uint32_t index, Slice* key) {
761
+ uint32_t region_offset = GetRestartPoint(index);
762
+ uint32_t shared, non_shared, value_offset;
763
+ const char* key_ptr =
764
+ DecodeKeyFunc()(data_ + region_offset, data_ + restarts_, &shared,
765
+ &non_shared, values_section_ ? &value_offset : nullptr);
766
+ if (key_ptr == nullptr || (shared != 0)) {
767
+ CorruptionError();
768
+ return false;
769
+ }
770
+ *key = Slice(key_ptr, non_shared);
771
+ return true;
772
+ }
773
+
774
+ // Searches in restart array using binary search to find the starting restart
775
+ // point for the linear scan, and stores it in `*index`. Assumes restart array
776
+ // does not contain duplicate keys.
777
+ //
778
+ // It is guaranteed that the restart key at `*index + 1`
822
779
  // is strictly greater than `target` or does not exist (this can be used to
823
780
  // elide a comparison when linear scan reaches all the way to the next restart
824
781
  // key). Furthermore, `*skip_linear_scan` is set to indicate whether the
@@ -826,15 +783,15 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
826
783
  // compared again later.
827
784
  template <class TValue>
828
785
  template <typename DecodeKeyFunc>
829
- bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
830
- bool* skip_linear_scan) {
786
+ bool BlockIter<TValue>::BinarySeekRestartPointIndex(const Slice& target,
787
+ uint32_t* index,
788
+ bool* skip_linear_scan) {
831
789
  if (restarts_ == 0) {
832
790
  // SST files dedicated to range tombstones are written with index blocks
833
791
  // that have no keys while also having `num_restarts_ == 1`. This would
834
- // cause a problem for `BinarySeek()` as it'd try to access the first key
835
- // which does not exist. We identify such blocks by the offset at which
836
- // their restarts are stored, and return false to prevent any attempted
837
- // key accesses.
792
+ // cause a problem as we'd try to access the first key which does not exist.
793
+ // We identify such blocks by the offset at which their restarts are stored,
794
+ // and return false to prevent any attempted key accesses.
838
795
  return false;
839
796
  }
840
797
 
@@ -842,23 +799,24 @@ bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
842
799
  // Loop invariants:
843
800
  // - Restart key at index `left` is less than or equal to the target key. The
844
801
  // sentinel index `-1` is considered to have a key that is less than all
845
- // keys.
802
+ // keys. Doing this allows us to avoid a bounds check on left.
846
803
  // - Any restart keys after index `right` are strictly greater than the target
847
804
  // key.
848
- int64_t left = -1, right = num_restarts_ - 1;
805
+ int64_t left = -1;
806
+ int64_t right = num_restarts_ - 1;
807
+
849
808
  while (left != right) {
850
809
  // The `mid` is computed by rounding up so it lands in (`left`, `right`].
851
810
  int64_t mid = left + (right - left + 1) / 2;
852
- uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
853
- uint32_t shared, non_shared;
854
- const char* key_ptr = DecodeKeyFunc()(
855
- data_ + region_offset, data_ + restarts_, &shared, &non_shared);
856
- if (key_ptr == nullptr || (shared != 0)) {
857
- CorruptionError();
811
+ assert(left < mid && mid <= right);
812
+
813
+ Slice mid_key;
814
+ if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
858
815
  return false;
859
816
  }
860
- Slice mid_key(key_ptr, non_shared);
817
+
861
818
  UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
819
+
862
820
  int cmp = CompareCurrentKey(target);
863
821
  if (cmp < 0) {
864
822
  // Key at "mid" is smaller than "target". Therefore all
@@ -885,22 +843,317 @@ bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
885
843
  return true;
886
844
  }
887
845
 
846
+ // Similar effects to BinarySeekRestartPointIndex, except it uses a different
847
+ // algorithm to search for the restart point index (i.e. interpolation search).
848
+ // Interpolation search is typically more efficient for uniformly distributed
849
+ // datasets.
850
+ //
851
+ // Typically, interpolation search requires an integer "value". But because we
852
+ // are searching through variable length binary slices, we must estimate an
853
+ // integer value for each key. Currently, the value is set to be the first 8
854
+ // bytes (read big-endian) that do not share a prefix with the start and end
855
+ // key. As a side effect, this can really only be used with the
856
+ // BytewiseComparator().
857
+ template <class TValue>
858
+ template <typename DecodeKeyFunc>
859
+ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
860
+ const Slice& target, uint32_t* index, bool* skip_linear_scan) {
861
+ static constexpr int64_t kGuardLen = 8;
862
+ static constexpr uint64_t kMaxPoorSearches = 8;
863
+
864
+ if (restarts_ == 0) {
865
+ return false;
866
+ }
867
+
868
+ *skip_linear_scan = false;
869
+ // Currently it is assumed that comparator is always bytewise comparator, but
870
+ // it may also be useful to generalize to reverse bytewise in the future.
871
+ assert(icmp_.user_comparator() == BytewiseComparator());
872
+
873
+ int64_t left = -1;
874
+ int64_t right = num_restarts_ - 1;
875
+ size_t shared_user_prefix_len = 0;
876
+
877
+ Slice left_key;
878
+ Slice right_key;
879
+ Slice left_key_suffix;
880
+ Slice right_key_suffix;
881
+ Slice target_suffix = target;
882
+ bool seek_failed = false;
883
+ bool first_iter = true;
884
+ uint64_t left_val = 0;
885
+ uint64_t right_val = 0;
886
+ uint64_t target_val = 0;
887
+
888
+ // A poor search is when less than half the search space is reduced, because
889
+ // binary search would do better. When there are kMaxPoorSearches in a row,
890
+ // then fallback to binary search. This helps bound worst-case performance.
891
+ uint64_t continuous_poor_searches = 0;
892
+
893
+ // Loop invariants while not first iteration AND seek has not failed:
894
+ // - arr[usable_left] = left_key, arr[right] = right_key
895
+ // - left < mid <= right, and arr[left] < target < arr[right + 1]
896
+ //
897
+ // The first iteration is used as an early optimization to determine initial
898
+ // bounds, and whether target is within those bounds.
899
+ const bool is_user_key = raw_key_.IsUserKey();
900
+ const Slice target_user_key = is_user_key ? target : ExtractUserKey(target);
901
+ while (left != right) {
902
+ int64_t mid = 0;
903
+
904
+ // If either search window is small or we've had numerous bad guesses, then
905
+ // fallback to binary search
906
+ seek_failed = (right - left <= kGuardLen) ||
907
+ continuous_poor_searches >= kMaxPoorSearches;
908
+
909
+ if (!seek_failed) {
910
+ // Interpolation seek reads left and right boundaries anyways, so we can
911
+ // set left = 0. The invariant that left <= target is still held because
912
+ // we early exit if left > target for the first iteration.
913
+ const uint32_t usable_left =
914
+ static_cast<uint32_t>(std::max<int64_t>(left, 0));
915
+
916
+ // First iteration: decode both boundary keys and compute shared prefix.
917
+ if (first_iter) {
918
+ if (!GetRestartKey<DecodeKeyFunc>(usable_left, &left_key)) {
919
+ return false;
920
+ }
921
+
922
+ if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
923
+ &right_key)) {
924
+ return false;
925
+ }
926
+
927
+ // Compute the shared prefix length between the user key portions of
928
+ // the boundary keys. This is used to "normalize" the values calculated
929
+ // during interpolation search.
930
+ shared_user_prefix_len = left_key.difference_offset(right_key);
931
+ if (!is_user_key) {
932
+ // Ensure shared_user_prefix_len is only limited to user key. Suppose
933
+ // that the shared prefix of both keys are extended into the internal
934
+ // footer. If they are not the same user keys, then it is guaranteed
935
+ // left is the shorter one due to bytewise comparator. For reverse
936
+ // bytewise, this would be flipped.
937
+ shared_user_prefix_len = std::min<size_t>(
938
+ shared_user_prefix_len, left_key.size() - kNumInternalBytes);
939
+ assert(shared_user_prefix_len <=
940
+ right_key.size() - kNumInternalBytes);
941
+ }
942
+
943
+ left_val =
944
+ ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
945
+ right_val =
946
+ ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
947
+ target_val =
948
+ ReadBe64FromKey(target, is_user_key, shared_user_prefix_len);
949
+ }
950
+
951
+ assert(shared_user_prefix_len <= left_key.size() &&
952
+ shared_user_prefix_len <= right_key.size());
953
+
954
+ if (first_iter && shared_user_prefix_len > 0) {
955
+ // It is not guaranteed that the shared_prefix of the left and right
956
+ // boundaries is a valid prefix of the target. If it is not, then we can
957
+ // early exit.
958
+ size_t cmp_len =
959
+ std::min(target_user_key.size(), shared_user_prefix_len);
960
+ int cmp = memcmp(target_user_key.data(), left_key.data(), cmp_len);
961
+ if (cmp < 0 || (cmp == 0 && cmp_len < shared_user_prefix_len)) {
962
+ #ifndef NDEBUG
963
+ IterKey tmp_key;
964
+ tmp_key.SetIsUserKey(is_user_key);
965
+ UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
966
+ assert(CompareKey(tmp_key, target) >= 0);
967
+ #endif
968
+ // if target size is less than shared_prefix length, and cmp == 0,
969
+ // then it is guaranteed <= left
970
+ *skip_linear_scan = true;
971
+ *index = usable_left;
972
+ return true;
973
+ } else if (cmp > 0) {
974
+ #ifndef NDEBUG
975
+ IterKey tmp_key;
976
+ tmp_key.SetIsUserKey(is_user_key);
977
+ UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
978
+ assert(CompareKey(tmp_key, target) < 0);
979
+ #endif
980
+ *index = static_cast<uint32_t>(right);
981
+ return true;
982
+ }
983
+ }
984
+
985
+ assert(shared_user_prefix_len <= target_user_key.size());
986
+ assert(memcmp(left_key.data(), target_user_key.data(),
987
+ shared_user_prefix_len) == 0);
988
+ assert(memcmp(right_key.data(), target_user_key.data(),
989
+ shared_user_prefix_len) == 0);
990
+
991
+ if (first_iter) {
992
+ left_key_suffix = Slice(left_key.data() + shared_user_prefix_len,
993
+ left_key.size() - shared_user_prefix_len);
994
+ right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
995
+ right_key.size() - shared_user_prefix_len);
996
+ target_suffix = Slice(target.data() + shared_user_prefix_len,
997
+ target.size() - shared_user_prefix_len);
998
+ }
999
+
1000
+ if (left_val > right_val) {
1001
+ CorruptionError("left key is greater than right key");
1002
+ return false;
1003
+ }
1004
+
1005
+ bool lte_left = false;
1006
+ bool gt_right = false;
1007
+
1008
+ if (target_val < left_val) {
1009
+ assert(first_iter);
1010
+ assert(CompareKey(left_key_suffix, target_suffix) > 0);
1011
+ lte_left = true;
1012
+ } else if (target_val == left_val) {
1013
+ // target_val == left_val doesn't imply target == left_key
1014
+ // because ReadBe64FromKey only reads 8 bytes and skips sequence
1015
+ // numbers. We need to check actual key order.
1016
+ if (CompareKey(left_key_suffix, target_suffix) >= 0) {
1017
+ assert(first_iter);
1018
+ lte_left = true;
1019
+ }
1020
+ }
1021
+
1022
+ if (!lte_left && !seek_failed) {
1023
+ if (target_val > right_val) {
1024
+ // note that we only ever guarantee arr[target] < arr[right + 1], so
1025
+ // it is possible to end up here even on non-first iteration
1026
+ assert(CompareKey(right_key_suffix, target_suffix) < 0);
1027
+ gt_right = true;
1028
+ } else if (right_val == left_val) {
1029
+ // cannot divide by 0
1030
+ seek_failed = true;
1031
+ }
1032
+ }
1033
+
1034
+ // early exit if key is not within bounds
1035
+ if (lte_left) {
1036
+ #ifndef NDEBUG
1037
+ assert(!seek_failed);
1038
+ IterKey tmp_key;
1039
+ tmp_key.SetIsUserKey(is_user_key);
1040
+ UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
1041
+ assert(CompareKey(tmp_key, target) >= 0);
1042
+ #endif
1043
+ *skip_linear_scan = true;
1044
+ *index = usable_left;
1045
+ return true;
1046
+ }
1047
+ if (gt_right) {
1048
+ #ifndef NDEBUG
1049
+ assert(!seek_failed);
1050
+ IterKey tmp_key;
1051
+ tmp_key.SetIsUserKey(is_user_key);
1052
+ UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
1053
+ assert(CompareKey(tmp_key, target) < 0);
1054
+ #endif
1055
+ *index = static_cast<uint32_t>(right);
1056
+ return true;
1057
+ }
1058
+
1059
+ if (!seek_failed) {
1060
+ #ifdef HAVE_UINT128_EXTENSION
1061
+ __uint128_t range = right - usable_left;
1062
+ __uint128_t target_delta = target_val - left_val;
1063
+ uint64_t range_delta = right_val - left_val;
1064
+ int64_t offset =
1065
+ static_cast<int64_t>(range * target_delta / range_delta);
1066
+ #else
1067
+ double ratio = static_cast<double>(target_val - left_val) /
1068
+ static_cast<double>(right_val - left_val);
1069
+ assert(0 <= ratio && ratio <= 1);
1070
+ int64_t range = right - usable_left;
1071
+ int64_t offset = static_cast<int64_t>(range * ratio);
1072
+ #endif
1073
+ left = usable_left; // can reduce search space by 1
1074
+ mid = usable_left + offset;
1075
+ assert(mid <= right);
1076
+ if (mid == usable_left) {
1077
+ // this is to guarantee progress and avoid infinite loop
1078
+ ++mid;
1079
+ }
1080
+ }
1081
+ }
1082
+
1083
+ if (seek_failed) {
1084
+ // Fallback to binary seek
1085
+ mid = left + (right - left + 1) / 2;
1086
+ }
1087
+
1088
+ assert(left < mid && mid <= right);
1089
+
1090
+ Slice mid_key;
1091
+ if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
1092
+ return false;
1093
+ }
1094
+
1095
+ Slice mid_key_suffix(mid_key.data() + shared_user_prefix_len,
1096
+ mid_key.size() - shared_user_prefix_len);
1097
+
1098
+ UpdateRawKeyAndMaybePadMinTimestamp(mid_key_suffix);
1099
+ int cmp = CompareCurrentKey(target_suffix);
1100
+
1101
+ int64_t previous_search_space = right - left;
1102
+ if (cmp < 0) {
1103
+ left = mid;
1104
+ left_key = mid_key;
1105
+ left_key_suffix = mid_key_suffix;
1106
+ left_val = ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
1107
+ } else if (cmp > 0) {
1108
+ right = mid - 1;
1109
+ if (!seek_failed && left != right) {
1110
+ if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
1111
+ &right_key)) {
1112
+ return false;
1113
+ }
1114
+ right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
1115
+ right_key.size() - shared_user_prefix_len);
1116
+ right_val =
1117
+ ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
1118
+ }
1119
+ } else {
1120
+ *skip_linear_scan = true;
1121
+ left = right = mid;
1122
+ }
1123
+
1124
+ // If search space is not reduced by at least half, good chance this data is
1125
+ // not uniform.
1126
+ int64_t new_search_space = right - left;
1127
+ if (new_search_space > previous_search_space / 2) {
1128
+ ++continuous_poor_searches;
1129
+ } else {
1130
+ continuous_poor_searches = 0;
1131
+ }
1132
+
1133
+ first_iter = false;
1134
+ }
1135
+
1136
+ if (left == -1) {
1137
+ // All keys in the block were strictly greater than `target`. So the very
1138
+ // first key in the block is the final seek result.
1139
+ *skip_linear_scan = true;
1140
+ *index = 0;
1141
+ } else {
1142
+ *index = static_cast<uint32_t>(left);
1143
+ }
1144
+ return true;
1145
+ }
1146
+
888
1147
  // Compare target key and the block key of the block of `block_index`.
889
1148
  // Return -1 if error.
890
1149
  int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
891
- uint32_t region_offset = GetRestartPoint(block_index);
892
- uint32_t shared, non_shared;
893
- const char* key_ptr =
894
- value_delta_encoded_
895
- ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
896
- &non_shared)
897
- : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
898
- &non_shared);
899
- if (key_ptr == nullptr || (shared != 0)) {
900
- CorruptionError();
1150
+ Slice block_key;
1151
+ bool ok = value_delta_encoded_
1152
+ ? GetRestartKey<DecodeKeyV4>(block_index, &block_key)
1153
+ : GetRestartKey<DecodeKey>(block_index, &block_key);
1154
+ if (!ok) {
901
1155
  return 1; // Return target is smaller
902
1156
  }
903
- Slice block_key(key_ptr, non_shared);
904
1157
  UpdateRawKeyAndMaybePadMinTimestamp(block_key);
905
1158
  return CompareCurrentKey(target);
906
1159
  }
@@ -949,7 +1202,7 @@ bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
949
1202
  if (block_ids[left] > 0 &&
950
1203
  (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
951
1204
  CompareBlockKey(block_ids[left] - 1, target) > 0) {
952
- current_ = restarts_;
1205
+ current_ = GetKeysEndOffset();
953
1206
  *prefix_may_exist = false;
954
1207
  return false;
955
1208
  }
@@ -986,7 +1239,7 @@ bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
986
1239
  }
987
1240
 
988
1241
  // Mark iterator invalid
989
- current_ = restarts_;
1242
+ current_ = GetKeysEndOffset();
990
1243
  return false;
991
1244
  }
992
1245
  }
@@ -1005,7 +1258,7 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
1005
1258
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
1006
1259
 
1007
1260
  if (num_blocks == 0) {
1008
- current_ = restarts_;
1261
+ current_ = GetKeysEndOffset();
1009
1262
  *prefix_may_exist = false;
1010
1263
  return false;
1011
1264
  } else {
@@ -1015,39 +1268,12 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
1015
1268
  }
1016
1269
  }
1017
1270
 
1018
- uint32_t Block::NumRestarts() const {
1019
- assert(size() >= 2 * sizeof(uint32_t));
1020
- uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
1021
- uint32_t num_restarts = block_footer;
1022
- if (size() > kMaxBlockSizeSupportedByHashIndex) {
1023
- // In BlockBuilder, we have ensured a block with HashIndex is less than
1024
- // kMaxBlockSizeSupportedByHashIndex (64KiB).
1025
- //
1026
- // Therefore, if we encounter a block with a size > 64KiB, the block
1027
- // cannot have HashIndex. So the footer will directly interpreted as
1028
- // num_restarts.
1029
- //
1030
- // Such check is for backward compatibility. We can ensure legacy block
1031
- // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
1032
- // correctly as no HashIndex even if the MSB of num_restarts is set.
1033
- return num_restarts;
1034
- }
1035
- BlockBasedTableOptions::DataBlockIndexType index_type;
1036
- UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
1037
- return num_restarts;
1038
- }
1039
-
1040
1271
  BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
1041
- assert(size() >= 2 * sizeof(uint32_t));
1042
- if (size() > kMaxBlockSizeSupportedByHashIndex) {
1043
- // The check is for the same reason as that in NumRestarts()
1044
- return BlockBasedTableOptions::kDataBlockBinarySearch;
1045
- }
1046
- uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
1047
- uint32_t num_restarts = block_footer;
1048
- BlockBasedTableOptions::DataBlockIndexType index_type;
1049
- UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
1050
- return index_type;
1272
+ assert(size() >= DataBlockFooter::kMinEncodedLength);
1273
+ Slice input(data(), size());
1274
+ DataBlockFooter footer;
1275
+ footer.DecodeFrom(&input).PermitUncheckedError();
1276
+ return footer.index_type;
1051
1277
  }
1052
1278
 
1053
1279
  Block::~Block() {
@@ -1057,51 +1283,83 @@ Block::~Block() {
1057
1283
  delete[] kv_checksum_;
1058
1284
  }
1059
1285
 
1286
+ Status Block::GetCorruptionStatus() const {
1287
+ // Re-process the footer to get a detailed error status.
1288
+ // This should only be called when size() == 0 (error marker).
1289
+ assert(size() == 0);
1290
+ // When size() == 0 and restart_offset_ != 0, restart_offset_ stores the
1291
+ // original data size for re-decoding the footer to get detailed error.
1292
+ if (restart_offset_ == 0) {
1293
+ return Status::Corruption("bad block contents");
1294
+ }
1295
+ Slice input(contents_.data.data(), restart_offset_);
1296
+ DataBlockFooter footer;
1297
+ Status s = footer.DecodeFrom(&input);
1298
+ if (!s.ok()) {
1299
+ return s; // Return the detailed error from DecodeFrom
1300
+ }
1301
+ // Footer decoded OK, so error was in later processing (shouldn't happen)
1302
+ DEBUG_FAIL("ok status on presumed bad block contents");
1303
+ return Status::Corruption("presumed bad block contents");
1304
+ }
1305
+
1060
1306
  Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
1061
- Statistics* statistics)
1062
- : contents_(std::move(contents)), restart_offset_(0), num_restarts_(0) {
1307
+ Statistics* statistics, uint32_t restart_interval)
1308
+ : contents_(std::move(contents)),
1309
+ restart_offset_(0),
1310
+ num_restarts_(0),
1311
+ block_restart_interval_(restart_interval) {
1063
1312
  TEST_SYNC_POINT("Block::Block:0");
1064
1313
  auto& size = contents_.data.size_;
1065
- if (size < sizeof(uint32_t)) {
1314
+ // `contents` is assumed to be uncompressed in the proper format
1315
+ Slice input(contents_.data.data(), size);
1316
+ DataBlockFooter footer;
1317
+ Status s = footer.DecodeFrom(&input);
1318
+ if (!s.ok()) {
1319
+ // Save original size for GetCorruptionStatus() to re-decode footer
1320
+ restart_offset_ = static_cast<uint32_t>(size);
1066
1321
  size = 0; // Error marker
1067
1322
  } else {
1068
- // Should only decode restart points for uncompressed blocks
1069
- num_restarts_ = NumRestarts();
1070
- switch (IndexType()) {
1323
+ // After DecodeFrom, input has the footer (and values_section_offset if
1324
+ // separated_kv) removed. Each case below may strip additional suffix
1325
+ // (e.g., hash index) so that input ends with just the restart array.
1326
+ num_restarts_ = footer.num_restarts;
1327
+ is_uniform_ = footer.is_uniform;
1328
+ switch (footer.index_type) {
1071
1329
  case BlockBasedTableOptions::kDataBlockBinarySearch:
1072
- restart_offset_ = static_cast<uint32_t>(size) -
1073
- (1 + num_restarts_) * sizeof(uint32_t);
1074
- if (restart_offset_ > size - sizeof(uint32_t)) {
1075
- // The size is too small for NumRestarts() and therefore
1076
- // restart_offset_ wrapped around.
1077
- size = 0;
1078
- }
1079
1330
  break;
1080
1331
  case BlockBasedTableOptions::kDataBlockBinaryAndHash:
1081
- if (size < sizeof(uint32_t) /* block footer */ +
1082
- sizeof(uint16_t) /* NUM_BUCK */) {
1332
+ if (input.size() < sizeof(uint16_t) /* NUM_BUCK */) {
1083
1333
  size = 0;
1084
1334
  break;
1085
1335
  }
1086
-
1087
1336
  uint16_t map_offset;
1088
- data_block_hash_index_.Initialize(
1089
- contents_.data.data(),
1090
- /* chop off NUM_RESTARTS */
1091
- static_cast<uint16_t>(size - sizeof(uint32_t)), &map_offset);
1092
-
1093
- restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
1094
-
1095
- if (restart_offset_ > map_offset) {
1096
- // map_offset is too small for NumRestarts() and
1097
- // therefore restart_offset_ wrapped around.
1098
- size = 0;
1099
- break;
1100
- }
1337
+ data_block_hash_index_.Initialize(contents_.data.data(),
1338
+ static_cast<uint16_t>(input.size()),
1339
+ &map_offset);
1340
+ // Strip the hash index, leaving just data + restarts
1341
+ input.remove_suffix(input.size() - map_offset);
1101
1342
  break;
1102
1343
  default:
1103
1344
  size = 0; // Error marker
1104
1345
  }
1346
+ // After the switch, input should end with restarts[num_restarts_]
1347
+ if (size != 0) {
1348
+ if (input.size() < num_restarts_ * sizeof(uint32_t)) {
1349
+ size = 0; // Block too small for the declared number of restarts
1350
+ } else {
1351
+ restart_offset_ = static_cast<uint32_t>(input.size()) -
1352
+ num_restarts_ * sizeof(uint32_t);
1353
+ }
1354
+ }
1355
+ // Set up values_section_ from footer if separated KV storage is used
1356
+ if (size != 0 && footer.separated_kv) {
1357
+ if (footer.values_section_offset > restart_offset_) {
1358
+ size = 0; // Error marker
1359
+ } else {
1360
+ values_section_ = data() + footer.values_section_offset;
1361
+ }
1362
+ }
1105
1363
  }
1106
1364
  if (read_amp_bytes_per_bit != 0 && statistics && size != 0) {
1107
1365
  read_amp_bitmap_.reset(new BlockReadAmpBitmap(
@@ -1125,7 +1383,10 @@ void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
1125
1383
  nullptr /* stats */, true /* block_contents_pinned */,
1126
1384
  true /* user_defined_timestamps_persisted */)};
1127
1385
  if (iter->status().ok()) {
1128
- block_restart_interval_ = iter->GetRestartInterval();
1386
+ // Only calculate restart interval if not already set via table properties
1387
+ if (block_restart_interval_ == 0) {
1388
+ block_restart_interval_ = iter->GetRestartInterval();
1389
+ }
1129
1390
  }
1130
1391
  uint32_t num_keys = 0;
1131
1392
  if (iter->status().ok()) {
@@ -1158,12 +1419,12 @@ void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
1158
1419
  bool index_has_first_key) {
1159
1420
  protection_bytes_per_key_ = 0;
1160
1421
  if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
1161
- // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They
1162
- // do not impact how the index block is parsed. During checksum
1422
+ // Note that `global_seqno` and `key_includes_seq` are hardcoded here.
1423
+ // They do not impact how the index block is parsed. During checksum
1163
1424
  // construction/verification, we use the entire key buffer from
1164
- // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value
1165
- // checksum, and the content of this buffer do not change for different
1166
- // values of `global_seqno` or `key_includes_seq`.
1425
+ // raw_key_.GetKey() returned by iter->key() as the `key` part of
1426
+ // key-value checksum, and the content of this buffer does not change for
1427
+ // different values of `global_seqno` or `key_includes_seq`.
1167
1428
  // TODO(yuzhangyu): handle the implication of padding timestamp for kv
1168
1429
  // protection.
1169
1430
  std::unique_ptr<IndexBlockIter> iter{NewIndexIterator(
@@ -1174,7 +1435,10 @@ void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
1174
1435
  true /* user_defined_timestamps_persisted*/,
1175
1436
  nullptr /* prefix_index */)};
1176
1437
  if (iter->status().ok()) {
1177
- block_restart_interval_ = iter->GetRestartInterval();
1438
+ // Only calculate restart interval if not already set via table properties
1439
+ if (block_restart_interval_ == 0) {
1440
+ block_restart_interval_ = iter->GetRestartInterval();
1441
+ }
1178
1442
  }
1179
1443
  uint32_t num_keys = 0;
1180
1444
  if (iter->status().ok()) {
@@ -1238,7 +1502,7 @@ void Block::InitializeMetaIndexBlockProtectionInfo(
1238
1502
  MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
1239
1503
  MetaBlockIter* iter = new MetaBlockIter();
1240
1504
  if (size() < 2 * sizeof(uint32_t)) {
1241
- iter->Invalidate(Status::Corruption("bad block contents"));
1505
+ iter->Invalidate(GetCorruptionStatus());
1242
1506
  return iter;
1243
1507
  } else if (num_restarts_ == 0) {
1244
1508
  // Empty block.
@@ -1246,7 +1510,7 @@ MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
1246
1510
  } else {
1247
1511
  iter->Initialize(data(), restart_offset_, num_restarts_,
1248
1512
  block_contents_pinned, protection_bytes_per_key_,
1249
- kv_checksum_, block_restart_interval_);
1513
+ kv_checksum_, block_restart_interval_, values_section_);
1250
1514
  }
1251
1515
  return iter;
1252
1516
  }
@@ -1263,7 +1527,7 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
1263
1527
  ret_iter = new DataBlockIter;
1264
1528
  }
1265
1529
  if (size() < 2 * sizeof(uint32_t)) {
1266
- ret_iter->Invalidate(Status::Corruption("bad block contents"));
1530
+ ret_iter->Invalidate(GetCorruptionStatus());
1267
1531
  return ret_iter;
1268
1532
  }
1269
1533
  if (num_restarts_ == 0) {
@@ -1276,10 +1540,12 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
1276
1540
  read_amp_bitmap_.get(), block_contents_pinned,
1277
1541
  user_defined_timestamps_persisted,
1278
1542
  data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
1279
- protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
1543
+ protection_bytes_per_key_, kv_checksum_, block_restart_interval_,
1544
+ values_section_);
1280
1545
  if (read_amp_bitmap_) {
1281
1546
  if (read_amp_bitmap_->GetStatistics() != stats) {
1282
- // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
1547
+ // DB changed the Statistics pointer, we need to notify
1548
+ // read_amp_bitmap_
1283
1549
  read_amp_bitmap_->SetStatistics(stats);
1284
1550
  }
1285
1551
  }
@@ -1293,7 +1559,8 @@ IndexBlockIter* Block::NewIndexIterator(
1293
1559
  IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
1294
1560
  bool have_first_key, bool key_includes_seq, bool value_is_full,
1295
1561
  bool block_contents_pinned, bool user_defined_timestamps_persisted,
1296
- BlockPrefixIndex* prefix_index) {
1562
+ BlockPrefixIndex* prefix_index,
1563
+ BlockBasedTableOptions::BlockSearchType index_block_search_type) {
1297
1564
  IndexBlockIter* ret_iter;
1298
1565
  if (iter != nullptr) {
1299
1566
  ret_iter = iter;
@@ -1301,7 +1568,7 @@ IndexBlockIter* Block::NewIndexIterator(
1301
1568
  ret_iter = new IndexBlockIter;
1302
1569
  }
1303
1570
  if (size() < 2 * sizeof(uint32_t)) {
1304
- ret_iter->Invalidate(Status::Corruption("bad block contents"));
1571
+ ret_iter->Invalidate(GetCorruptionStatus());
1305
1572
  return ret_iter;
1306
1573
  }
1307
1574
  if (num_restarts_ == 0) {
@@ -1311,11 +1578,23 @@ IndexBlockIter* Block::NewIndexIterator(
1311
1578
  } else {
1312
1579
  BlockPrefixIndex* prefix_index_ptr =
1313
1580
  total_order_seek ? nullptr : prefix_index;
1581
+
1582
+ // Resolve kAuto to a concrete search type based on the block's
1583
+ // uniformity flag. Interpolation search requires bytewise comparator;
1584
+ // fall back to binary search otherwise.
1585
+ auto resolved_search_type = index_block_search_type;
1586
+ if (resolved_search_type == BlockBasedTableOptions::kAuto) {
1587
+ resolved_search_type = (is_uniform_ && raw_ucmp == BytewiseComparator())
1588
+ ? BlockBasedTableOptions::kInterpolation
1589
+ : BlockBasedTableOptions::kBinary;
1590
+ }
1591
+
1314
1592
  ret_iter->Initialize(
1315
1593
  raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
1316
1594
  prefix_index_ptr, have_first_key, key_includes_seq, value_is_full,
1317
1595
  block_contents_pinned, user_defined_timestamps_persisted,
1318
- protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
1596
+ protection_bytes_per_key_, kv_checksum_, block_restart_interval_,
1597
+ values_section_, resolved_search_type);
1319
1598
  }
1320
1599
 
1321
1600
  return ret_iter;