@nxtedition/rocksdb 15.4.1 → 16.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (401)
  1. package/binding.cc +70 -23
  2. package/deps/rocksdb/rocksdb/.clang-tidy +86 -0
  3. package/deps/rocksdb/rocksdb/BUCK +42 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +11 -0
  5. package/deps/rocksdb/rocksdb/Makefile +59 -32
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +0 -5
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +9 -9
  8. package/deps/rocksdb/rocksdb/cache/cache_key.cc +3 -3
  9. package/deps/rocksdb/rocksdb/cache/cache_key.h +5 -5
  10. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +16 -16
  11. package/deps/rocksdb/rocksdb/cache/cache_test.cc +1 -1
  12. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +258 -294
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.h +98 -49
  14. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -5
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +2 -3
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +18 -18
  17. package/deps/rocksdb/rocksdb/crash_test.mk +5 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +23 -22
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +6 -1
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +14 -16
  21. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +38 -26
  22. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +5 -1
  23. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +101 -18
  24. package/deps/rocksdb/rocksdb/db/blob/blob_index.h +12 -0
  25. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +6 -9
  26. package/deps/rocksdb/rocksdb/db/builder.cc +23 -0
  27. package/deps/rocksdb/rocksdb/db/builder.h +7 -0
  28. package/deps/rocksdb/rocksdb/db/c.cc +373 -57
  29. package/deps/rocksdb/rocksdb/db/c_test.c +101 -1
  30. package/deps/rocksdb/rocksdb/db/column_family.cc +31 -3
  31. package/deps/rocksdb/rocksdb/db/column_family_test.cc +10 -13
  32. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +35 -48
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +13 -5
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +201 -39
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +15 -10
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc +7 -7
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +2 -455
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +4 -2
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +19 -0
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +72 -9
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +12 -10
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +405 -83
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +25 -1
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +23 -10
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +1 -0
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +1410 -106
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -5
  48. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +2 -1
  49. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +19 -10
  50. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +505 -45
  51. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  52. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +9 -1
  53. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +4 -4
  54. package/deps/rocksdb/rocksdb/db/comparator_db_test.cc +7 -9
  55. package/deps/rocksdb/rocksdb/db/convenience.cc +4 -4
  56. package/deps/rocksdb/rocksdb/db/convenience_impl.h +2 -1
  57. package/deps/rocksdb/rocksdb/db/corruption_test.cc +60 -88
  58. package/deps/rocksdb/rocksdb/db/cuckoo_table_db_test.cc +10 -12
  59. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +471 -40
  60. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +116 -2
  61. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +5 -15
  62. package/deps/rocksdb/rocksdb/db/db_compaction_abort_test.cc +993 -0
  63. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +329 -29
  64. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +155 -13
  65. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +54 -31
  66. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +1 -0
  67. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +232 -70
  68. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +57 -9
  69. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +224 -31
  70. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  71. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +4 -2
  72. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +1 -1
  73. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +1 -0
  74. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +164 -8
  75. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -0
  76. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h +5 -0
  77. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +47 -35
  78. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +22 -9
  79. package/deps/rocksdb/rocksdb/db/db_iter.cc +9 -0
  80. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +371 -6
  81. package/deps/rocksdb/rocksdb/db/db_log_iter_test.cc +7 -5
  82. package/deps/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc +22 -23
  83. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +0 -2
  84. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +4 -4
  85. package/deps/rocksdb/rocksdb/db/db_options_test.cc +40 -0
  86. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +32 -13
  87. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +1 -1
  88. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +4 -4
  89. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +68 -15
  90. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +1 -1
  91. package/deps/rocksdb/rocksdb/db/db_statistics_test.cc +2 -3
  92. package/deps/rocksdb/rocksdb/db/db_table_properties_test.cc +6 -21
  93. package/deps/rocksdb/rocksdb/db/db_test.cc +644 -128
  94. package/deps/rocksdb/rocksdb/db/db_test2.cc +198 -81
  95. package/deps/rocksdb/rocksdb/db/db_test_util.cc +35 -10
  96. package/deps/rocksdb/rocksdb/db/db_test_util.h +8 -2
  97. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +36 -32
  98. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +11 -7
  99. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +499 -0
  100. package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +284 -20
  101. package/deps/rocksdb/rocksdb/db/db_write_test.cc +3 -3
  102. package/deps/rocksdb/rocksdb/db/dbformat.h +0 -5
  103. package/deps/rocksdb/rocksdb/db/error_handler.cc +24 -0
  104. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +12 -14
  105. package/deps/rocksdb/rocksdb/db/experimental.cc +13 -10
  106. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +1 -1
  107. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +22 -3
  108. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +21 -15
  109. package/deps/rocksdb/rocksdb/db/fault_injection_test.cc +4 -6
  110. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -3
  111. package/deps/rocksdb/rocksdb/db/forward_iterator_bench.cc +5 -6
  112. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +4 -2
  113. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +17 -17
  114. package/deps/rocksdb/rocksdb/db/internal_stats.cc +13 -0
  115. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -0
  116. package/deps/rocksdb/rocksdb/db/listener_test.cc +154 -27
  117. package/deps/rocksdb/rocksdb/db/manual_compaction_test.cc +6 -6
  118. package/deps/rocksdb/rocksdb/db/memtable.cc +197 -51
  119. package/deps/rocksdb/rocksdb/db/memtable.h +6 -0
  120. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +3 -4
  121. package/deps/rocksdb/rocksdb/db/merge_test.cc +37 -35
  122. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +2 -1
  123. package/deps/rocksdb/rocksdb/db/options_file_test.cc +4 -4
  124. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +9 -11
  125. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler.cc +10 -1
  126. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +292 -15
  127. package/deps/rocksdb/rocksdb/db/plain_table_db_test.cc +10 -17
  128. package/deps/rocksdb/rocksdb/db/prefix_test.cc +6 -8
  129. package/deps/rocksdb/rocksdb/db/repair.cc +10 -10
  130. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +5 -5
  131. package/deps/rocksdb/rocksdb/db/table_cache.cc +142 -135
  132. package/deps/rocksdb/rocksdb/db/table_cache.h +30 -6
  133. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -7
  134. package/deps/rocksdb/rocksdb/db/version_builder.cc +11 -50
  135. package/deps/rocksdb/rocksdb/db/version_builder.h +2 -1
  136. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +2 -1
  137. package/deps/rocksdb/rocksdb/db/version_edit.cc +51 -2
  138. package/deps/rocksdb/rocksdb/db/version_edit.h +91 -29
  139. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +7 -7
  140. package/deps/rocksdb/rocksdb/db/version_set.cc +211 -50
  141. package/deps/rocksdb/rocksdb/db/version_set.h +40 -3
  142. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +5 -0
  143. package/deps/rocksdb/rocksdb/db/version_set_test.cc +294 -21
  144. package/deps/rocksdb/rocksdb/db/version_util.cc +96 -0
  145. package/deps/rocksdb/rocksdb/db/version_util.h +24 -0
  146. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +5 -5
  147. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +647 -31
  148. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +219 -1
  149. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +549 -12
  150. package/deps/rocksdb/rocksdb/db/write_callback_test.cc +3 -3
  151. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +1 -1
  152. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +19 -0
  153. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +21 -4
  154. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h +32 -0
  155. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +74 -22
  156. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +9 -0
  157. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +143 -61
  158. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +15 -2
  159. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +76 -2
  160. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +92 -72
  161. package/deps/rocksdb/rocksdb/env/env.cc +1 -0
  162. package/deps/rocksdb/rocksdb/env/env_test.cc +365 -2
  163. package/deps/rocksdb/rocksdb/env/fs_posix.cc +31 -30
  164. package/deps/rocksdb/rocksdb/env/io_posix.cc +8 -11
  165. package/deps/rocksdb/rocksdb/env/io_posix.h +30 -1
  166. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  167. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +1 -1
  168. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +108 -0
  169. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +32 -4
  170. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +4 -4
  171. package/deps/rocksdb/rocksdb/file/file_util.cc +8 -2
  172. package/deps/rocksdb/rocksdb/file/file_util.h +2 -1
  173. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +331 -12
  174. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +52 -35
  175. package/deps/rocksdb/rocksdb/folly.mk +22 -5
  176. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +1 -1
  177. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +100 -54
  178. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +67 -2
  179. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +149 -13
  180. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +1 -12
  181. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +78 -97
  182. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +3 -3
  183. package/deps/rocksdb/rocksdb/include/rocksdb/external_table.h +2 -2
  184. package/deps/rocksdb/rocksdb/include/rocksdb/file_checksum.h +5 -0
  185. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +17 -2
  186. package/deps/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h +1 -1
  187. package/deps/rocksdb/rocksdb/include/rocksdb/io_dispatcher.h +358 -0
  188. package/deps/rocksdb/rocksdb/include/rocksdb/iostats_context.h +13 -0
  189. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +43 -0
  190. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +20 -0
  191. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +63 -21
  192. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +10 -1
  193. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +1 -1
  194. package/deps/rocksdb/rocksdb/include/rocksdb/slice_transform.h +2 -7
  195. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_reader.h +13 -0
  196. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +3 -14
  197. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +49 -9
  198. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +8 -0
  199. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +77 -6
  200. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +15 -0
  201. package/deps/rocksdb/rocksdb/include/rocksdb/tool_hooks.h +16 -10
  202. package/deps/rocksdb/rocksdb/include/rocksdb/unique_id.h +5 -5
  203. package/deps/rocksdb/rocksdb/include/rocksdb/universal_compaction.h +2 -4
  204. package/deps/rocksdb/rocksdb/include/rocksdb/user_defined_index.h +106 -46
  205. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h +1 -1
  206. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +14 -1
  207. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/memory_util.h +5 -1
  208. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +2 -1
  209. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -9
  210. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  211. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +1 -2
  212. package/deps/rocksdb/rocksdb/memory/memory_allocator_test.cc +2 -2
  213. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +226 -8
  214. package/deps/rocksdb/rocksdb/memtable/inlineskiplist_test.cc +490 -0
  215. package/deps/rocksdb/rocksdb/memtable/skiplist.h +3 -3
  216. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +11 -0
  217. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +4 -12
  218. package/deps/rocksdb/rocksdb/microbench/ribbon_bench.cc +5 -5
  219. package/deps/rocksdb/rocksdb/monitoring/file_read_sample.h +21 -4
  220. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +9 -3
  221. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +21 -2
  222. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +2 -2
  223. package/deps/rocksdb/rocksdb/options/cf_options.cc +21 -1
  224. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  225. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -2
  226. package/deps/rocksdb/rocksdb/options/db_options.cc +26 -5
  227. package/deps/rocksdb/rocksdb/options/db_options.h +3 -1
  228. package/deps/rocksdb/rocksdb/options/options.cc +5 -1
  229. package/deps/rocksdb/rocksdb/options/options_helper.cc +7 -2
  230. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +109 -103
  231. package/deps/rocksdb/rocksdb/options/options_test.cc +14 -0
  232. package/deps/rocksdb/rocksdb/port/jemalloc_helper.h +15 -17
  233. package/deps/rocksdb/rocksdb/port/lang.h +4 -0
  234. package/deps/rocksdb/rocksdb/port/port_example.h +0 -23
  235. package/deps/rocksdb/rocksdb/port/stack_trace.cc +36 -0
  236. package/deps/rocksdb/rocksdb/port/stack_trace.h +9 -0
  237. package/deps/rocksdb/rocksdb/src.mk +12 -0
  238. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +1 -2
  239. package/deps/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc +2 -1
  240. package/deps/rocksdb/rocksdb/table/block_based/block.cc +571 -292
  241. package/deps/rocksdb/rocksdb/table/block_based/block.h +143 -53
  242. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +154 -90
  243. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +5 -1
  244. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +51 -14
  245. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.h +0 -2
  246. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +147 -734
  247. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +30 -233
  248. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +178 -108
  249. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +13 -0
  250. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +17 -4
  251. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -2
  252. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +70 -0
  253. package/deps/rocksdb/rocksdb/table/block_based/block_builder.cc +168 -24
  254. package/deps/rocksdb/rocksdb/table/block_based/block_builder.h +25 -9
  255. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +7 -4
  256. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +9 -2
  257. package/deps/rocksdb/rocksdb/table/block_based/block_test.cc +548 -169
  258. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +30 -0
  259. package/deps/rocksdb/rocksdb/table/block_based/block_util.h +156 -0
  260. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.cc +73 -30
  261. package/deps/rocksdb/rocksdb/table/block_based/data_block_footer.h +74 -7
  262. package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index.h +1 -1
  263. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +20 -14
  264. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +22 -12
  265. package/deps/rocksdb/rocksdb/table/block_based/mock_block_based_table.h +1 -1
  266. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.cc +332 -0
  267. package/deps/rocksdb/rocksdb/table/block_based/multi_scan_index_iterator.h +133 -0
  268. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +4 -2
  269. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +1 -1
  270. package/deps/rocksdb/rocksdb/table/block_based/reader_common.cc +3 -2
  271. package/deps/rocksdb/rocksdb/table/block_based/reader_common.h +4 -1
  272. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h +0 -1
  273. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +126 -46
  274. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +31 -3
  275. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +1 -2
  276. package/deps/rocksdb/rocksdb/table/cleanable_test.cc +3 -1
  277. package/deps/rocksdb/rocksdb/table/external_table.cc +25 -4
  278. package/deps/rocksdb/rocksdb/table/format.cc +27 -15
  279. package/deps/rocksdb/rocksdb/table/format.h +41 -15
  280. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +1 -0
  281. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +22 -12
  282. package/deps/rocksdb/rocksdb/table/meta_blocks.h +0 -1
  283. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +7 -21
  284. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +0 -1
  285. package/deps/rocksdb/rocksdb/table/sst_file_reader.cc +88 -13
  286. package/deps/rocksdb/rocksdb/table/sst_file_reader_test.cc +53 -42
  287. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +3 -12
  288. package/deps/rocksdb/rocksdb/table/table_builder.h +0 -4
  289. package/deps/rocksdb/rocksdb/table/table_properties.cc +18 -0
  290. package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -3
  291. package/deps/rocksdb/rocksdb/table/table_test.cc +848 -172
  292. package/deps/rocksdb/rocksdb/table/unique_id.cc +24 -20
  293. package/deps/rocksdb/rocksdb/table/unique_id_impl.h +8 -8
  294. package/deps/rocksdb/rocksdb/test_util/sync_point.h +5 -4
  295. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -1
  296. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -2
  297. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +2 -1
  298. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +238 -120
  299. package/deps/rocksdb/rocksdb/tools/db_repl_stress.cc +2 -2
  300. package/deps/rocksdb/rocksdb/tools/db_sanity_test.cc +2 -4
  301. package/deps/rocksdb/rocksdb/tools/dump/db_dump_tool.cc +4 -8
  302. package/deps/rocksdb/rocksdb/tools/dump/rocksdb_undump.cc +1 -1
  303. package/deps/rocksdb/rocksdb/tools/io_tracer_parser_test.cc +2 -3
  304. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +82 -20
  305. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +41 -47
  306. package/deps/rocksdb/rocksdb/tools/ldb_tool.cc +9 -0
  307. package/deps/rocksdb/rocksdb/tools/reduce_levels_test.cc +5 -6
  308. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +1 -1
  309. package/deps/rocksdb/rocksdb/tools/tool_hooks.cc +6 -5
  310. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +4 -4
  311. package/deps/rocksdb/rocksdb/tools/write_stress.cc +1 -3
  312. package/deps/rocksdb/rocksdb/util/atomic.h +30 -23
  313. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +6 -7
  314. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +3 -3
  315. package/deps/rocksdb/rocksdb/util/bit_fields.h +68 -46
  316. package/deps/rocksdb/rocksdb/util/bloom_impl.h +16 -16
  317. package/deps/rocksdb/rocksdb/util/coding.h +14 -27
  318. package/deps/rocksdb/rocksdb/util/compression.cc +365 -207
  319. package/deps/rocksdb/rocksdb/util/compression.h +16 -1298
  320. package/deps/rocksdb/rocksdb/util/compression_test.cc +347 -61
  321. package/deps/rocksdb/rocksdb/util/crc32c_arm64.cc +8 -9
  322. package/deps/rocksdb/rocksdb/util/crc32c_arm64.h +1 -1
  323. package/deps/rocksdb/rocksdb/util/crc32c_ppc.h +1 -1
  324. package/deps/rocksdb/rocksdb/util/dynamic_bloom_test.cc +3 -3
  325. package/deps/rocksdb/rocksdb/util/filter_bench.cc +18 -18
  326. package/deps/rocksdb/rocksdb/util/gflags_compat.h +3 -3
  327. package/deps/rocksdb/rocksdb/util/hash_test.cc +19 -7
  328. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.cc +1099 -0
  329. package/deps/rocksdb/rocksdb/util/io_dispatcher_imp.h +36 -0
  330. package/deps/rocksdb/rocksdb/util/io_dispatcher_test.cc +1919 -0
  331. package/deps/rocksdb/rocksdb/util/math.h +3 -1
  332. package/deps/rocksdb/rocksdb/util/mutexlock.h +19 -19
  333. package/deps/rocksdb/rocksdb/util/ribbon_alg.h +25 -25
  334. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +5 -7
  335. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -5
  336. package/deps/rocksdb/rocksdb/util/slice.cc +0 -10
  337. package/deps/rocksdb/rocksdb/util/slice_test.cc +35 -1
  338. package/deps/rocksdb/rocksdb/util/slice_transform_test.cc +5 -7
  339. package/deps/rocksdb/rocksdb/util/status.cc +3 -1
  340. package/deps/rocksdb/rocksdb/util/stop_watch.h +2 -0
  341. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -1
  342. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +123 -78
  343. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc +12 -93
  344. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h +1 -4
  345. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.cc +0 -21
  346. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db.h +6 -48
  347. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc +94 -307
  348. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h +12 -58
  349. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +2 -8
  350. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h +2 -3
  351. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +205 -811
  352. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc +18 -9
  353. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.cc +2 -7
  354. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_file.h +1 -9
  355. package/deps/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc +17 -11
  356. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.cc +1 -1
  357. package/deps/rocksdb/rocksdb/utilities/cassandra/test_utils.h +1 -1
  358. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -1
  359. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +68 -61
  360. package/deps/rocksdb/rocksdb/utilities/debug.cc +2 -1
  361. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +105 -59
  362. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +274 -7
  363. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs_test.cc +94 -0
  364. package/deps/rocksdb/rocksdb/utilities/memory/memory_test.cc +13 -17
  365. package/deps/rocksdb/rocksdb/utilities/memory/memory_util.cc +16 -3
  366. package/deps/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc +25 -25
  367. package/deps/rocksdb/rocksdb/utilities/object_registry.cc +40 -40
  368. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +2 -5
  369. package/deps/rocksdb/rocksdb/utilities/options/options_util_test.cc +17 -19
  370. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc +2 -2
  371. package/deps/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h +2 -2
  372. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc +1 -1
  373. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc +2 -2
  374. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h +4 -13
  375. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +3 -3
  376. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +6 -0
  377. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_seqno_test.cc +431 -0
  378. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +1 -2
  379. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.h +91 -0
  380. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.cc +562 -0
  381. package/deps/rocksdb/rocksdb/utilities/trie_index/bitvector.h +615 -0
  382. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.cc +2575 -0
  383. package/deps/rocksdb/rocksdb/utilities/trie_index/louds_trie.h +685 -0
  384. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_db_test.cc +2843 -0
  385. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.cc +567 -0
  386. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_factory.h +275 -0
  387. package/deps/rocksdb/rocksdb/utilities/trie_index/trie_index_test.cc +5183 -0
  388. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +4 -3
  389. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h +1 -1
  390. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +2 -2
  391. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +3 -3
  392. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc +93 -88
  393. package/deps/rocksdb/rocksdb.gyp +7 -0
  394. package/index.js +70 -10
  395. package/iterator.js +25 -3
  396. package/max_rev_operator.h +9 -5
  397. package/package.json +1 -1
  398. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  399. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
  400. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +0 -43
  401. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h +0 -55
@@ -0,0 +1,2843 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // This source code is licensed under both the GPLv2 (found in the
3
+ // COPYING file in the root directory) and Apache 2.0 License
4
+ // (found in the LICENSE.Apache file in the root directory).
5
+
6
+ // DB-level tests for the trie-based User Defined Index (UDI). Validates that
7
+ // a DB opened with the trie UDI factory correctly handles all operation types
8
+ // (Put, Delete, Merge, SingleDelete, PutEntity, TimedPut, DeleteRange) through
9
+ // flush and compaction, and that the resulting SST files are readable with
10
+ // correct data.
11
+ //
12
+ // These tests complement the SST-level tests in trie_index_test.cc (which use
13
+ // SstFileWriter and are limited to Put/Delete/Merge) by exercising the full
14
+ // DB path including CompactionIterator, memtable flush, and the UDI builder
15
+ // wrapper's ValueType mapping and kTypeValuePreferredSeqno handling.
16
+
17
+ #include <memory>
18
+ #include <string>
19
+ #include <vector>
20
+
21
+ #include "port/port.h"
22
+ #include "rocksdb/db.h"
23
+ #include "rocksdb/options.h"
24
+ #include "rocksdb/slice.h"
25
+ #include "rocksdb/sst_file_writer.h"
26
+ #include "rocksdb/status.h"
27
+ #include "rocksdb/table.h"
28
+ #include "rocksdb/utilities/transaction.h"
29
+ #include "rocksdb/utilities/transaction_db.h"
30
+ #include "rocksdb/wide_columns.h"
31
+ #include "rocksdb/write_batch.h"
32
+ #include "test_util/testharness.h"
33
+ #include "test_util/testutil.h"
34
+ #include "util/compression.h"
35
+ #include "util/random.h"
36
+ #include "utilities/merge_operators.h"
37
+ #include "utilities/trie_index/trie_index_factory.h"
38
+
39
+ namespace ROCKSDB_NAMESPACE {
40
+ namespace trie_index {
41
+
42
// Serializes `k`, widened to uint64_t, as an 8-byte key body with the most
// significant byte first. The layout follows the pattern used by db_stress's
// test_batches_snapshots mode.
static std::string MakeKeyBody(int k) {
  const uint64_t bits = static_cast<uint64_t>(k);
  std::string body;
  body.reserve(8);
  // Emit bytes from the highest-order one (shift 56) down to the lowest.
  for (int shift = 56; shift >= 0; shift -= 8) {
    body.push_back(static_cast<char>((bits >> shift) & 0xff));
  }
  return body;
}
53
+
54
+ class TrieIndexDBTest : public testing::Test {
55
+ protected:
56
+ void SetUp() override {
57
+ trie_factory_ = std::make_shared<TrieIndexFactory>();
58
+ dbname_ = test::PerThreadDBPath("trie_index_db_test");
59
+ ASSERT_OK(DestroyDB(dbname_, Options()));
60
+ }
61
+
62
+ void TearDown() override {
63
+ if (db_) {
64
+ EXPECT_OK(db_->Close());
65
+ db_.reset();
66
+ }
67
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
68
+ }
69
+
70
+ // Opens a DB with the trie UDI factory configured. Caller should set
71
+ // options_ fields before calling this. An optional block_size overrides
72
+ // the default to force more data blocks in the SST.
73
+ Status OpenDB(int block_size = 0) {
74
+ options_.create_if_missing = true;
75
+ BlockBasedTableOptions table_options;
76
+ table_options.user_defined_index_factory = trie_factory_;
77
+ if (block_size > 0) {
78
+ table_options.block_size = block_size;
79
+ }
80
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
81
+ last_options_ = options_;
82
+ return DB::Open(options_, dbname_, &db_);
83
+ }
84
+
85
+ // Returns a ReadOptions that routes reads through the standard binary
86
+ // search index (the default when table_index_factory is null).
87
+ ReadOptions StandardIndexReadOptions() const { return ReadOptions(); }
88
+
89
+ // Returns a ReadOptions that routes reads through the trie UDI index.
90
+ ReadOptions TrieIndexReadOptions() const {
91
+ ReadOptions ro;
92
+ ro.table_index_factory = trie_factory_.get();
93
+ return ro;
94
+ }
95
+
96
+ // Collects all visible keys via forward scan using the given ReadOptions.
97
+ std::vector<std::string> ScanAllKeys(const ReadOptions& ro) {
98
+ std::vector<std::string> keys;
99
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
100
+ iter->SeekToFirst();
101
+ for (; iter->Valid(); iter->Next()) {
102
+ keys.push_back(iter->key().ToString());
103
+ }
104
+ EXPECT_OK(iter->status());
105
+ return keys;
106
+ }
107
+
108
+ // Collects all visible keys via forward scan using the standard index.
109
+ std::vector<std::string> ScanAllKeys() {
110
+ return ScanAllKeys(StandardIndexReadOptions());
111
+ }
112
+
113
+ // Collects all visible (key, value) pairs via forward scan.
114
+ std::vector<std::pair<std::string, std::string>> ScanAllKeyValues(
115
+ const ReadOptions& ro) {
116
+ std::vector<std::pair<std::string, std::string>> kvs;
117
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
118
+ iter->SeekToFirst();
119
+ for (; iter->Valid(); iter->Next()) {
120
+ kvs.emplace_back(iter->key().ToString(), iter->value().ToString());
121
+ }
122
+ EXPECT_OK(iter->status());
123
+ return kvs;
124
+ }
125
+
126
+ // Verifies that forward scan via SeekToFirst+Next produces the same key
127
+ // set through both the standard index and the trie index.
128
+ void VerifyForwardScanBothIndexes(
129
+ const std::vector<std::string>& expected_keys) {
130
+ {
131
+ SCOPED_TRACE("standard index");
132
+ ASSERT_EQ(ScanAllKeys(StandardIndexReadOptions()), expected_keys);
133
+ }
134
+ {
135
+ SCOPED_TRACE("trie index");
136
+ ASSERT_EQ(ScanAllKeys(TrieIndexReadOptions()), expected_keys);
137
+ }
138
+ }
139
+
140
+ // Verifies that forward scan via SeekToFirst+Next produces the same
141
+ // (key, value) pairs through both indexes.
142
+ void VerifyForwardScanBothIndexes(
143
+ const std::vector<std::pair<std::string, std::string>>& expected_kvs) {
144
+ {
145
+ SCOPED_TRACE("standard index");
146
+ ASSERT_EQ(ScanAllKeyValues(StandardIndexReadOptions()), expected_kvs);
147
+ }
148
+ {
149
+ SCOPED_TRACE("trie index");
150
+ ASSERT_EQ(ScanAllKeyValues(TrieIndexReadOptions()), expected_kvs);
151
+ }
152
+ }
153
+
154
+ // Verifies that a point Get returns the expected value through both indexes.
155
+ void VerifyGetBothIndexes(const std::string& key,
156
+ const std::string& expected_value) {
157
+ for (const auto& ro :
158
+ {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
159
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
160
+ std::string value;
161
+ ASSERT_OK(db_->Get(ro, key, &value));
162
+ ASSERT_EQ(value, expected_value);
163
+ }
164
+ }
165
+
166
+ // Verifies that a point Get returns NotFound through both indexes.
167
+ void VerifyGetNotFoundBothIndexes(const std::string& key) {
168
+ for (const auto& ro :
169
+ {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
170
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
171
+ std::string value;
172
+ ASSERT_TRUE(db_->Get(ro, key, &value).IsNotFound());
173
+ }
174
+ }
175
+
176
+ // Verifies Get with a snapshot through both indexes.
177
+ void VerifyGetBothIndexes(const Snapshot* snap, const std::string& key,
178
+ const std::string& expected_value) {
179
+ for (auto base_ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
180
+ SCOPED_TRACE(base_ro.table_index_factory ? "trie index"
181
+ : "standard index");
182
+ base_ro.snapshot = snap;
183
+ std::string value;
184
+ ASSERT_OK(db_->Get(base_ro, key, &value));
185
+ ASSERT_EQ(value, expected_value);
186
+ }
187
+ }
188
+
189
+ // Verifies Get returns NotFound with a snapshot through both indexes.
190
+ void VerifyGetNotFoundBothIndexes(const Snapshot* snap,
191
+ const std::string& key) {
192
+ for (auto base_ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
193
+ SCOPED_TRACE(base_ro.table_index_factory ? "trie index"
194
+ : "standard index");
195
+ base_ro.snapshot = snap;
196
+ std::string value;
197
+ ASSERT_TRUE(db_->Get(base_ro, key, &value).IsNotFound());
198
+ }
199
+ }
200
+
201
+ // Verifies that a forward scan with a snapshot produces the expected
202
+ // (key, value) pairs through both indexes.
203
+ void VerifyForwardScanBothIndexes(
204
+ const Snapshot* snap,
205
+ const std::vector<std::pair<std::string, std::string>>& expected_kvs) {
206
+ for (auto base_ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
207
+ SCOPED_TRACE(base_ro.table_index_factory ? "trie index"
208
+ : "standard index");
209
+ base_ro.snapshot = snap;
210
+ ASSERT_EQ(ScanAllKeyValues(base_ro), expected_kvs);
211
+ }
212
+ }
213
+
214
+ // Verifies that Seek to a specific key through both indexes returns the
215
+ // same result.
216
+ void VerifySeekBothIndexes(const std::string& seek_key,
217
+ const std::string& expected_key,
218
+ const std::string& expected_value) {
219
+ for (const auto& ro :
220
+ {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
221
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
222
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
223
+ iter->Seek(seek_key);
224
+ ASSERT_TRUE(iter->Valid());
225
+ ASSERT_EQ(iter->key().ToString(), expected_key);
226
+ ASSERT_EQ(iter->value().ToString(), expected_value);
227
+ ASSERT_OK(iter->status());
228
+ }
229
+ }
230
+
231
+ // Verifies that Seek with a snapshot through both indexes returns the
232
+ // same result.
233
+ void VerifySeekBothIndexes(const Snapshot* snap, const std::string& seek_key,
234
+ const std::string& expected_key,
235
+ const std::string& expected_value) {
236
+ for (auto base_ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
237
+ SCOPED_TRACE(base_ro.table_index_factory ? "trie index"
238
+ : "standard index");
239
+ base_ro.snapshot = snap;
240
+ std::unique_ptr<Iterator> iter(db_->NewIterator(base_ro));
241
+ iter->Seek(seek_key);
242
+ ASSERT_TRUE(iter->Valid());
243
+ ASSERT_EQ(iter->key().ToString(), expected_key);
244
+ ASSERT_EQ(iter->value().ToString(), expected_value);
245
+ ASSERT_OK(iter->status());
246
+ }
247
+ }
248
+
249
+ // Opens without UDI factory (standard index only). Used to test graceful
250
+ // degradation when reopening a DB that has UDI SSTs.
251
+ Status OpenDBWithoutUDI(int block_size = 0) {
252
+ options_.create_if_missing = true;
253
+ BlockBasedTableOptions table_options;
254
+ // Deliberately no user_defined_index_factory.
255
+ if (block_size > 0) {
256
+ table_options.block_size = block_size;
257
+ }
258
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
259
+ last_options_ = options_;
260
+ return DB::Open(options_, dbname_, &db_);
261
+ }
262
+
263
+ // Verify prefix-scan lockstep across `num_prefixes` iterators.
264
+ //
265
+ // Creates one iterator per prefix digit (0..num_prefixes-1), seeks each to
266
+ // its prefix, then walks all in lockstep asserting key bodies match. When
267
+ // `use_upper_bounds` is true, even-numbered iterators get an upper bound
268
+ // set to the next prefix. When `verify_values` is true, value bodies are
269
+ // also cross-checked.
270
+ //
271
+ // Returns the number of keys walked (per-prefix).
272
+ uint64_t VerifyPrefixScanLockstep(const ReadOptions& base_ro,
273
+ int num_prefixes, bool use_upper_bounds,
274
+ bool verify_values,
275
+ const std::string& trace_context = "") {
276
+ std::vector<std::unique_ptr<Iterator>> iters(num_prefixes);
277
+ std::vector<std::string> prefixes(num_prefixes);
278
+ std::vector<Slice> prefix_slices(num_prefixes);
279
+ std::vector<ReadOptions> ro_copies(num_prefixes);
280
+ std::vector<std::string> upper_bounds(num_prefixes);
281
+ std::vector<Slice> ub_slices(num_prefixes);
282
+
283
+ for (int d = 0; d < num_prefixes; ++d) {
284
+ prefixes[d] = std::to_string(d);
285
+ prefix_slices[d] = Slice(prefixes[d]);
286
+ ro_copies[d] = base_ro;
287
+ if (use_upper_bounds && d % 2 == 0) {
288
+ upper_bounds[d] = prefixes[d];
289
+ upper_bounds[d].back()++;
290
+ ub_slices[d] = upper_bounds[d];
291
+ ro_copies[d].iterate_upper_bound = &ub_slices[d];
292
+ }
293
+ iters[d].reset(db_->NewIterator(ro_copies[d]));
294
+ iters[d]->Seek(prefix_slices[d]);
295
+ }
296
+
297
+ uint64_t count = 0;
298
+ while (iters[0]->Valid() && iters[0]->key().starts_with(prefix_slices[0])) {
299
+ count++;
300
+ std::vector<std::string> keys(num_prefixes);
301
+ std::vector<std::string> values(num_prefixes);
302
+ for (int d = 0; d < num_prefixes; ++d) {
303
+ EXPECT_TRUE(iters[d]->Valid())
304
+ << trace_context << " iter " << d << " invalid at step " << count;
305
+ EXPECT_TRUE(iters[d]->key().starts_with(prefix_slices[d]))
306
+ << trace_context << " iter " << d << " out of prefix at step "
307
+ << count;
308
+ if (!iters[d]->Valid()) {
309
+ return count;
310
+ }
311
+ keys[d] = iters[d]->key().ToString();
312
+ values[d] = iters[d]->value().ToString();
313
+ }
314
+
315
+ std::string key0_body = keys[0].substr(1);
316
+ for (int d = 1; d < num_prefixes; ++d) {
317
+ EXPECT_EQ(key0_body, keys[d].substr(1))
318
+ << trace_context << " key body mismatch at step " << count
319
+ << " iter " << d;
320
+ }
321
+
322
+ if (verify_values) {
323
+ std::string val0 = values[0];
324
+ if (!val0.empty()) {
325
+ val0.pop_back();
326
+ }
327
+ for (int d = 1; d < num_prefixes; ++d) {
328
+ std::string vald = values[d];
329
+ if (!vald.empty()) {
330
+ vald.pop_back();
331
+ }
332
+ EXPECT_EQ(val0, vald) << trace_context << " value mismatch at step "
333
+ << count << " iter " << d;
334
+ }
335
+ }
336
+
337
+ for (int d = 0; d < num_prefixes; ++d) {
338
+ iters[d]->Next();
339
+ }
340
+ }
341
+
342
+ EXPECT_OK(iters[0]->status());
343
+ for (int d = 1; d < num_prefixes; ++d) {
344
+ EXPECT_TRUE(!iters[d]->Valid() ||
345
+ !iters[d]->key().starts_with(prefix_slices[d]))
346
+ << trace_context << " iter " << d
347
+ << " still has keys after iter 0 finished";
348
+ EXPECT_OK(iters[d]->status());
349
+ }
350
+
351
+ return count;
352
+ }
353
+
354
+ std::shared_ptr<TrieIndexFactory> trie_factory_;
355
+ std::string dbname_;
356
+ Options options_;
357
+ Options last_options_;
358
+ std::unique_ptr<DB> db_;
359
+ };
360
+
361
+ // ============================================================================
362
+ // Flush tests
363
+ // ============================================================================
364
+
365
+ TEST_F(TrieIndexDBTest, FlushWithAllOperationTypes) {
366
+ // Write every supported operation type via the DB API, flush, and verify
367
+ // reads return correct results through both the standard binary search index
368
+ // and the trie UDI. This exercises the full path from memtable through
369
+ // CompactionIterator, BlockBasedTableBuilder, and the UDI builder wrapper's
370
+ // MapToUDIValueType for each internal ValueType.
371
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
372
+ options_.disable_auto_compactions = true;
373
+ ASSERT_OK(OpenDB());
374
+
375
+ // kTypeValue
376
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01_put", "val_put"));
377
+ // kTypeMerge
378
+ ASSERT_OK(db_->Merge(WriteOptions(), "key_02_merge", "val_merge"));
379
+ // kTypeDeletion (bare tombstone — no prior value for this key)
380
+ ASSERT_OK(db_->Delete(WriteOptions(), "key_03_del"));
381
+ // kTypeSingleDeletion (preceded by a Put; both cancel out with no snapshot)
382
+ ASSERT_OK(db_->Put(WriteOptions(), "key_04_sdel", "val_sdel"));
383
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "key_04_sdel"));
384
+ // kTypeWideColumnEntity (with a default column so Get() returns a value)
385
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
386
+ "key_05_entity",
387
+ WideColumns{{"", "default_val"}, {"col1", "val1"}}));
388
+ // Another kTypeValue to anchor the end of the key range
389
+ ASSERT_OK(db_->Put(WriteOptions(), "key_06_put", "val_put2"));
390
+
391
+ ASSERT_OK(db_->Flush(FlushOptions()));
392
+
393
+ // Forward scan via both indexes. Expected visible keys after flush:
394
+ // key_01_put — Put (visible)
395
+ // key_02_merge — Merge single operand (visible)
396
+ // key_03_del — bare Delete tombstone (hidden by DBIter)
397
+ // key_04_sdel — Put + SingleDelete cancel out (hidden)
398
+ // key_05_entity — PutEntity (visible)
399
+ // key_06_put — Put (visible)
400
+ {
401
+ std::vector<std::string> expected = {"key_01_put", "key_02_merge",
402
+ "key_05_entity", "key_06_put"};
403
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
404
+ }
405
+
406
+ // Point lookups via both indexes.
407
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_01_put", "val_put"));
408
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_02_merge", "val_merge"));
409
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_03_del"));
410
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_04_sdel"));
411
+ // PutEntity: Get() returns the value of the default column ("").
412
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_05_entity", "default_val"));
413
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_06_put", "val_put2"));
414
+ }
415
+
416
+ TEST_F(TrieIndexDBTest, TimedPutFlush) {
417
+ // TimedPut produces kTypeValuePreferredSeqno entries during flush when
418
+ // preclude_last_level_data_seconds > 0. The UDI wrapper strips the packed
419
+ // preferred seqno suffix via ParsePackedValueForValue() before forwarding
420
+ // to the plugin builder. This test verifies that path end-to-end through
421
+ // both the standard binary search index and the trie UDI.
422
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
423
+ options_.compaction_style = kCompactionStyleUniversal;
424
+ // Required for kTypeValuePreferredSeqno to survive the flush path: the
425
+ // seqno_to_time_mapping must be available so a preferred seqno can be
426
+ // computed. With write_unix_time=0, GetProximalSeqnoBeforeTime(0) returns 0,
427
+ // which is < any real seqno, so the entry stays as kTypeValuePreferredSeqno.
428
+ options_.preclude_last_level_data_seconds = 10000;
429
+ options_.disable_auto_compactions = true;
430
+ ASSERT_OK(OpenDB());
431
+
432
+ // Regular Put alongside the TimedPut to verify they coexist.
433
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01_put", "val_put"));
434
+
435
+ // TimedPut via WriteBatch (there is no DB::TimedPut method).
436
+ {
437
+ WriteBatch wb;
438
+ ASSERT_OK(wb.TimedPut(db_->DefaultColumnFamily(), "key_02_timed",
439
+ "val_timed", /*write_unix_time=*/0));
440
+ ASSERT_OK(db_->Write(WriteOptions(), &wb));
441
+ }
442
+
443
+ // Merge to verify mixed types work with TimedPut in the same flush.
444
+ ASSERT_OK(db_->Merge(WriteOptions(), "key_03_merge", "val_merge"));
445
+
446
+ ASSERT_OK(db_->Flush(FlushOptions()));
447
+
448
+ // Point lookups via both indexes — the packed seqno must be transparent.
449
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_01_put", "val_put"));
450
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_02_timed", "val_timed"));
451
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_03_merge", "val_merge"));
452
+
453
+ // Forward scan via both indexes — all three keys visible in order.
454
+ {
455
+ std::vector<std::pair<std::string, std::string>> expected = {
456
+ {"key_01_put", "val_put"},
457
+ {"key_02_timed", "val_timed"},
458
+ {"key_03_merge", "val_merge"}};
459
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
460
+ }
461
+ }
462
+
463
+ // ============================================================================
464
+ // Compaction tests
465
+ // ============================================================================
466
+
467
+ TEST_F(TrieIndexDBTest, CompactionWithMixedOpsAndSnapshots) {
468
+ // Multiple flushes followed by compaction with a snapshot held. The snapshot
469
+ // forces compaction to preserve multiple versions of the same user key,
470
+ // exercising the UDI builder's handling of duplicate user keys with different
471
+ // sequence numbers and value types. Verified through both indexes.
472
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
473
+ options_.disable_auto_compactions = true;
474
+ ASSERT_OK(OpenDB());
475
+
476
+ // Flush 1: initial values.
477
+ ASSERT_OK(db_->Put(WriteOptions(), "key_aa", "v1"));
478
+ ASSERT_OK(db_->Put(WriteOptions(), "key_bb", "v1"));
479
+ ASSERT_OK(db_->Merge(WriteOptions(), "key_cc", "m1"));
480
+ ASSERT_OK(db_->Flush(FlushOptions()));
481
+
482
+ // Snapshot pins flush 1 versions so compaction preserves them.
483
+ const Snapshot* snap = db_->GetSnapshot();
484
+
485
+ // Flush 2: updates that create new versions.
486
+ ASSERT_OK(db_->Put(WriteOptions(), "key_aa", "v2"));
487
+ ASSERT_OK(db_->Delete(WriteOptions(), "key_bb"));
488
+ ASSERT_OK(db_->Merge(WriteOptions(), "key_cc", "m2"));
489
+ ASSERT_OK(db_->Flush(FlushOptions()));
490
+
491
+ // Compact all levels. Both versions of each key are preserved because the
492
+ // snapshot prevents garbage collection of the older versions.
493
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
494
+
495
+ // Current view (no snapshot): key_aa=v2, key_bb deleted, key_cc="m1,m2".
496
+ {
497
+ std::vector<std::pair<std::string, std::string>> expected = {
498
+ {"key_aa", "v2"}, {"key_cc", "m1,m2"}};
499
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
500
+ }
501
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_aa", "v2"));
502
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_bb"));
503
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_cc", "m1,m2"));
504
+
505
+ // Snapshot view: key_aa=v1, key_bb=v1, key_cc="m1".
506
+ {
507
+ std::vector<std::pair<std::string, std::string>> expected = {
508
+ {"key_aa", "v1"}, {"key_bb", "v1"}, {"key_cc", "m1"}};
509
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(snap, expected));
510
+ }
511
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snap, "key_aa", "v1"));
512
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snap, "key_bb", "v1"));
513
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snap, "key_cc", "m1"));
514
+
515
+ db_->ReleaseSnapshot(snap);
516
+ }
517
+
518
+ TEST_F(TrieIndexDBTest, CompactionWithAllOperationTypes) {
519
+ // Exercises all operation types (Put, Delete, Merge, SingleDelete, PutEntity)
520
+ // across two flushes with a snapshot, then compacts. Verified through both
521
+ // indexes. This ensures the UDI builder handles the full range of value types
522
+ // in compaction output, and that both the current and snapshot views are
523
+ // correct.
524
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
525
+ options_.disable_auto_compactions = true;
526
+ ASSERT_OK(OpenDB());
527
+
528
+ // Flush 1: initial values with diverse types.
529
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01_put", "v1"));
530
+ ASSERT_OK(db_->Merge(WriteOptions(), "key_02_merge", "m1"));
531
+ ASSERT_OK(db_->Put(WriteOptions(), "key_03_sd_target", "sd_val"));
532
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
533
+ "key_04_entity", WideColumns{{"", "e1"}}));
534
+ ASSERT_OK(db_->Put(WriteOptions(), "key_05_del_target", "del_val"));
535
+ ASSERT_OK(db_->Flush(FlushOptions()));
536
+
537
+ // Snapshot pins flush 1 versions.
538
+ const Snapshot* snap = db_->GetSnapshot();
539
+
540
+ // Flush 2: updates each key with a different operation type.
541
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01_put", "v2"));
542
+ ASSERT_OK(db_->Merge(WriteOptions(), "key_02_merge", "m2"));
543
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "key_03_sd_target"));
544
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
545
+ "key_04_entity", WideColumns{{"", "e2"}}));
546
+ ASSERT_OK(db_->Delete(WriteOptions(), "key_05_del_target"));
547
+ ASSERT_OK(db_->Flush(FlushOptions()));
548
+
549
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
550
+
551
+ // Current view via both indexes: key_01=v2, key_02="m1,m2", key_03 SD'd,
552
+ // key_04=e2, key_05 deleted.
553
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_01_put", "v2"));
554
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_02_merge", "m1,m2"));
555
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_03_sd_target"));
556
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_04_entity", "e2"));
557
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_05_del_target"));
558
+
559
+ // Current view scan via both indexes: only key_01, key_02, key_04 visible.
560
+ {
561
+ std::vector<std::string> expected = {"key_01_put", "key_02_merge",
562
+ "key_04_entity"};
563
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
564
+ }
565
+
566
+ // Snapshot view via both indexes: all original flush 1 values visible.
567
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snap, "key_01_put", "v1"));
568
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snap, "key_02_merge", "m1"));
569
+ ASSERT_NO_FATAL_FAILURE(
570
+ VerifyGetBothIndexes(snap, "key_03_sd_target", "sd_val"));
571
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snap, "key_04_entity", "e1"));
572
+ ASSERT_NO_FATAL_FAILURE(
573
+ VerifyGetBothIndexes(snap, "key_05_del_target", "del_val"));
574
+
575
+ {
576
+ std::vector<std::pair<std::string, std::string>> expected = {
577
+ {"key_01_put", "v1"},
578
+ {"key_02_merge", "m1"},
579
+ {"key_03_sd_target", "sd_val"},
580
+ {"key_04_entity", "e1"},
581
+ {"key_05_del_target", "del_val"}};
582
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(snap, expected));
583
+ }
584
+
585
+ db_->ReleaseSnapshot(snap);
586
+ }
587
+
588
+ TEST_F(TrieIndexDBTest, TimedPutCompaction) {
589
+ // Verifies that kTypeValuePreferredSeqno entries survive compaction and the
590
+ // UDI builder correctly strips the packed seqno during compaction output.
591
+ // Verified through both indexes.
592
+ options_.compaction_style = kCompactionStyleUniversal;
593
+ options_.preclude_last_level_data_seconds = 10000;
594
+ options_.disable_auto_compactions = true;
595
+ ASSERT_OK(OpenDB());
596
+
597
+ // Flush 1: TimedPut + regular Put.
598
+ {
599
+ WriteBatch wb;
600
+ ASSERT_OK(wb.TimedPut(db_->DefaultColumnFamily(), "key_01_timed",
601
+ "timed_v1", /*write_unix_time=*/0));
602
+ ASSERT_OK(db_->Write(WriteOptions(), &wb));
603
+ }
604
+ ASSERT_OK(db_->Put(WriteOptions(), "key_02_put", "put_v1"));
605
+ ASSERT_OK(db_->Flush(FlushOptions()));
606
+
607
+ // Snapshot pins flush 1 versions.
608
+ const Snapshot* snap = db_->GetSnapshot();
609
+
610
+ // Flush 2: overwrite both keys with regular Puts.
611
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01_timed", "put_v2"));
612
+ ASSERT_OK(db_->Put(WriteOptions(), "key_02_put", "put_v2"));
613
+ ASSERT_OK(db_->Flush(FlushOptions()));
614
+
615
+ // Compact: the snapshot forces both versions of key_01_timed to be kept.
616
+ // The older version is kTypeValuePreferredSeqno with packed seqno.
617
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
618
+
619
+ // Current view via both indexes: both keys have the newer value.
620
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_01_timed", "put_v2"));
621
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_02_put", "put_v2"));
622
+ {
623
+ std::vector<std::pair<std::string, std::string>> expected = {
624
+ {"key_01_timed", "put_v2"}, {"key_02_put", "put_v2"}};
625
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
626
+ }
627
+
628
+ // Snapshot view via both indexes: key_01 has the original TimedPut value
629
+ // (packed seqno must be transparent), key_02 has its original value.
630
+ ASSERT_NO_FATAL_FAILURE(
631
+ VerifyGetBothIndexes(snap, "key_01_timed", "timed_v1"));
632
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snap, "key_02_put", "put_v1"));
633
+ {
634
+ std::vector<std::pair<std::string, std::string>> expected = {
635
+ {"key_01_timed", "timed_v1"}, {"key_02_put", "put_v1"}};
636
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(snap, expected));
637
+ }
638
+
639
+ db_->ReleaseSnapshot(snap);
640
+ }
641
+
642
+ TEST_F(TrieIndexDBTest, CrossFlushSingleDelete) {
643
+ // Verifies that a SingleDelete in a later SST correctly cancels a Put from
644
+ // an earlier SST after compaction with the trie UDI active. Verified through
645
+ // both indexes.
646
+ options_.disable_auto_compactions = true;
647
+ ASSERT_OK(OpenDB());
648
+
649
+ // Flush 1: Puts.
650
+ ASSERT_OK(db_->Put(WriteOptions(), "key_aa", "val_aa"));
651
+ ASSERT_OK(db_->Put(WriteOptions(), "key_bb", "val_bb"));
652
+ ASSERT_OK(db_->Put(WriteOptions(), "key_cc", "val_cc"));
653
+ ASSERT_OK(db_->Flush(FlushOptions()));
654
+
655
+ // Flush 2: SingleDelete key_bb (targets the Put from flush 1).
656
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "key_bb"));
657
+ ASSERT_OK(db_->Flush(FlushOptions()));
658
+
659
+ // Before compaction: key_bb is already hidden by the merging iterator.
660
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_bb"));
661
+
662
+ // After compaction: SingleDelete + Put fully cancel out, key_bb is gone.
663
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
664
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_bb"));
665
+
666
+ // Remaining keys unaffected via both indexes.
667
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_aa", "val_aa"));
668
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_cc", "val_cc"));
669
+
670
+ {
671
+ std::vector<std::string> expected = {"key_aa", "key_cc"};
672
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
673
+ }
674
+ }
675
+
676
+ // ============================================================================
677
+ // Iteration tests
678
+ // ============================================================================
679
+
680
+ TEST_F(TrieIndexDBTest, ReverseIteration) {
681
+ // Verifies that reverse iteration (SeekToLast, Prev, SeekForPrev) works
682
+ // correctly with mixed operation types. Forward scan and point lookups are
683
+ // verified through both indexes. Reverse operations use the standard index
684
+ // (the trie UDI iterator does not yet support SeekToLast/Prev/SeekForPrev).
685
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
686
+ options_.disable_auto_compactions = true;
687
+ ASSERT_OK(OpenDB());
688
+
689
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01", "v1"));
690
+ ASSERT_OK(db_->Merge(WriteOptions(), "key_02", "m1"));
691
+ ASSERT_OK(db_->Delete(WriteOptions(), "key_03"));
692
+ ASSERT_OK(db_->Put(WriteOptions(), "key_04", "v4"));
693
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), "key_05",
694
+ WideColumns{{"", "e5"}}));
695
+ ASSERT_OK(db_->Put(WriteOptions(), "key_06", "v6"));
696
+
697
+ ASSERT_OK(db_->Flush(FlushOptions()));
698
+
699
+ // Visible keys: key_01, key_02, key_04, key_05, key_06 (key_03 deleted).
700
+
701
+ // Forward scan via both indexes.
702
+ {
703
+ std::vector<std::pair<std::string, std::string>> expected = {
704
+ {"key_01", "v1"},
705
+ {"key_02", "m1"},
706
+ {"key_04", "v4"},
707
+ {"key_05", "e5"},
708
+ {"key_06", "v6"}};
709
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
710
+ }
711
+
712
+ // Point lookups via both indexes.
713
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_01", "v1"));
714
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_02", "m1"));
715
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_03"));
716
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_04", "v4"));
717
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_05", "e5"));
718
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_06", "v6"));
719
+
720
+ // Seek via both indexes.
721
+ ASSERT_NO_FATAL_FAILURE(VerifySeekBothIndexes("key_04", "key_04", "v4"));
722
+ ASSERT_NO_FATAL_FAILURE(VerifySeekBothIndexes("key_05", "key_05", "e5"));
723
+
724
+ // Reverse operations below use the standard index only.
725
+
726
+ // SeekToLast + full reverse scan.
727
+ {
728
+ ReadOptions ro;
729
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
730
+ iter->SeekToLast();
731
+ std::vector<std::string> reverse_keys;
732
+ for (; iter->Valid(); iter->Prev()) {
733
+ reverse_keys.push_back(iter->key().ToString());
734
+ }
735
+ ASSERT_OK(iter->status());
736
+ std::vector<std::string> expected = {"key_06", "key_05", "key_04", "key_02",
737
+ "key_01"};
738
+ ASSERT_EQ(reverse_keys, expected);
739
+ }
740
+
741
+ // SeekForPrev to an exact visible key.
742
+ {
743
+ ReadOptions ro;
744
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
745
+ iter->SeekForPrev("key_04");
746
+ ASSERT_TRUE(iter->Valid());
747
+ ASSERT_EQ(iter->key().ToString(), "key_04");
748
+ ASSERT_EQ(iter->value().ToString(), "v4");
749
+ ASSERT_OK(iter->status());
750
+ }
751
+
752
+ // SeekForPrev to a deleted key — should land on the largest visible key
753
+ // that is <= "key_03", which is key_02.
754
+ {
755
+ ReadOptions ro;
756
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
757
+ iter->SeekForPrev("key_03");
758
+ ASSERT_TRUE(iter->Valid());
759
+ ASSERT_EQ(iter->key().ToString(), "key_02");
760
+ ASSERT_OK(iter->status());
761
+ }
762
+
763
+ // SeekForPrev to a key between existing keys.
764
+ {
765
+ ReadOptions ro;
766
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
767
+ iter->SeekForPrev("key_04_5");
768
+ ASSERT_TRUE(iter->Valid());
769
+ ASSERT_EQ(iter->key().ToString(), "key_04");
770
+ ASSERT_OK(iter->status());
771
+ }
772
+
773
+ // SeekForPrev before all keys — should be invalid.
774
+ {
775
+ ReadOptions ro;
776
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
777
+ iter->SeekForPrev("key_00");
778
+ ASSERT_FALSE(iter->Valid());
779
+ ASSERT_OK(iter->status());
780
+ }
781
+
782
+ // Prev from a Seek position in the middle of the range.
783
+ {
784
+ ReadOptions ro;
785
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
786
+ iter->Seek("key_05");
787
+ ASSERT_TRUE(iter->Valid());
788
+ ASSERT_EQ(iter->key().ToString(), "key_05");
789
+
790
+ iter->Prev();
791
+ ASSERT_TRUE(iter->Valid());
792
+ ASSERT_EQ(iter->key().ToString(), "key_04");
793
+
794
+ iter->Prev();
795
+ ASSERT_TRUE(iter->Valid());
796
+ ASSERT_EQ(iter->key().ToString(), "key_02");
797
+
798
+ iter->Prev();
799
+ ASSERT_TRUE(iter->Valid());
800
+ ASSERT_EQ(iter->key().ToString(), "key_01");
801
+
802
+ iter->Prev();
803
+ ASSERT_FALSE(iter->Valid());
804
+ ASSERT_OK(iter->status());
805
+ }
806
+ }
807
+
808
+ // ============================================================================
809
+ // DeleteRange test
810
+ // ============================================================================
811
+
812
+ TEST_F(TrieIndexDBTest, DeleteRangeWithTrieUDI) {
813
+ // Verifies that DeleteRange (kTypeRangeDeletion) works correctly alongside
814
+ // the trie UDI. Range deletions go to a separate range_del_block (not
815
+ // through OnKeyAdded), but we verify that reads correctly filter out
816
+ // range-deleted keys when the trie UDI is active. Forward scan and point
817
+ // lookups verified through both indexes; reverse scan uses standard index.
818
+ options_.disable_auto_compactions = true;
819
+ ASSERT_OK(OpenDB());
820
+
821
+ for (int i = 1; i <= 10; i++) {
822
+ char key_buf[16];
823
+ char val_buf[16];
824
+ snprintf(key_buf, sizeof(key_buf), "key_%02d", i);
825
+ snprintf(val_buf, sizeof(val_buf), "val_%02d", i);
826
+ ASSERT_OK(db_->Put(WriteOptions(), key_buf, val_buf));
827
+ }
828
+
829
+ // DeleteRange [key_04, key_08) — deletes key_04 through key_07.
830
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
831
+ "key_04", "key_08"));
832
+
833
+ ASSERT_OK(db_->Flush(FlushOptions()));
834
+
835
+ // Forward scan via both indexes: key_01..key_03 and key_08..key_10 visible.
836
+ {
837
+ std::vector<std::string> expected = {"key_01", "key_02", "key_03",
838
+ "key_08", "key_09", "key_10"};
839
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
840
+ }
841
+
842
+ // Point lookups via both indexes for deleted keys.
843
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_04"));
844
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_07"));
845
+
846
+ // Point lookups via both indexes for surviving keys at boundaries.
847
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_03", "val_03"));
848
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_08", "val_08"));
849
+
850
+ // Reverse scan (standard index only) should also respect the range deletion.
851
+ {
852
+ ReadOptions ro;
853
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
854
+ iter->SeekToLast();
855
+ std::vector<std::string> reverse_keys;
856
+ for (; iter->Valid(); iter->Prev()) {
857
+ reverse_keys.push_back(iter->key().ToString());
858
+ }
859
+ ASSERT_OK(iter->status());
860
+ std::vector<std::string> expected = {"key_10", "key_09", "key_08",
861
+ "key_03", "key_02", "key_01"};
862
+ ASSERT_EQ(reverse_keys, expected);
863
+ }
864
+ }
865
+
866
+ // ============================================================================
867
+ // DB reopen test
868
+ // ============================================================================
869
+
870
+ TEST_F(TrieIndexDBTest, ReopenWithMixedOperationTypes) {
871
+ // Writes all operation types, flushes, closes the DB, reopens, and verifies
872
+ // all data reads correctly from cold SST files through both indexes. This
873
+ // exercises the read path on a freshly opened DB where no memtable data
874
+ // exists.
875
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
876
+ options_.disable_auto_compactions = true;
877
+ ASSERT_OK(OpenDB());
878
+
879
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01", "val_put"));
880
+ ASSERT_OK(db_->Merge(WriteOptions(), "key_02", "val_merge"));
881
+ ASSERT_OK(db_->Delete(WriteOptions(), "key_03"));
882
+ ASSERT_OK(db_->Put(WriteOptions(), "key_04", "sd_target"));
883
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "key_04"));
884
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), "key_05",
885
+ WideColumns{{"", "entity_val"}}));
886
+ ASSERT_OK(db_->Put(WriteOptions(), "key_06", "val_put2"));
887
+
888
+ ASSERT_OK(db_->Flush(FlushOptions()));
889
+
890
+ // Close the DB. All data is now only in SST files.
891
+ ASSERT_OK(db_->Close());
892
+ db_.reset();
893
+
894
+ // Reopen with the same trie UDI configuration.
895
+ ASSERT_OK(OpenDB());
896
+
897
+ // Point lookups on cold data via both indexes.
898
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_01", "val_put"));
899
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_02", "val_merge"));
900
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_03"));
901
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_04"));
902
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_05", "entity_val"));
903
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_06", "val_put2"));
904
+
905
+ // Forward scan via both indexes.
906
+ {
907
+ std::vector<std::string> expected = {"key_01", "key_02", "key_05",
908
+ "key_06"};
909
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
910
+ }
911
+
912
+ // Reverse scan on cold data (standard index only).
913
+ {
914
+ ReadOptions ro;
915
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
916
+ iter->SeekToLast();
917
+ std::vector<std::string> reverse_keys;
918
+ for (; iter->Valid(); iter->Prev()) {
919
+ reverse_keys.push_back(iter->key().ToString());
920
+ }
921
+ ASSERT_OK(iter->status());
922
+ std::vector<std::string> expected = {"key_06", "key_05", "key_02",
923
+ "key_01"};
924
+ ASSERT_EQ(reverse_keys, expected);
925
+ }
926
+ }
927
+
928
+ // ============================================================================
929
+ // Ingest external file test
930
+ // ============================================================================
931
+
932
+ TEST_F(TrieIndexDBTest, IngestExternalFileWithTrieUDI) {
933
+ // Creates an SST with SstFileWriter using the trie UDI, then ingests it
934
+ // into a live DB that also has trie UDI configured. Verifies that both the
935
+ // existing DB data and the ingested data are correctly readable through both
936
+ // indexes.
937
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
938
+ options_.disable_auto_compactions = true;
939
+ ASSERT_OK(OpenDB());
940
+
941
+ // Write some data directly to the DB and flush.
942
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01", "db_val1"));
943
+ ASSERT_OK(db_->Put(WriteOptions(), "key_05", "db_val5"));
944
+ ASSERT_OK(db_->Flush(FlushOptions()));
945
+
946
+ // Create an SST with SstFileWriter using trie UDI, containing mixed ops.
947
+ std::string sst_path = dbname_ + "/ingest.sst";
948
+ {
949
+ Options sst_options;
950
+ sst_options.merge_operator = MergeOperators::CreateStringAppendOperator();
951
+ BlockBasedTableOptions table_options;
952
+ table_options.user_defined_index_factory = trie_factory_;
953
+ sst_options.table_factory.reset(NewBlockBasedTableFactory(table_options));
954
+
955
+ SstFileWriter writer(EnvOptions(), sst_options);
956
+ ASSERT_OK(writer.Open(sst_path));
957
+ ASSERT_OK(writer.Put("key_02", "ingest_val2"));
958
+ ASSERT_OK(writer.Merge("key_03", "ingest_merge3"));
959
+ ASSERT_OK(writer.Delete("key_04"));
960
+ ASSERT_OK(writer.Put("key_06", "ingest_val6"));
961
+ ASSERT_OK(writer.Finish());
962
+ }
963
+
964
+ // Ingest into the live DB.
965
+ IngestExternalFileOptions ingest_opts;
966
+ ASSERT_OK(db_->IngestExternalFile({sst_path}, ingest_opts));
967
+
968
+ // Point lookups via both indexes — combined DB + ingested data.
969
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_01", "db_val1"));
970
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_02", "ingest_val2"));
971
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_03", "ingest_merge3"));
972
+ // key_04: ingested Delete tombstone, no prior value — NotFound.
973
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_04"));
974
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_05", "db_val5"));
975
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_06", "ingest_val6"));
976
+
977
+ // Forward scan via both indexes.
978
+ {
979
+ std::vector<std::string> expected = {"key_01", "key_02", "key_03", "key_05",
980
+ "key_06"};
981
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
982
+ }
983
+ }
984
+
985
+ // ============================================================================
986
+ // WriteBatch test
987
+ // ============================================================================
988
+
989
+ TEST_F(TrieIndexDBTest, WriteBatchWithMixedOperations) {
990
+ // Verifies that a single WriteBatch containing multiple operation types
991
+ // (Put, Delete, Merge, SingleDelete, PutEntity) works correctly with the
992
+ // trie UDI. Verified through both indexes. Real-world workloads typically
993
+ // batch multiple operations.
994
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
995
+ options_.disable_auto_compactions = true;
996
+ ASSERT_OK(OpenDB());
997
+
998
+ // Pre-populate a key that the batch's Delete will target.
999
+ ASSERT_OK(db_->Put(WriteOptions(), "key_02_del", "pre_val"));
1000
+ ASSERT_OK(db_->Flush(FlushOptions()));
1001
+
1002
+ // Build a WriteBatch with all operation types.
1003
+ WriteBatch wb;
1004
+ ASSERT_OK(wb.Put(db_->DefaultColumnFamily(), "key_01_put", "batch_put"));
1005
+ ASSERT_OK(wb.Delete(db_->DefaultColumnFamily(), "key_02_del"));
1006
+ ASSERT_OK(wb.Merge(db_->DefaultColumnFamily(), "key_03_merge", "batch_m"));
1007
+ // Put + SingleDelete within the same batch — they cancel out.
1008
+ ASSERT_OK(wb.Put(db_->DefaultColumnFamily(), "key_04_sd", "sd_target"));
1009
+ ASSERT_OK(wb.SingleDelete(db_->DefaultColumnFamily(), "key_04_sd"));
1010
+ ASSERT_OK(wb.PutEntity(db_->DefaultColumnFamily(), "key_05_entity",
1011
+ WideColumns{{"", "batch_entity"}}));
1012
+
1013
+ ASSERT_OK(db_->Write(WriteOptions(), &wb));
1014
+ ASSERT_OK(db_->Flush(FlushOptions()));
1015
+
1016
+ // Point lookups via both indexes. Expected visible keys: key_01 (Put),
1017
+ // key_03 (Merge), key_05 (PutEntity). key_02 deleted, key_04
1018
+ // Put+SingleDelete cancel.
1019
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_01_put", "batch_put"));
1020
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_02_del"));
1021
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_03_merge", "batch_m"));
1022
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("key_04_sd"));
1023
+ ASSERT_NO_FATAL_FAILURE(
1024
+ VerifyGetBothIndexes("key_05_entity", "batch_entity"));
1025
+
1026
+ // Forward scan via both indexes.
1027
+ {
1028
+ std::vector<std::string> expected = {"key_01_put", "key_03_merge",
1029
+ "key_05_entity"};
1030
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
1031
+ }
1032
+ }
1033
+
1034
+ // ============================================================================
1035
+ // Large-scale test
1036
+ // ============================================================================
1037
+
1038
+ TEST_F(TrieIndexDBTest, LargeMixedOperationsAcrossBlocks) {
1039
+ // Large-scale test with many keys of different operation types and a small
1040
+ // block size. This stresses block boundary handling in the trie UDI across
1041
+ // Put, Delete, Merge, SingleDelete, and PutEntity entries. Verified through
1042
+ // both indexes.
1043
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
1044
+ options_.disable_auto_compactions = true;
1045
+ // Small block size forces many data blocks, exercising the trie index's
1046
+ // AddIndexEntry at frequent block boundaries.
1047
+ ASSERT_OK(OpenDB(/*block_size=*/128));
1048
+
1049
+ const int kNumKeys = 500;
1050
+ // Track keys expected to be visible after flush (non-deleted, non-SD'd).
1051
+ std::vector<std::string> expected_visible;
1052
+
1053
+ for (int i = 0; i < kNumKeys; i++) {
1054
+ char key_buf[32];
1055
+ snprintf(key_buf, sizeof(key_buf), "key_%06d", i);
1056
+ std::string key(key_buf);
1057
+
1058
+ // Distribute operation types:
1059
+ // i%10 in [0,3] -> Put (40%)
1060
+ // i%10 in [4,5] -> Delete (20%)
1061
+ // i%10 in [6,7] -> Merge (20%)
1062
+ // i%10 == 8 -> SingleDelete (10%, preceded by Put -- both cancel)
1063
+ // i%10 == 9 -> PutEntity (10%)
1064
+ int type = i % 10;
1065
+ if (type <= 3) {
1066
+ char val_buf[32];
1067
+ snprintf(val_buf, sizeof(val_buf), "val_%06d", i);
1068
+ ASSERT_OK(db_->Put(WriteOptions(), key, val_buf));
1069
+ expected_visible.push_back(key);
1070
+ } else if (type <= 5) {
1071
+ ASSERT_OK(db_->Delete(WriteOptions(), key));
1072
+ // Bare tombstone — not visible.
1073
+ } else if (type <= 7) {
1074
+ char val_buf[32];
1075
+ snprintf(val_buf, sizeof(val_buf), "merge_%06d", i);
1076
+ ASSERT_OK(db_->Merge(WriteOptions(), key, val_buf));
1077
+ expected_visible.push_back(key);
1078
+ } else if (type == 8) {
1079
+ ASSERT_OK(db_->Put(WriteOptions(), key, "to_be_deleted"));
1080
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), key));
1081
+ // Put + SingleDelete cancel — not visible.
1082
+ } else {
1083
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), key,
1084
+ WideColumns{{"", "entity_val"}}));
1085
+ expected_visible.push_back(key);
1086
+ }
1087
+ }
1088
+
1089
+ ASSERT_OK(db_->Flush(FlushOptions()));
1090
+
1091
+ // Forward scan via both indexes — verify exactly the expected visible keys.
1092
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected_visible));
1093
+
1094
+ // Spot-check: Seek to every 10th visible key via both indexes.
1095
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1096
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1097
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
1098
+ for (size_t i = 0; i < expected_visible.size(); i += 10) {
1099
+ iter->Seek(expected_visible[i]);
1100
+ ASSERT_TRUE(iter->Valid()) << "Seek failed for " << expected_visible[i];
1101
+ ASSERT_EQ(iter->key().ToString(), expected_visible[i]);
1102
+ }
1103
+ ASSERT_OK(iter->status());
1104
+ }
1105
+ }
1106
+
1107
+ // ============================================================================
1108
+ // Seqno side-table tests (same user key spanning data block boundaries)
1109
+ // ============================================================================
1110
+
1111
+ TEST_F(TrieIndexDBTest, SameUserKeyAcrossBlockBoundaries) {
1112
+ // Forces the same user key to appear in multiple data blocks by writing many
1113
+ // versions with snapshots held to prevent garbage collection, using a tiny
1114
+ // block_size. This exercises the trie's seqno side-table: the trie stores
1115
+ // only one separator per user key, and the side-table records the seqno +
1116
+ // overflow block count so that Seek() can find the correct data block for
1117
+ // each version.
1118
+ //
1119
+ // Without the seqno side-table fix (PR #14412), reads through the trie index
1120
+ // would return incorrect data when multiple versions of the same key span
1121
+ // different data blocks.
1122
+ options_.disable_auto_compactions = true;
1123
+ // Tiny block_size (64 bytes) forces each version of the key into its own
1124
+ // data block, creating same-user-key block boundaries that the trie must
1125
+ // handle via the seqno side-table.
1126
+ ASSERT_OK(OpenDB(/*block_size=*/64));
1127
+
1128
+ // Write multiple versions of the same key, holding snapshots so all versions
1129
+ // survive the flush to a single SST file.
1130
+ const std::string key = "same_key";
1131
+ constexpr int kNumVersions = 10;
1132
+ std::vector<const Snapshot*> snaps;
1133
+ for (int i = 0; i < kNumVersions; i++) {
1134
+ std::string val = "ver_" + std::to_string(i);
1135
+ ASSERT_OK(db_->Put(WriteOptions(), key, val));
1136
+ snaps.push_back(db_->GetSnapshot());
1137
+ }
1138
+
1139
+ ASSERT_OK(db_->Flush(FlushOptions()));
1140
+
1141
+ // Current view: latest version visible.
1142
+ ASSERT_NO_FATAL_FAILURE(
1143
+ VerifyGetBothIndexes(key, "ver_" + std::to_string(kNumVersions - 1)));
1144
+
1145
+ // Each snapshot should see the version written at or before its creation.
1146
+ for (int i = 0; i < kNumVersions; i++) {
1147
+ std::string expected_val = "ver_" + std::to_string(i);
1148
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snaps[i], key, expected_val));
1149
+ }
1150
+
1151
+ // Forward scan with each snapshot should return exactly one key with the
1152
+ // correct version.
1153
+ for (int i = 0; i < kNumVersions; i++) {
1154
+ std::string expected_val = "ver_" + std::to_string(i);
1155
+ std::vector<std::pair<std::string, std::string>> expected = {
1156
+ {key, expected_val}};
1157
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(snaps[i], expected));
1158
+ }
1159
+
1160
+ // Seek to the key through the trie index with each snapshot — the trie's
1161
+ // post-seek correction must advance through overflow blocks to find the
1162
+ // correct version for each seqno.
1163
+ for (int i = 0; i < kNumVersions; i++) {
1164
+ std::string expected_val = "ver_" + std::to_string(i);
1165
+ SCOPED_TRACE("snap=" + std::to_string(i));
1166
+ ASSERT_NO_FATAL_FAILURE(
1167
+ VerifySeekBothIndexes(snaps[i], key, key, expected_val));
1168
+ }
1169
+
1170
+ for (auto* snap : snaps) {
1171
+ db_->ReleaseSnapshot(snap);
1172
+ }
1173
+ }
1174
+
1175
+ TEST_F(TrieIndexDBTest, SameUserKeyPutThenDeleteAcrossBlocks) {
1176
+ // Same user key with a Put followed by a Delete, where both entries land in
1177
+ // different data blocks. A snapshot pins the Put version. After compaction,
1178
+ // the current view shows NotFound while the snapshot view shows the Put.
1179
+ // This tests the seqno side-table with mixed value types for the same key.
1180
+ options_.disable_auto_compactions = true;
1181
+ ASSERT_OK(OpenDB(/*block_size=*/64));
1182
+
1183
+ // Write a Put, take snapshot, then Delete.
1184
+ ASSERT_OK(db_->Put(WriteOptions(), "del_key", "put_value"));
1185
+ const Snapshot* snap = db_->GetSnapshot();
1186
+ ASSERT_OK(db_->Delete(WriteOptions(), "del_key"));
1187
+
1188
+ // Add surrounding keys to create more data blocks and exercise trie
1189
+ // separators around the duplicated key.
1190
+ ASSERT_OK(db_->Put(WriteOptions(), "aaa_before", "before_val"));
1191
+ ASSERT_OK(db_->Put(WriteOptions(), "zzz_after", "after_val"));
1192
+
1193
+ ASSERT_OK(db_->Flush(FlushOptions()));
1194
+
1195
+ // Current view: del_key is deleted.
1196
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("del_key"));
1197
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("aaa_before", "before_val"));
1198
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("zzz_after", "after_val"));
1199
+
1200
+ // Snapshot view: del_key is visible with the Put value.
1201
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snap, "del_key", "put_value"));
1202
+
1203
+ // Seek to del_key with snapshot through both indexes.
1204
+ ASSERT_NO_FATAL_FAILURE(
1205
+ VerifySeekBothIndexes(snap, "del_key", "del_key", "put_value"));
1206
+
1207
+ // Compact to merge the Put + Delete. Snapshot prevents GC of the Put.
1208
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
1209
+
1210
+ // After compaction, same behavior.
1211
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("del_key"));
1212
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snap, "del_key", "put_value"));
1213
+
1214
+ db_->ReleaseSnapshot(snap);
1215
+ }
1216
+
1217
+ TEST_F(TrieIndexDBTest, SameUserKeyManyVersionsSeekCorrectness) {
1218
+ // Writes many versions of three different keys (with snapshots), using a
1219
+ // tiny block_size to force same-user-key block boundaries. Verifies that
1220
+ // Seek + Get through the trie index returns the correct version for each
1221
+ // snapshot, testing the seqno side-table's overflow handling with multiple
1222
+ // keys interleaved in the SST.
1223
+ options_.disable_auto_compactions = true;
1224
+ ASSERT_OK(OpenDB(/*block_size=*/64));
1225
+
1226
+ const std::vector<std::string> keys = {"key_aaa", "key_mmm", "key_zzz"};
1227
+ constexpr int kVersionsPerKey = 8;
1228
+ // snaps[v] is taken after writing version v of all keys.
1229
+ std::vector<const Snapshot*> snaps;
1230
+
1231
+ for (int v = 0; v < kVersionsPerKey; v++) {
1232
+ for (const auto& k : keys) {
1233
+ std::string val = k + "_v" + std::to_string(v);
1234
+ ASSERT_OK(db_->Put(WriteOptions(), k, val));
1235
+ }
1236
+ snaps.push_back(db_->GetSnapshot());
1237
+ }
1238
+
1239
+ ASSERT_OK(db_->Flush(FlushOptions()));
1240
+
1241
+ // Verify each snapshot sees the correct version of each key via Get and Seek.
1242
+ for (int v = 0; v < kVersionsPerKey; v++) {
1243
+ for (const auto& k : keys) {
1244
+ std::string expected_val = k + "_v" + std::to_string(v);
1245
+ SCOPED_TRACE("key=" + k + " v=" + std::to_string(v));
1246
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snaps[v], k, expected_val));
1247
+ ASSERT_NO_FATAL_FAILURE(
1248
+ VerifySeekBothIndexes(snaps[v], k, k, expected_val));
1249
+ }
1250
+ }
1251
+
1252
+ // Compact and re-verify. Compaction must preserve all versions because
1253
+ // snapshots are held.
1254
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
1255
+
1256
+ for (int v = 0; v < kVersionsPerKey; v++) {
1257
+ for (const auto& k : keys) {
1258
+ std::string expected_val = k + "_v" + std::to_string(v);
1259
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(snaps[v], k, expected_val));
1260
+ }
1261
+ }
1262
+
1263
+ for (auto* snap : snaps) {
1264
+ db_->ReleaseSnapshot(snap);
1265
+ }
1266
+ }
1267
+
1268
+ // ============================================================================
1269
+ // MultiGet test
1270
+ // ============================================================================
1271
+
1272
+ TEST_F(TrieIndexDBTest, MultiGetWithTrieUDI) {
1273
+ // Verifies that the batched MultiGet API works correctly with the trie UDI.
1274
+ // MultiGet is a separate code path from single Get and uses batched block
1275
+ // lookups, so it needs dedicated testing.
1276
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
1277
+ options_.disable_auto_compactions = true;
1278
+ ASSERT_OK(OpenDB(/*block_size=*/128));
1279
+
1280
+ // Write a mix of operation types.
1281
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01", "val_01"));
1282
+ ASSERT_OK(db_->Merge(WriteOptions(), "key_02", "merge_02"));
1283
+ ASSERT_OK(db_->Delete(WriteOptions(), "key_03"));
1284
+ ASSERT_OK(db_->Put(WriteOptions(), "key_04", "val_04"));
1285
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), "key_05",
1286
+ WideColumns{{"", "entity_05"}}));
1287
+ ASSERT_OK(db_->Put(WriteOptions(), "key_06", "val_06"));
1288
+
1289
+ ASSERT_OK(db_->Flush(FlushOptions()));
1290
+
1291
+ // MultiGet through both indexes.
1292
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1293
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1294
+
1295
+ std::vector<Slice> mg_keys = {"key_01", "key_02", "key_03",
1296
+ "key_04", "key_05", "key_06",
1297
+ "key_nonexistent"};
1298
+ std::vector<std::string> mg_values(mg_keys.size());
1299
+ std::vector<Status> mg_statuses = db_->MultiGet(ro, mg_keys, &mg_values);
1300
+
1301
+ ASSERT_EQ(mg_statuses.size(), mg_keys.size());
1302
+ ASSERT_OK(mg_statuses[0]);
1303
+ ASSERT_EQ(mg_values[0], "val_01");
1304
+ ASSERT_OK(mg_statuses[1]);
1305
+ ASSERT_EQ(mg_values[1], "merge_02");
1306
+ ASSERT_TRUE(mg_statuses[2].IsNotFound());
1307
+ ASSERT_OK(mg_statuses[3]);
1308
+ ASSERT_EQ(mg_values[3], "val_04");
1309
+ ASSERT_OK(mg_statuses[4]);
1310
+ ASSERT_EQ(mg_values[4], "entity_05");
1311
+ ASSERT_OK(mg_statuses[5]);
1312
+ ASSERT_EQ(mg_values[5], "val_06");
1313
+ ASSERT_TRUE(mg_statuses[6].IsNotFound());
1314
+ }
1315
+ }
1316
+
1317
+ // ============================================================================
1318
+ // WAL replay / crash recovery test
1319
+ // ============================================================================
1320
+
1321
+ TEST_F(TrieIndexDBTest, WALReplayRecovery) {
1322
+ // Writes data without flushing, then closes and reopens the DB. The data
1323
+ // must be recovered from the WAL and then flushed. This tests that the trie
1324
+ // UDI builder handles entries replayed from the WAL correctly.
1325
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
1326
+ options_.disable_auto_compactions = true;
1327
+ // WAL is enabled by default (WriteOptions::disableWAL = false).
1328
+ ASSERT_OK(OpenDB());
1329
+
1330
+ // Write data — do NOT flush. Data lives only in the WAL + memtable.
1331
+ ASSERT_OK(db_->Put(WriteOptions(), "wal_key_01", "wal_val_01"));
1332
+ ASSERT_OK(db_->Merge(WriteOptions(), "wal_key_02", "wal_merge"));
1333
+ ASSERT_OK(db_->Put(WriteOptions(), "wal_key_03", "wal_val_03"));
1334
+ ASSERT_OK(db_->Delete(WriteOptions(), "wal_key_03"));
1335
+ ASSERT_OK(db_->Put(WriteOptions(), "wal_key_04", "wal_val_04"));
1336
+
1337
+ // Close and reopen — triggers WAL replay.
1338
+ ASSERT_OK(db_->Close());
1339
+ db_.reset();
1340
+ ASSERT_OK(OpenDB());
1341
+
1342
+ // After WAL replay, data should be in a memtable. Flush to create SST with
1343
+ // the trie UDI.
1344
+ ASSERT_OK(db_->Flush(FlushOptions()));
1345
+
1346
+ // Verify data through both indexes.
1347
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("wal_key_01", "wal_val_01"));
1348
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("wal_key_02", "wal_merge"));
1349
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("wal_key_03"));
1350
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("wal_key_04", "wal_val_04"));
1351
+
1352
+ // Forward scan.
1353
+ {
1354
+ std::vector<std::pair<std::string, std::string>> expected = {
1355
+ {"wal_key_01", "wal_val_01"},
1356
+ {"wal_key_02", "wal_merge"},
1357
+ {"wal_key_04", "wal_val_04"}};
1358
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(expected));
1359
+ }
1360
+ }
1361
+
1362
+ // ============================================================================
1363
+ // Multiple column families test
1364
+ // ============================================================================
1365
+
1366
+ TEST_F(TrieIndexDBTest, MultipleColumnFamilies) {
1367
+ // Opens a DB with multiple column families, each using the trie UDI. Writes
1368
+ // different data to each CF, flushes, and verifies reads through both indexes
1369
+ // for each CF. This tests that the UDI builder/reader are correctly isolated
1370
+ // per-CF.
1371
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
1372
+ options_.disable_auto_compactions = true;
1373
+ options_.create_if_missing = true;
1374
+
1375
+ BlockBasedTableOptions table_options;
1376
+ table_options.user_defined_index_factory = trie_factory_;
1377
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
1378
+ last_options_ = options_;
1379
+
1380
+ // Open with default CF first.
1381
+ ASSERT_OK(DB::Open(options_, dbname_, &db_));
1382
+
1383
+ // Create two additional CFs with the same trie UDI options.
1384
+ ColumnFamilyHandle* cf1 = nullptr;
1385
+ ColumnFamilyHandle* cf2 = nullptr;
1386
+ ASSERT_OK(db_->CreateColumnFamily(options_, "cf_one", &cf1));
1387
+ ASSERT_OK(db_->CreateColumnFamily(options_, "cf_two", &cf2));
1388
+
1389
+ // Write different data to each CF.
1390
+ ASSERT_OK(db_->Put(WriteOptions(), "default_key", "default_val"));
1391
+ ASSERT_OK(db_->Put(WriteOptions(), cf1, "cf1_key_a", "cf1_val_a"));
1392
+ ASSERT_OK(db_->Merge(WriteOptions(), cf1, "cf1_key_b", "cf1_merge"));
1393
+ ASSERT_OK(db_->Put(WriteOptions(), cf2, "cf2_key_x", "cf2_val_x"));
1394
+ ASSERT_OK(db_->Delete(WriteOptions(), cf2, "cf2_key_y"));
1395
+ ASSERT_OK(db_->Put(WriteOptions(), cf2, "cf2_key_z", "cf2_val_z"));
1396
+
1397
+ // Flush all CFs.
1398
+ ASSERT_OK(db_->Flush(FlushOptions()));
1399
+ ASSERT_OK(db_->Flush(FlushOptions(), cf1));
1400
+ ASSERT_OK(db_->Flush(FlushOptions(), cf2));
1401
+
1402
+ // Verify default CF.
1403
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1404
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1405
+ std::string value;
1406
+ ASSERT_OK(db_->Get(ro, "default_key", &value));
1407
+ ASSERT_EQ(value, "default_val");
1408
+ }
1409
+
1410
+ // Verify cf_one through both indexes.
1411
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1412
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1413
+ std::string value;
1414
+ ASSERT_OK(db_->Get(ro, cf1, "cf1_key_a", &value));
1415
+ ASSERT_EQ(value, "cf1_val_a");
1416
+ ASSERT_OK(db_->Get(ro, cf1, "cf1_key_b", &value));
1417
+ ASSERT_EQ(value, "cf1_merge");
1418
+ }
1419
+
1420
+ // Verify cf_two through both indexes.
1421
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1422
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1423
+ std::string value;
1424
+ ASSERT_OK(db_->Get(ro, cf2, "cf2_key_x", &value));
1425
+ ASSERT_EQ(value, "cf2_val_x");
1426
+ ASSERT_TRUE(db_->Get(ro, cf2, "cf2_key_y", &value).IsNotFound());
1427
+ ASSERT_OK(db_->Get(ro, cf2, "cf2_key_z", &value));
1428
+ ASSERT_EQ(value, "cf2_val_z");
1429
+ }
1430
+
1431
+ // Forward scan on each CF via both indexes.
1432
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1433
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1434
+
1435
+ // cf_one scan.
1436
+ std::unique_ptr<Iterator> it1(db_->NewIterator(ro, cf1));
1437
+ it1->SeekToFirst();
1438
+ ASSERT_TRUE(it1->Valid());
1439
+ ASSERT_EQ(it1->key().ToString(), "cf1_key_a");
1440
+ it1->Next();
1441
+ ASSERT_TRUE(it1->Valid());
1442
+ ASSERT_EQ(it1->key().ToString(), "cf1_key_b");
1443
+ it1->Next();
1444
+ ASSERT_FALSE(it1->Valid());
1445
+ ASSERT_OK(it1->status());
1446
+
1447
+ // cf_two scan.
1448
+ std::unique_ptr<Iterator> it2(db_->NewIterator(ro, cf2));
1449
+ it2->SeekToFirst();
1450
+ ASSERT_TRUE(it2->Valid());
1451
+ ASSERT_EQ(it2->key().ToString(), "cf2_key_x");
1452
+ it2->Next();
1453
+ ASSERT_TRUE(it2->Valid());
1454
+ ASSERT_EQ(it2->key().ToString(), "cf2_key_z");
1455
+ it2->Next();
1456
+ ASSERT_FALSE(it2->Valid());
1457
+ ASSERT_OK(it2->status());
1458
+ }
1459
+
1460
+ // Clean up CF handles before closing.
1461
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cf1));
1462
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cf2));
1463
+
1464
+ // Close the DB. Need to clear db_ first since TearDown will also close.
1465
+ ASSERT_OK(db_->Close());
1466
+ db_.reset();
1467
+
1468
+ // Reopen with all CFs to verify persistence.
1469
+ {
1470
+ std::vector<ColumnFamilyDescriptor> cf_descs = {
1471
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, options_),
1472
+ ColumnFamilyDescriptor("cf_one", options_),
1473
+ ColumnFamilyDescriptor("cf_two", options_)};
1474
+ std::vector<ColumnFamilyHandle*> cf_handles;
1475
+ std::unique_ptr<DB> reopen_db;
1476
+ ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cf_handles, &reopen_db));
1477
+ db_ = std::move(reopen_db);
1478
+
1479
+ // Verify data survives reopen.
1480
+ auto ro = TrieIndexReadOptions();
1481
+ std::string value;
1482
+ ASSERT_OK(db_->Get(ro, cf_handles[0], "default_key", &value));
1483
+ ASSERT_EQ(value, "default_val");
1484
+ ASSERT_OK(db_->Get(ro, cf_handles[1], "cf1_key_a", &value));
1485
+ ASSERT_EQ(value, "cf1_val_a");
1486
+ ASSERT_OK(db_->Get(ro, cf_handles[2], "cf2_key_z", &value));
1487
+ ASSERT_EQ(value, "cf2_val_z");
1488
+
1489
+ for (auto* h : cf_handles) {
1490
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(h));
1491
+ }
1492
+ }
1493
+ }
1494
+
1495
+ // ---------------------------------------------------------------------------
1496
+ // BatchedPrefixScan: reproduces the test_batches_snapshots pattern.
1497
+ //
1498
+ // Writes batches of 10 keys {digit+key_body : value_body+digit} for digit in
1499
+ // 0..9, exactly as the crash-test stress tool does. Then scans each prefix
1500
+ // concurrently (same snapshot) and checks that:
1501
+ // (a) all 10 iterators yield the same key bodies in lockstep, and
1502
+ // (b) the values stripped of the trailing digit are identical across
1503
+ // prefixes.
1504
+ //
1505
+ // We run with both the standard index and the trie index and compare.
1506
+ // ---------------------------------------------------------------------------
1507
+ TEST_F(TrieIndexDBTest, BatchedPrefixScan) {
1508
+ // Small block size to force many data blocks (and thus many trie entries).
1509
+ ASSERT_OK(OpenDB(/*block_size=*/256));
1510
+
1511
+ const int kNumBatches = 200;
1512
+ const int kNumPrefixes = 10;
1513
+ Random rnd(42);
1514
+
1515
+ // Phase 1: Write batches.
1516
+ for (int b = 0; b < kNumBatches; ++b) {
1517
+ WriteBatch batch;
1518
+ std::string key_body = MakeKeyBody(b);
1519
+ std::string value_body = rnd.RandomString(20);
1520
+
1521
+ for (int d = kNumPrefixes - 1; d >= 0; --d) {
1522
+ std::string k = std::to_string(d) + key_body;
1523
+ std::string v = value_body + std::to_string(d);
1524
+ ASSERT_OK(batch.Put(k, v));
1525
+ }
1526
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
1527
+ }
1528
+
1529
+ // Flush so data is in SSTs with trie index.
1530
+ ASSERT_OK(db_->Flush(FlushOptions()));
1531
+
1532
+ // Phase 2: Prefix scan with both indexes.
1533
+ for (int idx_type = 0; idx_type < 2; ++idx_type) {
1534
+ ReadOptions base_ro =
1535
+ idx_type == 0 ? StandardIndexReadOptions() : TrieIndexReadOptions();
1536
+ SCOPED_TRACE(idx_type == 0 ? "standard index" : "trie index");
1537
+
1538
+ const Snapshot* snap = db_->GetSnapshot();
1539
+ base_ro.snapshot = snap;
1540
+
1541
+ uint64_t count = VerifyPrefixScanLockstep(base_ro, kNumPrefixes,
1542
+ /*use_upper_bounds=*/true,
1543
+ /*verify_values=*/true);
1544
+ ASSERT_EQ(count, static_cast<uint64_t>(kNumBatches))
1545
+ << "expected " << kNumBatches << " entries per prefix";
1546
+
1547
+ db_->ReleaseSnapshot(snap);
1548
+ }
1549
+ }
1550
+
1551
+ // Same as above but with multiple flushes, compaction, and a DB reopen
1552
+ // in between to simulate the crash-recovery path.
1553
+ TEST_F(TrieIndexDBTest, BatchedPrefixScanAfterReopen) {
1554
+ ASSERT_OK(OpenDB(/*block_size=*/256));
1555
+
1556
+ const int kNumBatches = 100;
1557
+ const int kNumPrefixes = 10;
1558
+ Random rnd(123);
1559
+
1560
+ for (int b = 0; b < kNumBatches; ++b) {
1561
+ WriteBatch batch;
1562
+ std::string key_body = MakeKeyBody(b);
1563
+ std::string value_body = rnd.RandomString(20);
1564
+
1565
+ for (int d = kNumPrefixes - 1; d >= 0; --d) {
1566
+ std::string k = std::to_string(d) + key_body;
1567
+ std::string v = value_body + std::to_string(d);
1568
+ ASSERT_OK(batch.Put(k, v));
1569
+ }
1570
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
1571
+
1572
+ // Flush every 20 batches to create multiple SSTs.
1573
+ if ((b + 1) % 20 == 0) {
1574
+ ASSERT_OK(db_->Flush(FlushOptions()));
1575
+ }
1576
+ }
1577
+ ASSERT_OK(db_->Flush(FlushOptions()));
1578
+
1579
+ // Compact to merge SSTs.
1580
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
1581
+
1582
+ // Close and reopen (simulating recovery).
1583
+ ASSERT_OK(db_->Close());
1584
+ db_.reset();
1585
+ ASSERT_OK(OpenDB(/*block_size=*/256));
1586
+
1587
+ // Prefix scan with trie index after reopen.
1588
+ ReadOptions base_ro = TrieIndexReadOptions();
1589
+ const Snapshot* snap = db_->GetSnapshot();
1590
+ base_ro.snapshot = snap;
1591
+
1592
+ uint64_t count =
1593
+ VerifyPrefixScanLockstep(base_ro, kNumPrefixes, /*use_upper_bounds=*/true,
1594
+ /*verify_values=*/false);
1595
+ ASSERT_EQ(count, static_cast<uint64_t>(kNumBatches));
1596
+ db_->ReleaseSnapshot(snap);
1597
+ }
1598
+
1599
+ // Test with overwrites: multiple writes to the same key body, ensuring
1600
+ // the latest value is consistent across all prefixes.
1601
+ TEST_F(TrieIndexDBTest, BatchedPrefixScanWithOverwrites) {
1602
+ ASSERT_OK(OpenDB(/*block_size=*/256));
1603
+
1604
+ const int kNumKeys = 50;
1605
+ const int kNumOverwrites = 5;
1606
+ const int kNumPrefixes = 10;
1607
+ Random rnd(999);
1608
+
1609
+ // Write each key body multiple times.
1610
+ for (int round = 0; round < kNumOverwrites; ++round) {
1611
+ for (int k = 0; k < kNumKeys; ++k) {
1612
+ WriteBatch batch;
1613
+ std::string key_body = MakeKeyBody(k);
1614
+ std::string value_body = rnd.RandomString(20);
1615
+
1616
+ for (int d = kNumPrefixes - 1; d >= 0; --d) {
1617
+ std::string key = std::to_string(d) + key_body;
1618
+ std::string v = value_body + std::to_string(d);
1619
+ ASSERT_OK(batch.Put(key, v));
1620
+ }
1621
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
1622
+ }
1623
+
1624
+ // Flush after each round.
1625
+ ASSERT_OK(db_->Flush(FlushOptions()));
1626
+ }
1627
+
1628
+ // Now verify with both indexes.
1629
+ for (int idx_type = 0; idx_type < 2; ++idx_type) {
1630
+ ReadOptions base_ro =
1631
+ idx_type == 0 ? StandardIndexReadOptions() : TrieIndexReadOptions();
1632
+ SCOPED_TRACE(idx_type == 0 ? "standard index" : "trie index");
1633
+
1634
+ const Snapshot* snap = db_->GetSnapshot();
1635
+ base_ro.snapshot = snap;
1636
+
1637
+ uint64_t count = VerifyPrefixScanLockstep(base_ro, kNumPrefixes,
1638
+ /*use_upper_bounds=*/false,
1639
+ /*verify_values=*/true);
1640
+ ASSERT_EQ(count, static_cast<uint64_t>(kNumKeys));
1641
+ db_->ReleaseSnapshot(snap);
1642
+ }
1643
+ }
1644
+
1645
+ // Stress-like test: write + delete + rewrite many keys, flush between rounds,
1646
+ // then verify prefix scan consistency. Simulates the crash test pattern that
1647
+ // triggered failures.
1648
+ TEST_F(TrieIndexDBTest, BatchedPrefixScanStressLike) {
1649
+ ASSERT_OK(OpenDB(/*block_size=*/4096));
1650
+
1651
+ const int kMaxKey = 10000;
1652
+ const int kNumPrefixes = 10;
1653
+ const int kNumRounds = 20;
1654
+ Random rnd(7777);
1655
+
1656
+ for (int round = 0; round < kNumRounds; ++round) {
1657
+ // Write a batch of random keys
1658
+ int num_writes = 100 + rnd.Uniform(200);
1659
+ for (int w = 0; w < num_writes; ++w) {
1660
+ int k = rnd.Uniform(kMaxKey);
1661
+ WriteBatch batch;
1662
+ std::string key_body = MakeKeyBody(k);
1663
+ std::string value_body = rnd.RandomString(rnd.Uniform(60) + 4);
1664
+ for (int d = kNumPrefixes - 1; d >= 0; --d) {
1665
+ std::string key = std::to_string(d) + key_body;
1666
+ std::string v = value_body + std::to_string(d);
1667
+ ASSERT_OK(batch.Put(key, v));
1668
+ }
1669
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
1670
+ }
1671
+
1672
+ // Delete some random keys
1673
+ int num_deletes = 50 + rnd.Uniform(100);
1674
+ for (int w = 0; w < num_deletes; ++w) {
1675
+ int k = rnd.Uniform(kMaxKey);
1676
+ WriteBatch batch;
1677
+ std::string key_body = MakeKeyBody(k);
1678
+ for (int d = kNumPrefixes - 1; d >= 0; --d) {
1679
+ std::string key = std::to_string(d) + key_body;
1680
+ ASSERT_OK(batch.Delete(key));
1681
+ }
1682
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
1683
+ }
1684
+
1685
+ // Flush every few rounds
1686
+ if (round % 3 == 0) {
1687
+ ASSERT_OK(db_->Flush(FlushOptions()));
1688
+ }
1689
+
1690
+ // Verify prefix scan consistency with trie index.
1691
+ {
1692
+ ReadOptions base_ro = TrieIndexReadOptions();
1693
+ const Snapshot* snap = db_->GetSnapshot();
1694
+ base_ro.snapshot = snap;
1695
+
1696
+ VerifyPrefixScanLockstep(base_ro, kNumPrefixes,
1697
+ /*use_upper_bounds=*/false,
1698
+ /*verify_values=*/true,
1699
+ "round=" + std::to_string(round));
1700
+
1701
+ db_->ReleaseSnapshot(snap);
1702
+ }
1703
+ }
1704
+ }
1705
+
1706
+ // ---------------------------------------------------------------------------
1707
+ // Regression test for the FindShortSuccessor last-block bug.
1708
+ //
1709
+ // Before the fix, TrieIndexBuilder::AddIndexEntry called
1710
+ // FindShortSuccessor() on the last block's separator key, producing a
1711
+ // shorter key that covered a wider range than the actual data. For example,
1712
+ // if the last key's user key was "9\xff\xff", FindShortSuccessor would
1713
+ // produce ":" (0x3A), making the trie claim it covers keys up to ":". A
1714
+ // seek for "9\xff\xff\x01" (between the real last key and ":") would find a
1715
+ // block via the trie but not via the standard index, causing prefix scan
1716
+ // iterators to desynchronize.
1717
+ //
1718
+ // The standard ShortenedIndexBuilder (with default kShortenSeparators mode)
1719
+ // does NOT call FindShortSuccessor on the last block — it uses the last key
1720
+ // as-is. The fix makes the trie builder match this behavior.
1721
+ // ---------------------------------------------------------------------------
1722
+ TEST_F(TrieIndexDBTest, LastBlockSeparatorNotShortened) {
1723
+ // Use a small block size so each key lands in its own block.
1724
+ ASSERT_OK(OpenDB(/*block_size=*/32));
1725
+
1726
+ // Write keys where the last key has trailing 0xFF bytes, which
1727
+ // FindShortSuccessor would shorten by incrementing the byte before the
1728
+ // 0xFF suffix ("9\xff\xff" -> ":").
1729
+ ASSERT_OK(db_->Put(WriteOptions(), "1aaa", "v1"));
1730
+ ASSERT_OK(db_->Put(WriteOptions(), "5bbb", "v2"));
1731
+ ASSERT_OK(db_->Put(WriteOptions(), std::string("9\xff\xff", 3), "v3"));
1732
+ ASSERT_OK(db_->Flush(FlushOptions()));
1733
+
1734
+ // The key "9\xff\xff\x01" is lexicographically after "9\xff\xff" but
1735
+ // before ":" (0x3A). With the old bug, the trie would return a valid
1736
+ // block for this key. With the fix, both indexes correctly say "not
1737
+ // found".
1738
+ std::string seek_target = std::string("9\xff\xff\x01", 4);
1739
+
1740
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1741
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1742
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
1743
+
1744
+ iter->Seek(seek_target);
1745
+ ASSERT_TRUE(!iter->Valid()) << "Expected no key at or after seek_target, "
1746
+ << "but got: " << iter->key().ToString(true);
1747
+ ASSERT_OK(iter->status());
1748
+ }
1749
+
1750
+ // Also verify the actual last key is still findable.
1751
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1752
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1753
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
1754
+
1755
+ iter->Seek(std::string("9\xff\xff", 3));
1756
+ ASSERT_TRUE(iter->Valid());
1757
+ ASSERT_EQ(iter->key().ToString(), std::string("9\xff\xff", 3));
1758
+ ASSERT_EQ(iter->value().ToString(), "v3");
1759
+
1760
+ // After this key, there should be nothing more.
1761
+ iter->Next();
1762
+ ASSERT_TRUE(!iter->Valid());
1763
+ ASSERT_OK(iter->status());
1764
+ }
1765
+ }
1766
+
1767
+ // Variant: tests that when deletes remove the last key, seeking past the last
1768
+ // remaining key correctly returns "not found" with both indexes.
1769
+ TEST_F(TrieIndexDBTest, LastBlockSeparatorWithDeletes) {
1770
+ ASSERT_OK(OpenDB(/*block_size=*/32));
1771
+
1772
+ // Write and flush initial data.
1773
+ ASSERT_OK(db_->Put(WriteOptions(), "1aaa", "v1"));
1774
+ ASSERT_OK(db_->Put(WriteOptions(), "5bbb", "v2"));
1775
+ ASSERT_OK(db_->Put(WriteOptions(), std::string("9\xff\xff", 3), "v3"));
1776
+ ASSERT_OK(db_->Flush(FlushOptions()));
1777
+
1778
+ // Delete the last key and flush (creates a tombstone in a new SST).
1779
+ ASSERT_OK(db_->Delete(WriteOptions(), std::string("9\xff\xff", 3)));
1780
+ ASSERT_OK(db_->Flush(FlushOptions()));
1781
+
1782
+ // Now seeking for the deleted key should yield "5bbb" or nothing,
1783
+ // depending on the seek target. Both indexes must agree.
1784
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1785
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1786
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
1787
+
1788
+ // Seek to the deleted key — should skip it and land on nothing (it was
1789
+ // the last key).
1790
+ iter->Seek(std::string("9\xff\xff", 3));
1791
+ ASSERT_TRUE(!iter->Valid())
1792
+ << "Deleted key should not be visible, but got: "
1793
+ << iter->key().ToString(true);
1794
+ ASSERT_OK(iter->status());
1795
+
1796
+ // Seek to a key between "5bbb" and the deleted key — should find "5bbb"
1797
+ // or nothing depending on order. Actually, "6" > "5bbb" and "6" <
1798
+ // "9\xff\xff", so seeking "6" should find nothing since there's no key
1799
+ // >= "6" that's still alive.
1800
+ iter->Seek("6");
1801
+ ASSERT_TRUE(!iter->Valid()) << "No live key >= '6' should exist, but got: "
1802
+ << iter->key().ToString(true);
1803
+ ASSERT_OK(iter->status());
1804
+ }
1805
+
1806
+ // Compact to merge the tombstone, then verify again.
1807
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
1808
+
1809
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1810
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1811
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
1812
+
1813
+ iter->SeekToFirst();
1814
+ ASSERT_TRUE(iter->Valid());
1815
+ ASSERT_EQ(iter->key().ToString(), "1aaa");
1816
+ iter->Next();
1817
+ ASSERT_TRUE(iter->Valid());
1818
+ ASSERT_EQ(iter->key().ToString(), "5bbb");
1819
+ iter->Next();
1820
+ ASSERT_TRUE(!iter->Valid());
1821
+ ASSERT_OK(iter->status());
1822
+ }
1823
+ }
1824
+
1825
+ // Single-entry SST: the trie has exactly one leaf. Validates that Seek,
1826
+ // SeekToFirst, Next, and Get all work with a one-block, one-key SST.
1827
+ TEST_F(TrieIndexDBTest, SingleEntrySST) {
1828
+ ASSERT_OK(OpenDB());
1829
+ ASSERT_OK(db_->Put(WriteOptions(), "only_key", "only_val"));
1830
+ ASSERT_OK(db_->Flush(FlushOptions()));
1831
+
1832
+ // Point lookup.
1833
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("only_key", "only_val"));
1834
+
1835
+ // Forward scan: exactly one result.
1836
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(
1837
+ std::vector<std::pair<std::string, std::string>>{
1838
+ {"only_key", "only_val"}}));
1839
+
1840
+ // Seek to the exact key.
1841
+ ASSERT_NO_FATAL_FAILURE(
1842
+ VerifySeekBothIndexes("only_key", "only_key", "only_val"));
1843
+
1844
+ // Seek before the key — should land on it.
1845
+ ASSERT_NO_FATAL_FAILURE(VerifySeekBothIndexes("a", "only_key", "only_val"));
1846
+
1847
+ // Seek past the key — should be invalid.
1848
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1849
+ SCOPED_TRACE(ro.table_index_factory ? "trie index" : "standard index");
1850
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
1851
+ iter->Seek("z");
1852
+ ASSERT_FALSE(iter->Valid());
1853
+ ASSERT_OK(iter->status());
1854
+ }
1855
+ }
1856
+
1857
+ // Deletion-only SST: flush a Put, then flush a Delete for that key so the
1858
+ // second SST contains only a tombstone. After compaction, the key is gone.
1859
+ TEST_F(TrieIndexDBTest, DeletionOnlySST) {
1860
+ ASSERT_OK(OpenDB());
1861
+
1862
+ // Flush 1: a real Put.
1863
+ ASSERT_OK(db_->Put(WriteOptions(), "del_target", "val"));
1864
+ ASSERT_OK(db_->Flush(FlushOptions()));
1865
+
1866
+ // Flush 2: only a Delete — this creates an SST whose only entry is a
1867
+ // tombstone (the trie still builds an index for the block containing it).
1868
+ ASSERT_OK(db_->Delete(WriteOptions(), "del_target"));
1869
+ ASSERT_OK(db_->Flush(FlushOptions()));
1870
+
1871
+ // The tombstone hides the Put.
1872
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("del_target"));
1873
+
1874
+ // Forward scan: nothing visible.
1875
+ ASSERT_NO_FATAL_FAILURE(
1876
+ VerifyForwardScanBothIndexes(std::vector<std::string>{}));
1877
+
1878
+ // Compact to merge: key is fully removed.
1879
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
1880
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("del_target"));
1881
+ ASSERT_NO_FATAL_FAILURE(
1882
+ VerifyForwardScanBothIndexes(std::vector<std::string>{}));
1883
+ }
1884
+
1885
+ // All-same-key SST: multiple versions of the same user key (via snapshots)
1886
+ // land in the same SST, possibly spanning multiple blocks. Validates that
1887
+ // the trie's same-key-run handling (seqno-based separators) works at the
1888
+ // DB level through both indexes.
1889
+ TEST_F(TrieIndexDBTest, AllSameKeySST) {
1890
+ options_.disable_auto_compactions = true;
1891
+ // Small block size to force multiple blocks for the same user key.
1892
+ ASSERT_OK(OpenDB(/*block_size=*/32));
1893
+
1894
+ // Write several versions of the same key with snapshots to prevent GC.
1895
+ std::vector<const Snapshot*> snaps;
1896
+ for (int i = 0; i < 10; i++) {
1897
+ ASSERT_OK(db_->Put(WriteOptions(), "same_key", "val_" + std::to_string(i)));
1898
+ snaps.push_back(db_->GetSnapshot());
1899
+ }
1900
+ ASSERT_OK(db_->Flush(FlushOptions()));
1901
+
1902
+ // Latest value is visible.
1903
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("same_key", "val_9"));
1904
+
1905
+ // Forward scan: only the latest version is visible (without snapshot).
1906
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(
1907
+ std::vector<std::pair<std::string, std::string>>{{"same_key", "val_9"}}));
1908
+
1909
+ // Each snapshot should see the correct version.
1910
+ for (int i = 0; i < 10; i++) {
1911
+ SCOPED_TRACE("snapshot " + std::to_string(i));
1912
+ std::string expected = "val_" + std::to_string(i);
1913
+ ASSERT_NO_FATAL_FAILURE(
1914
+ VerifyGetBothIndexes(snaps[i], "same_key", expected));
1915
+
1916
+ // Forward scan with snapshot.
1917
+ ASSERT_NO_FATAL_FAILURE(
1918
+ VerifyForwardScanBothIndexes(snaps[i], {{"same_key", expected}}));
1919
+ }
1920
+
1921
+ // Seek with earliest snapshot — should find the earliest version.
1922
+ ASSERT_NO_FATAL_FAILURE(
1923
+ VerifySeekBothIndexes(snaps[0], "same_key", "same_key", "val_0"));
1924
+
1925
+ for (auto* snap : snaps) {
1926
+ db_->ReleaseSnapshot(snap);
1927
+ }
1928
+ }
1929
+
1930
+ // Operations on a completely empty DB: nothing should crash, and after
1931
+ // creating + deleting all data, the DB should correctly return nothing.
1932
+ TEST_F(TrieIndexDBTest, EmptyDBOperations) {
1933
+ ASSERT_OK(OpenDB());
1934
+
1935
+ // Get / Seek / SeekToFirst on empty memtable (no SSTs yet).
1936
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("anything"));
1937
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1938
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
1939
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
1940
+ iter->SeekToFirst();
1941
+ ASSERT_FALSE(iter->Valid());
1942
+ ASSERT_OK(iter->status());
1943
+ iter->Seek("anything");
1944
+ ASSERT_FALSE(iter->Valid());
1945
+ ASSERT_OK(iter->status());
1946
+ }
1947
+
1948
+ // Create an SST, delete its only key, compact → DB has no live data but
1949
+ // the trie code path was exercised during flush.
1950
+ ASSERT_OK(db_->Put(WriteOptions(), "temp", "val"));
1951
+ ASSERT_OK(db_->Flush(FlushOptions()));
1952
+ ASSERT_OK(db_->Delete(WriteOptions(), "temp"));
1953
+ ASSERT_OK(db_->Flush(FlushOptions()));
1954
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
1955
+
1956
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("temp"));
1957
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1958
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
1959
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
1960
+ iter->SeekToFirst();
1961
+ ASSERT_FALSE(iter->Valid());
1962
+ ASSERT_OK(iter->status());
1963
+ }
1964
+ }
1965
+
1966
+ // Focused seek-pattern tests: before all data, between blocks, exact match,
1967
+ // after all data, and empty-key seek.
1968
+ TEST_F(TrieIndexDBTest, SeekEdgeCases) {
1969
+ ASSERT_OK(OpenDB(/*block_size=*/64));
1970
+
1971
+ // Write keys with deliberate gaps.
1972
+ for (const auto& k : {"bbb", "ddd", "fff", "hhh"}) {
1973
+ ASSERT_OK(db_->Put(WriteOptions(), k, std::string("v_") + k));
1974
+ }
1975
+ ASSERT_OK(db_->Flush(FlushOptions()));
1976
+
1977
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
1978
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
1979
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
1980
+
1981
+ // Before first key.
1982
+ iter->Seek("aaa");
1983
+ ASSERT_TRUE(iter->Valid());
1984
+ ASSERT_EQ(iter->key().ToString(), "bbb");
1985
+
1986
+ // Exact first key.
1987
+ iter->Seek("bbb");
1988
+ ASSERT_TRUE(iter->Valid());
1989
+ ASSERT_EQ(iter->key().ToString(), "bbb");
1990
+
1991
+ // Between keys.
1992
+ iter->Seek("ccc");
1993
+ ASSERT_TRUE(iter->Valid());
1994
+ ASSERT_EQ(iter->key().ToString(), "ddd");
1995
+
1996
+ // Between keys (eee → fff).
1997
+ iter->Seek("eee");
1998
+ ASSERT_TRUE(iter->Valid());
1999
+ ASSERT_EQ(iter->key().ToString(), "fff");
2000
+
2001
+ // Exact last key.
2002
+ iter->Seek("hhh");
2003
+ ASSERT_TRUE(iter->Valid());
2004
+ ASSERT_EQ(iter->key().ToString(), "hhh");
2005
+
2006
+ // After last key.
2007
+ iter->Seek("zzz");
2008
+ ASSERT_FALSE(iter->Valid());
2009
+ ASSERT_OK(iter->status());
2010
+
2011
+ // Empty key (smallest possible key for BytewiseComparator).
2012
+ iter->Seek("");
2013
+ ASSERT_TRUE(iter->Valid());
2014
+ ASSERT_EQ(iter->key().ToString(), "bbb");
2015
+ }
2016
+ }
2017
+
2018
+ // PutEntity + GetEntity through the trie index read path.
2019
+ TEST_F(TrieIndexDBTest, GetEntityWithTrieUDI) {
2020
+ ASSERT_OK(OpenDB());
2021
+
2022
+ // PutEntity with wide columns.
2023
+ WideColumns columns{
2024
+ {kDefaultWideColumnName, "default_val"},
2025
+ {"col_a", "val_a"},
2026
+ {"col_b", "val_b"},
2027
+ };
2028
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
2029
+ "entity_key", columns));
2030
+ // Also a regular Put to verify GetEntity reads it as a single default column.
2031
+ ASSERT_OK(db_->Put(WriteOptions(), "regular_key", "regular_val"));
2032
+ ASSERT_OK(db_->Flush(FlushOptions()));
2033
+
2034
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2035
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2036
+
2037
+ // GetEntity on a PutEntity key.
2038
+ PinnableWideColumns result;
2039
+ ASSERT_OK(
2040
+ db_->GetEntity(ro, db_->DefaultColumnFamily(), "entity_key", &result));
2041
+ ASSERT_EQ(result.columns().size(), 3u);
2042
+ ASSERT_EQ(result.columns()[0].name(), kDefaultWideColumnName);
2043
+ ASSERT_EQ(result.columns()[0].value(), "default_val");
2044
+ ASSERT_EQ(result.columns()[1].name(), "col_a");
2045
+ ASSERT_EQ(result.columns()[1].value(), "val_a");
2046
+ ASSERT_EQ(result.columns()[2].name(), "col_b");
2047
+ ASSERT_EQ(result.columns()[2].value(), "val_b");
2048
+
2049
+ // GetEntity on a regular Put key returns single default column.
2050
+ PinnableWideColumns result2;
2051
+ ASSERT_OK(db_->GetEntity(ro, db_->DefaultColumnFamily(), "regular_key",
2052
+ &result2));
2053
+ ASSERT_EQ(result2.columns().size(), 1u);
2054
+ ASSERT_EQ(result2.columns()[0].name(), kDefaultWideColumnName);
2055
+ ASSERT_EQ(result2.columns()[0].value(), "regular_val");
2056
+
2057
+ // GetEntity on nonexistent key.
2058
+ PinnableWideColumns result3;
2059
+ ASSERT_TRUE(
2060
+ db_->GetEntity(ro, db_->DefaultColumnFamily(), "no_such_key", &result3)
2061
+ .IsNotFound());
2062
+ }
2063
+ }
2064
+
2065
+ // Multiple overlapping L0 SSTs: the level iterator must coordinate trie
2066
+ // iterators across multiple SST files with overlapping key ranges.
2067
+ TEST_F(TrieIndexDBTest, OverlappingL0SSTs) {
2068
+ options_.disable_auto_compactions = true;
2069
+ ASSERT_OK(OpenDB(/*block_size=*/128));
2070
+
2071
+ // SST1: keys 00..49.
2072
+ for (int i = 0; i < 50; i++) {
2073
+ char key[16];
2074
+ snprintf(key, sizeof(key), "key_%03d", i);
2075
+ ASSERT_OK(db_->Put(WriteOptions(), key, "sst1_" + std::to_string(i)));
2076
+ }
2077
+ ASSERT_OK(db_->Flush(FlushOptions()));
2078
+
2079
+ // SST2: keys 25..74 (overlapping with SST1).
2080
+ for (int i = 25; i < 75; i++) {
2081
+ char key[16];
2082
+ snprintf(key, sizeof(key), "key_%03d", i);
2083
+ ASSERT_OK(db_->Put(WriteOptions(), key, "sst2_" + std::to_string(i)));
2084
+ }
2085
+ ASSERT_OK(db_->Flush(FlushOptions()));
2086
+
2087
+ // SST3: keys 50..99 (overlapping with SST2).
2088
+ for (int i = 50; i < 100; i++) {
2089
+ char key[16];
2090
+ snprintf(key, sizeof(key), "key_%03d", i);
2091
+ ASSERT_OK(db_->Put(WriteOptions(), key, "sst3_" + std::to_string(i)));
2092
+ }
2093
+ ASSERT_OK(db_->Flush(FlushOptions()));
2094
+
2095
+ // Verify: latest writer wins for overlapping keys.
2096
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2097
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2098
+ auto kvs = ScanAllKeyValues(ro);
2099
+ ASSERT_EQ(kvs.size(), 100u);
2100
+ for (int i = 0; i < 100; i++) {
2101
+ char key[16];
2102
+ snprintf(key, sizeof(key), "key_%03d", i);
2103
+ ASSERT_EQ(kvs[i].first, key);
2104
+ if (i < 25) {
2105
+ ASSERT_EQ(kvs[i].second, "sst1_" + std::to_string(i));
2106
+ } else if (i < 50) {
2107
+ ASSERT_EQ(kvs[i].second, "sst2_" + std::to_string(i));
2108
+ } else {
2109
+ ASSERT_EQ(kvs[i].second, "sst3_" + std::to_string(i));
2110
+ }
2111
+ }
2112
+ }
2113
+
2114
+ // Compact all L0 → L1, re-verify.
2115
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2116
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2117
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2118
+ ASSERT_EQ(ScanAllKeyValues(ro).size(), 100u);
2119
+ }
2120
+ }
2121
+
2122
+ // CompactRange with a sub-range: only part of the key space is compacted.
2123
+ TEST_F(TrieIndexDBTest, CompactRangeSubset) {
2124
+ options_.disable_auto_compactions = true;
2125
+ ASSERT_OK(OpenDB(/*block_size=*/128));
2126
+
2127
+ for (int i = 0; i < 26; i++) {
2128
+ char key[16];
2129
+ snprintf(key, sizeof(key), "key_%c", 'a' + i);
2130
+ ASSERT_OK(db_->Put(WriteOptions(), key, "val_" + std::to_string(i)));
2131
+ }
2132
+ ASSERT_OK(db_->Flush(FlushOptions()));
2133
+
2134
+ // Compact only the middle range [key_f, key_p).
2135
+ std::string begin = "key_f";
2136
+ std::string end = "key_p";
2137
+ Slice begin_s(begin);
2138
+ Slice end_s(end);
2139
+ CompactRangeOptions cro;
2140
+ ASSERT_OK(db_->CompactRange(cro, &begin_s, &end_s));
2141
+
2142
+ // All 26 keys should still be readable.
2143
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2144
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2145
+ ASSERT_EQ(ScanAllKeys(ro).size(), 26u);
2146
+ }
2147
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_a", "val_0"));
2148
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_z", "val_25"));
2149
+ }
2150
+
2151
+ // Write keys, delete all of them, compact. The DB should be empty.
2152
+ TEST_F(TrieIndexDBTest, AllKeysDeletedCompaction) {
2153
+ ASSERT_OK(OpenDB());
2154
+
2155
+ for (int i = 0; i < 20; i++) {
2156
+ char key[16];
2157
+ snprintf(key, sizeof(key), "key_%02d", i);
2158
+ ASSERT_OK(db_->Put(WriteOptions(), key, "val"));
2159
+ }
2160
+ ASSERT_OK(db_->Flush(FlushOptions()));
2161
+
2162
+ // Delete all keys.
2163
+ for (int i = 0; i < 20; i++) {
2164
+ char key[16];
2165
+ snprintf(key, sizeof(key), "key_%02d", i);
2166
+ ASSERT_OK(db_->Delete(WriteOptions(), key));
2167
+ }
2168
+ ASSERT_OK(db_->Flush(FlushOptions()));
2169
+
2170
+ // Before compaction: tombstones hide all keys.
2171
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2172
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2173
+ ASSERT_EQ(ScanAllKeys(ro).size(), 0u);
2174
+ }
2175
+
2176
+ // After compaction: all tombstones and data are gone.
2177
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2178
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2179
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2180
+ ASSERT_EQ(ScanAllKeys(ro).size(), 0u);
2181
+ }
2182
+ }
2183
+
2184
+ // Keys with special byte values: 0x00, 0xFF, embedded nulls, very short keys.
2185
+ // These exercise trie byte-traversal edge cases.
2186
+ TEST_F(TrieIndexDBTest, BinaryKeyEdgeCases) {
2187
+ ASSERT_OK(OpenDB(/*block_size=*/64));
2188
+
2189
+ // All keys in sorted order (BytewiseComparator).
2190
+ std::vector<std::pair<std::string, std::string>> kvs = {
2191
+ {std::string("\x00", 1), "val_null"},
2192
+ {std::string("\x00\x00\x00", 3), "val_triple_null"},
2193
+ {std::string("\x01", 1), "val_0x01"},
2194
+ {"a", "val_a"},
2195
+ {std::string("a\x00"
2196
+ "b",
2197
+ 3),
2198
+ "val_a_null_b"},
2199
+ {"mid", "val_mid"},
2200
+ {std::string("\xfe", 1), "val_0xfe"},
2201
+ {std::string("\xff", 1), "val_0xff"},
2202
+ {std::string("\xff\xff\xff", 3), "val_triple_ff"},
2203
+ };
2204
+
2205
+ for (const auto& kv : kvs) {
2206
+ ASSERT_OK(db_->Put(WriteOptions(), kv.first, kv.second));
2207
+ }
2208
+ ASSERT_OK(db_->Flush(FlushOptions()));
2209
+
2210
+ // Forward scan: all keys in order through both indexes.
2211
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2212
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2213
+ auto actual = ScanAllKeyValues(ro);
2214
+ ASSERT_EQ(actual.size(), kvs.size());
2215
+ for (size_t i = 0; i < kvs.size(); i++) {
2216
+ SCOPED_TRACE("key index " + std::to_string(i));
2217
+ ASSERT_EQ(actual[i].first, kvs[i].first);
2218
+ ASSERT_EQ(actual[i].second, kvs[i].second);
2219
+ }
2220
+ }
2221
+
2222
+ // Point lookups for boundary keys.
2223
+ ASSERT_NO_FATAL_FAILURE(
2224
+ VerifyGetBothIndexes(std::string("\x00", 1), "val_null"));
2225
+ ASSERT_NO_FATAL_FAILURE(
2226
+ VerifyGetBothIndexes(std::string("\xff\xff\xff", 3), "val_triple_ff"));
2227
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes(std::string("a\x00"
2228
+ "b",
2229
+ 3),
2230
+ "val_a_null_b"));
2231
+
2232
+ // Seek to embedded-null key.
2233
+ ASSERT_NO_FATAL_FAILURE(VerifySeekBothIndexes(
2234
+ std::string("\x00", 1), std::string("\x00", 1), "val_null"));
2235
+ }
2236
+
2237
+ // Puts with empty string values.
2238
+ TEST_F(TrieIndexDBTest, EmptyValuePuts) {
2239
+ ASSERT_OK(OpenDB());
2240
+
2241
+ ASSERT_OK(db_->Put(WriteOptions(), "key1", ""));
2242
+ ASSERT_OK(db_->Put(WriteOptions(), "key2", "non_empty"));
2243
+ ASSERT_OK(db_->Put(WriteOptions(), "key3", ""));
2244
+ ASSERT_OK(db_->Flush(FlushOptions()));
2245
+
2246
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key1", ""));
2247
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key2", "non_empty"));
2248
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key3", ""));
2249
+
2250
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(
2251
+ std::vector<std::pair<std::string, std::string>>{
2252
+ {"key1", ""}, {"key2", "non_empty"}, {"key3", ""}}));
2253
+ }
2254
+
2255
+ // Zlib compression: data blocks are compressed, UDI block is not.
2256
+ // Verifies that reads through the trie index work with compressed data.
2257
+ TEST_F(TrieIndexDBTest, CompressionZlib) {
2258
+ if (!Zlib_Supported()) {
2259
+ ROCKSDB_GTEST_SKIP("Zlib not linked");
2260
+ return;
2261
+ }
2262
+ options_.compression = kZlibCompression;
2263
+ ASSERT_OK(OpenDB(/*block_size=*/128));
2264
+
2265
+ for (int i = 0; i < 100; i++) {
2266
+ char key[16];
2267
+ snprintf(key, sizeof(key), "key_%04d", i);
2268
+ // Compressible value (repeated pattern).
2269
+ ASSERT_OK(db_->Put(WriteOptions(), key, std::string(200, 'A' + (i % 26))));
2270
+ }
2271
+ ASSERT_OK(db_->Flush(FlushOptions()));
2272
+
2273
+ // Forward scan.
2274
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2275
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2276
+ ASSERT_EQ(ScanAllKeys(ro).size(), 100u);
2277
+ }
2278
+
2279
+ // Spot-check a few keys.
2280
+ for (int i : {0, 49, 99}) {
2281
+ char key[16];
2282
+ snprintf(key, sizeof(key), "key_%04d", i);
2283
+ ASSERT_NO_FATAL_FAILURE(
2284
+ VerifyGetBothIndexes(key, std::string(200, 'A' + (i % 26))));
2285
+ }
2286
+ }
2287
+
2288
+ // Iterator stability: an iterator pinned to a snapshot should not see data
2289
+ // written after the iterator was created, even after flush.
2290
+ TEST_F(TrieIndexDBTest, IteratorStabilityDuringFlush) {
2291
+ ASSERT_OK(OpenDB());
2292
+
2293
+ ASSERT_OK(db_->Put(WriteOptions(), "key1", "v1"));
2294
+ ASSERT_OK(db_->Put(WriteOptions(), "key2", "v2"));
2295
+ ASSERT_OK(db_->Flush(FlushOptions()));
2296
+
2297
+ // Open iterator (implicitly pins a snapshot).
2298
+ auto ro = TrieIndexReadOptions();
2299
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
2300
+ iter->SeekToFirst();
2301
+ ASSERT_TRUE(iter->Valid());
2302
+ ASSERT_EQ(iter->key().ToString(), "key1");
2303
+
2304
+ // Write + flush new data while iterator is open.
2305
+ ASSERT_OK(db_->Put(WriteOptions(), "key3", "v3"));
2306
+ ASSERT_OK(db_->Flush(FlushOptions()));
2307
+
2308
+ // Existing iterator should NOT see key3.
2309
+ iter->Next();
2310
+ ASSERT_TRUE(iter->Valid());
2311
+ ASSERT_EQ(iter->key().ToString(), "key2");
2312
+ iter->Next();
2313
+ ASSERT_FALSE(iter->Valid());
2314
+ ASSERT_OK(iter->status());
2315
+
2316
+ // New iterator should see all three keys.
2317
+ std::unique_ptr<Iterator> iter2(db_->NewIterator(ro));
2318
+ iter2->SeekToFirst();
2319
+ ASSERT_TRUE(iter2->Valid());
2320
+ ASSERT_EQ(iter2->key().ToString(), "key1");
2321
+ iter2->Next();
2322
+ ASSERT_TRUE(iter2->Valid());
2323
+ ASSERT_EQ(iter2->key().ToString(), "key2");
2324
+ iter2->Next();
2325
+ ASSERT_TRUE(iter2->Valid());
2326
+ ASSERT_EQ(iter2->key().ToString(), "key3");
2327
+ ASSERT_OK(iter2->status());
2328
+ }
2329
+
2330
+ // iterate_upper_bound without prefix scan: the iterator should stop at the
2331
+ // upper bound.
2332
+ TEST_F(TrieIndexDBTest, IteratorUpperBound) {
2333
+ ASSERT_OK(OpenDB(/*block_size=*/64));
2334
+
2335
+ for (const auto& k : {"aa", "bb", "cc", "dd", "ee", "ff"}) {
2336
+ ASSERT_OK(db_->Put(WriteOptions(), k, std::string("v_") + k));
2337
+ }
2338
+ ASSERT_OK(db_->Flush(FlushOptions()));
2339
+
2340
+ for (const auto& base_ro :
2341
+ {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2342
+ SCOPED_TRACE(base_ro.table_index_factory ? "trie" : "standard");
2343
+
2344
+ // Upper bound = "dd" → should see aa, bb, cc only.
2345
+ std::string ub_str = "dd";
2346
+ Slice ub(ub_str);
2347
+ ReadOptions ro = base_ro;
2348
+ ro.iterate_upper_bound = &ub;
2349
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
2350
+ std::vector<std::string> keys;
2351
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
2352
+ keys.push_back(iter->key().ToString());
2353
+ }
2354
+ ASSERT_OK(iter->status());
2355
+ ASSERT_EQ(keys, (std::vector<std::string>{"aa", "bb", "cc"}));
2356
+
2357
+ // Upper bound = "aa" → should see nothing.
2358
+ std::string ub2_str = "aa";
2359
+ Slice ub2(ub2_str);
2360
+ ReadOptions ro2 = base_ro;
2361
+ ro2.iterate_upper_bound = &ub2;
2362
+ std::unique_ptr<Iterator> iter2(db_->NewIterator(ro2));
2363
+ iter2->SeekToFirst();
2364
+ ASSERT_FALSE(iter2->Valid());
2365
+ ASSERT_OK(iter2->status());
2366
+
2367
+ // Upper bound after all data → should see everything.
2368
+ std::string ub3_str = "zz";
2369
+ Slice ub3(ub3_str);
2370
+ ReadOptions ro3 = base_ro;
2371
+ ro3.iterate_upper_bound = &ub3;
2372
+ std::unique_ptr<Iterator> iter3(db_->NewIterator(ro3));
2373
+ std::vector<std::string> all_keys;
2374
+ for (iter3->SeekToFirst(); iter3->Valid(); iter3->Next()) {
2375
+ all_keys.push_back(iter3->key().ToString());
2376
+ }
2377
+ ASSERT_OK(iter3->status());
2378
+ ASSERT_EQ(all_keys.size(), 6u);
2379
+ }
2380
+ }
2381
+
2382
+ // Combined snapshot + upper_bound: iterator sees the snapshot's view of data,
2383
+ // bounded by iterate_upper_bound.
2384
+ TEST_F(TrieIndexDBTest, IteratorSnapshotAndUpperBound) {
2385
+ ASSERT_OK(OpenDB());
2386
+
2387
+ ASSERT_OK(db_->Put(WriteOptions(), "key_a", "old_a"));
2388
+ ASSERT_OK(db_->Put(WriteOptions(), "key_b", "old_b"));
2389
+ ASSERT_OK(db_->Put(WriteOptions(), "key_c", "old_c"));
2390
+ ASSERT_OK(db_->Put(WriteOptions(), "key_d", "old_d"));
2391
+ ASSERT_OK(db_->Flush(FlushOptions()));
2392
+
2393
+ const Snapshot* snap = db_->GetSnapshot();
2394
+
2395
+ // Overwrite some keys after the snapshot.
2396
+ ASSERT_OK(db_->Put(WriteOptions(), "key_a", "new_a"));
2397
+ ASSERT_OK(db_->Put(WriteOptions(), "key_c", "new_c"));
2398
+ ASSERT_OK(db_->Put(WriteOptions(), "key_e", "new_e"));
2399
+ ASSERT_OK(db_->Flush(FlushOptions()));
2400
+
2401
+ for (const auto& base_ro :
2402
+ {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2403
+ SCOPED_TRACE(base_ro.table_index_factory ? "trie" : "standard");
2404
+
2405
+ std::string ub_str = "key_d";
2406
+ Slice ub(ub_str);
2407
+ ReadOptions ro = base_ro;
2408
+ ro.snapshot = snap;
2409
+ ro.iterate_upper_bound = &ub;
2410
+
2411
+ auto kvs = ScanAllKeyValues(ro);
2412
+ // Snapshot view: old values. Upper bound excludes key_d and key_e.
2413
+ ASSERT_EQ(kvs.size(), 3u);
2414
+ ASSERT_EQ(kvs[0],
2415
+ std::make_pair(std::string("key_a"), std::string("old_a")));
2416
+ ASSERT_EQ(kvs[1],
2417
+ std::make_pair(std::string("key_b"), std::string("old_b")));
2418
+ ASSERT_EQ(kvs[2],
2419
+ std::make_pair(std::string("key_c"), std::string("old_c")));
2420
+ }
2421
+ db_->ReleaseSnapshot(snap);
2422
+ }
2423
+
2424
+ // VerifyChecksum goes through SeekToFirst+Next on the index iterator.
2425
+ TEST_F(TrieIndexDBTest, VerifyChecksumWithTrieUDI) {
2426
+ ASSERT_OK(OpenDB(/*block_size=*/128));
2427
+
2428
+ for (int i = 0; i < 50; i++) {
2429
+ char key[16];
2430
+ snprintf(key, sizeof(key), "key_%03d", i);
2431
+ ASSERT_OK(db_->Put(WriteOptions(), key, "value_" + std::to_string(i)));
2432
+ }
2433
+ ASSERT_OK(db_->Flush(FlushOptions()));
2434
+
2435
+ // VerifyChecksum with default ReadOptions (standard index).
2436
+ ASSERT_OK(db_->VerifyChecksum());
2437
+
2438
+ // VerifyChecksum with trie ReadOptions.
2439
+ ASSERT_OK(db_->VerifyChecksum(TrieIndexReadOptions()));
2440
+ }
2441
+
2442
+ // Many small SSTs from frequent flushes: exercises trie iteration across
2443
+ // many L0 files without compaction.
2444
+ TEST_F(TrieIndexDBTest, ManySmallSSTs) {
2445
+ options_.disable_auto_compactions = true;
2446
+ ASSERT_OK(OpenDB());
2447
+
2448
+ // 50 flushes, 2 keys each → 50 SSTs.
2449
+ for (int f = 0; f < 50; f++) {
2450
+ char k1[16];
2451
+ char k2[16];
2452
+ snprintf(k1, sizeof(k1), "key_%04d", f * 2);
2453
+ snprintf(k2, sizeof(k2), "key_%04d", f * 2 + 1);
2454
+ ASSERT_OK(db_->Put(WriteOptions(), k1, "v" + std::to_string(f * 2)));
2455
+ ASSERT_OK(db_->Put(WriteOptions(), k2, "v" + std::to_string(f * 2 + 1)));
2456
+ ASSERT_OK(db_->Flush(FlushOptions()));
2457
+ }
2458
+
2459
+ // Verify all 100 keys are readable.
2460
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2461
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2462
+ auto keys = ScanAllKeys(ro);
2463
+ ASSERT_EQ(keys.size(), 100u);
2464
+ }
2465
+
2466
+ // Spot-check first and last.
2467
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_0000", "v0"));
2468
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_0099", "v99"));
2469
+
2470
+ // Compact everything into one SST, re-verify.
2471
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2472
+ for (const auto& ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2473
+ SCOPED_TRACE(ro.table_index_factory ? "trie" : "standard");
2474
+ ASSERT_EQ(ScanAllKeys(ro).size(), 100u);
2475
+ }
2476
+ }
2477
+
2478
+ // Merge values accumulate across multiple compaction rounds.
2479
+ TEST_F(TrieIndexDBTest, MergeAcrossMultipleCompactions) {
2480
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator();
2481
+ ASSERT_OK(OpenDB());
2482
+
2483
+ // Round 1: Put base value.
2484
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "base"));
2485
+ ASSERT_OK(db_->Flush(FlushOptions()));
2486
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2487
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key", "base"));
2488
+
2489
+ // Round 2: Merge "m1".
2490
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "m1"));
2491
+ ASSERT_OK(db_->Flush(FlushOptions()));
2492
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2493
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key", "base,m1"));
2494
+
2495
+ // Round 3: Merge "m2".
2496
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "m2"));
2497
+ ASSERT_OK(db_->Flush(FlushOptions()));
2498
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2499
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key", "base,m1,m2"));
2500
+
2501
+ // Forward scan also returns the accumulated value.
2502
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(
2503
+ std::vector<std::pair<std::string, std::string>>{{"key", "base,m1,m2"}}));
2504
+ }
2505
+
2506
+ // Graceful degradation: reopen a DB that was written with UDI, but without
2507
+ // the UDI factory configured. Reads should fall back to the standard index.
2508
+ TEST_F(TrieIndexDBTest, ReopenWithoutTrieUDI) {
2509
+ ASSERT_OK(OpenDB());
2510
+
2511
+ ASSERT_OK(db_->Put(WriteOptions(), "key_a", "val_a"));
2512
+ ASSERT_OK(db_->Put(WriteOptions(), "key_b", "val_b"));
2513
+ ASSERT_OK(db_->Flush(FlushOptions()));
2514
+ ASSERT_OK(db_->Close());
2515
+ db_.reset();
2516
+
2517
+ // Reopen WITHOUT UDI. The SST has a UDI meta block, but it's ignored.
2518
+ ASSERT_OK(OpenDBWithoutUDI());
2519
+
2520
+ // Reads via standard index should work (UDI meta block is just ignored).
2521
+ std::string val;
2522
+ ASSERT_OK(db_->Get(ReadOptions(), "key_a", &val));
2523
+ ASSERT_EQ(val, "val_a");
2524
+ ASSERT_OK(db_->Get(ReadOptions(), "key_b", &val));
2525
+ ASSERT_EQ(val, "val_b");
2526
+
2527
+ // Forward scan.
2528
+ auto keys = ScanAllKeys(ReadOptions());
2529
+ ASSERT_EQ(keys.size(), 2u);
2530
+ ASSERT_EQ(keys[0], "key_a");
2531
+ ASSERT_EQ(keys[1], "key_b");
2532
+ }
2533
+
2534
+ // Mixed SSTs: some written with UDI, some without. Both should be readable
2535
+ // through both index paths.
2536
+ TEST_F(TrieIndexDBTest, MixedSSTsWithAndWithoutUDI) {
2537
+ options_.disable_auto_compactions = true;
2538
+
2539
+ // Phase 1: Write with UDI → SST1 has UDI + standard index.
2540
+ ASSERT_OK(OpenDB());
2541
+ ASSERT_OK(db_->Put(WriteOptions(), "key_01", "udi_val1"));
2542
+ ASSERT_OK(db_->Put(WriteOptions(), "key_02", "udi_val2"));
2543
+ ASSERT_OK(db_->Flush(FlushOptions()));
2544
+ ASSERT_OK(db_->Close());
2545
+ db_.reset();
2546
+
2547
+ // Phase 2: Reopen WITHOUT UDI, write more → SST2 has only standard index.
2548
+ ASSERT_OK(OpenDBWithoutUDI());
2549
+ ASSERT_OK(db_->Put(WriteOptions(), "key_03", "noudi_val3"));
2550
+ ASSERT_OK(db_->Put(WriteOptions(), "key_04", "noudi_val4"));
2551
+ ASSERT_OK(db_->Flush(FlushOptions()));
2552
+ ASSERT_OK(db_->Close());
2553
+ db_.reset();
2554
+
2555
+ // Phase 3: Reopen WITH UDI again. SST1 uses trie, SST2 falls back to
2556
+ // standard index (UDI block missing → logged warning, graceful fallback).
2557
+ options_.disable_auto_compactions = true;
2558
+ ASSERT_OK(OpenDB());
2559
+
2560
+ // All 4 keys should be readable through both index paths.
2561
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_01", "udi_val1"));
2562
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_02", "udi_val2"));
2563
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_03", "noudi_val3"));
2564
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("key_04", "noudi_val4"));
2565
+
2566
+ ASSERT_NO_FATAL_FAILURE(
2567
+ VerifyForwardScanBothIndexes({"key_01", "key_02", "key_03", "key_04"}));
2568
+
2569
+ // Compact: merges UDI + non-UDI SSTs → new SST has UDI.
2570
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2571
+ ASSERT_NO_FATAL_FAILURE(
2572
+ VerifyForwardScanBothIndexes({"key_01", "key_02", "key_03", "key_04"}));
2573
+ }
2574
+
2575
+ // TransactionDB commit: Put + Delete inside a transaction, then commit.
2576
+ TEST_F(TrieIndexDBTest, TransactionCommit) {
2577
+ options_.create_if_missing = true;
2578
+ BlockBasedTableOptions table_options;
2579
+ table_options.user_defined_index_factory = trie_factory_;
2580
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
2581
+ last_options_ = options_;
2582
+
2583
+ TransactionDB* txn_db = nullptr;
2584
+ ASSERT_OK(
2585
+ TransactionDB::Open(options_, TransactionDBOptions(), dbname_, &txn_db));
2586
+ db_.reset(txn_db);
2587
+
2588
+ // Pre-populate a key.
2589
+ ASSERT_OK(txn_db->Put(WriteOptions(), "pre_key", "pre_val"));
2590
+ ASSERT_OK(txn_db->Flush(FlushOptions()));
2591
+
2592
+ // Begin transaction: Put + Delete + Commit.
2593
+ std::unique_ptr<Transaction> txn(
2594
+ txn_db->BeginTransaction(WriteOptions(), TransactionOptions()));
2595
+ ASSERT_OK(txn->Put("txn_key1", "txn_val1"));
2596
+ ASSERT_OK(txn->Delete("pre_key"));
2597
+ ASSERT_OK(txn->Commit());
2598
+
2599
+ ASSERT_OK(txn_db->Flush(FlushOptions()));
2600
+
2601
+ // Verify through both indexes.
2602
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("txn_key1", "txn_val1"));
2603
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("pre_key"));
2604
+ }
2605
+
2606
+ // TransactionDB rollback: writes should be discarded. Rollback writes DELETE
2607
+ // entries to WAL, which was previously restricted for UDI.
2608
+ TEST_F(TrieIndexDBTest, TransactionRollback) {
2609
+ options_.create_if_missing = true;
2610
+ BlockBasedTableOptions table_options;
2611
+ table_options.user_defined_index_factory = trie_factory_;
2612
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
2613
+ last_options_ = options_;
2614
+
2615
+ TransactionDB* txn_db = nullptr;
2616
+ ASSERT_OK(
2617
+ TransactionDB::Open(options_, TransactionDBOptions(), dbname_, &txn_db));
2618
+ db_.reset(txn_db);
2619
+
2620
+ // Pre-populate data and flush.
2621
+ ASSERT_OK(txn_db->Put(WriteOptions(), "keep_key", "keep_val"));
2622
+ ASSERT_OK(txn_db->Flush(FlushOptions()));
2623
+
2624
+ // Begin transaction, write, then ROLLBACK.
2625
+ std::unique_ptr<Transaction> txn(
2626
+ txn_db->BeginTransaction(WriteOptions(), TransactionOptions()));
2627
+ ASSERT_OK(txn->Put("rollback_key", "rollback_val"));
2628
+ ASSERT_OK(txn->Delete("keep_key"));
2629
+ ASSERT_OK(txn->Rollback());
2630
+
2631
+ ASSERT_OK(txn_db->Flush(FlushOptions()));
2632
+
2633
+ // Original data should be unchanged. Rolled-back writes should not appear.
2634
+ ASSERT_NO_FATAL_FAILURE(VerifyGetBothIndexes("keep_key", "keep_val"));
2635
+ ASSERT_NO_FATAL_FAILURE(VerifyGetNotFoundBothIndexes("rollback_key"));
2636
+
2637
+ // Forward scan: only the original key.
2638
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(
2639
+ std::vector<std::pair<std::string, std::string>>{
2640
+ {"keep_key", "keep_val"}}));
2641
+ }
2642
+
2643
+ // total_order_seek with prefix_extractor: a common stress-test configuration.
2644
+ // With total_order_seek=true, SeekToFirst and full forward scan should work
2645
+ // correctly even when a prefix extractor is configured.
2646
+ TEST_F(TrieIndexDBTest, TotalOrderSeekWithPrefixExtractor) {
2647
+ options_.prefix_extractor.reset(NewFixedPrefixTransform(3));
2648
+ ASSERT_OK(OpenDB(/*block_size=*/128));
2649
+
2650
+ // Keys with different prefixes.
2651
+ ASSERT_OK(db_->Put(WriteOptions(), "aaa_1", "v1"));
2652
+ ASSERT_OK(db_->Put(WriteOptions(), "aaa_2", "v2"));
2653
+ ASSERT_OK(db_->Put(WriteOptions(), "bbb_1", "v3"));
2654
+ ASSERT_OK(db_->Put(WriteOptions(), "ccc_1", "v4"));
2655
+ ASSERT_OK(db_->Flush(FlushOptions()));
2656
+
2657
+ // With total_order_seek=true, scan all keys across prefixes.
2658
+ for (auto base_ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2659
+ SCOPED_TRACE(base_ro.table_index_factory ? "trie" : "standard");
2660
+ base_ro.total_order_seek = true;
2661
+ auto keys = ScanAllKeys(base_ro);
2662
+ ASSERT_EQ(keys.size(), 4u);
2663
+ ASSERT_EQ(keys[0], "aaa_1");
2664
+ ASSERT_EQ(keys[1], "aaa_2");
2665
+ ASSERT_EQ(keys[2], "bbb_1");
2666
+ ASSERT_EQ(keys[3], "ccc_1");
2667
+
2668
+ // Seek across prefix boundary.
2669
+ std::unique_ptr<Iterator> iter(db_->NewIterator(base_ro));
2670
+ iter->Seek("aab");
2671
+ ASSERT_TRUE(iter->Valid());
2672
+ ASSERT_EQ(iter->key().ToString(), "bbb_1");
2673
+ ASSERT_OK(iter->status());
2674
+ }
2675
+
2676
+ // auto_prefix_mode: let RocksDB decide per-seek.
2677
+ for (auto base_ro : {StandardIndexReadOptions(), TrieIndexReadOptions()}) {
2678
+ SCOPED_TRACE(base_ro.table_index_factory ? "trie" : "standard");
2679
+ base_ro.auto_prefix_mode = true;
2680
+ std::unique_ptr<Iterator> iter(db_->NewIterator(base_ro));
2681
+ iter->Seek("bbb_1");
2682
+ ASSERT_TRUE(iter->Valid());
2683
+ ASSERT_EQ(iter->key().ToString(), "bbb_1");
2684
+ ASSERT_OK(iter->status());
2685
+ }
2686
+ }
2687
+
2688
+ // ============================================================================
2689
+ // Multi-level SST + DeleteRange randomized test
2690
+ //
2691
+ // Historically bug-prone area: range tombstones interact with data across
2692
+ // LSM levels (L0, L1, L2+), and the trie index must correctly handle
2693
+ // seek/scan when blocks are partially or entirely covered by range deletions
2694
+ // at different levels.
2695
+ //
2696
+ // Strategy:
2697
+ // 1. Populate bottommost level with baseline data (flush + compact)
2698
+ // 2. Write overlapping data and DeleteRanges to L0 (multiple rounds)
2699
+ // 3. Partial compactions to create data at intermediate levels
2700
+ // 4. Verify reads match between standard and trie index after each mutation
2701
+ // 5. Snapshot before large DeleteRange, verify snapshot preserves state
2702
+ // 6. Re-insert into deleted ranges, compact, and re-verify
2703
+ // ============================================================================
2704
+ TEST_F(TrieIndexDBTest, MultiLevelDeleteRangeRandomized) {
2705
+ uint32_t seed = static_cast<uint32_t>(
2706
+ std::chrono::system_clock::now().time_since_epoch().count());
2707
+ SCOPED_TRACE("seed=" + std::to_string(seed));
2708
+ Random rnd(seed);
2709
+
2710
+ options_.disable_auto_compactions = true;
2711
+ // Small block size forces many data blocks (and thus many trie entries).
2712
+ ASSERT_OK(OpenDB(/*block_size=*/256));
2713
+
2714
+ const int kMaxKey = 500;
2715
+
2716
+ auto format_key = [](int k) {
2717
+ char buf[16];
2718
+ snprintf(buf, sizeof(buf), "key_%05d", k);
2719
+ return std::string(buf);
2720
+ };
2721
+
2722
+ // Core correctness check: forward scan via both indexes must match.
2723
+ auto verify_scan_consistency = [&]() {
2724
+ auto standard_kvs = ScanAllKeyValues(StandardIndexReadOptions());
2725
+ auto trie_kvs = ScanAllKeyValues(TrieIndexReadOptions());
2726
+ ASSERT_EQ(standard_kvs, trie_kvs)
2727
+ << "Scan mismatch: standard=" << standard_kvs.size()
2728
+ << " trie=" << trie_kvs.size();
2729
+ };
2730
+
2731
+ // Phase 1: Populate bottommost level with baseline data.
2732
+ for (int i = 0; i < 200; i++) {
2733
+ int k = rnd.Uniform(kMaxKey);
2734
+ ASSERT_OK(db_->Put(WriteOptions(), format_key(k),
2735
+ "base_" + rnd.RandomString(20)));
2736
+ }
2737
+ ASSERT_OK(db_->Flush(FlushOptions()));
2738
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2739
+ ASSERT_NO_FATAL_FAILURE(verify_scan_consistency());
2740
+
2741
+ // Phase 2: Write overlapping data + DeleteRanges across multiple rounds.
2742
+ // Each round creates L0 SSTs with a mix of Puts and DeleteRanges,
2743
+ // with occasional partial compactions to push data to intermediate levels.
2744
+ for (int round = 0; round < 5; round++) {
2745
+ SCOPED_TRACE("round=" + std::to_string(round));
2746
+
2747
+ // Write some new/updated keys.
2748
+ int num_writes = 30 + rnd.Uniform(70);
2749
+ for (int i = 0; i < num_writes; i++) {
2750
+ int k = rnd.Uniform(kMaxKey);
2751
+ ASSERT_OK(
2752
+ db_->Put(WriteOptions(), format_key(k),
2753
+ "r" + std::to_string(round) + "_" + rnd.RandomString(15)));
2754
+ }
2755
+
2756
+ // Issue 1-3 random DeleteRanges per round.
2757
+ int num_ranges = 1 + rnd.Uniform(3);
2758
+ for (int r = 0; r < num_ranges; r++) {
2759
+ int range_start = rnd.Uniform(kMaxKey - 10);
2760
+ int range_end = range_start + 5 + rnd.Uniform(50);
2761
+ if (range_end > kMaxKey) {
2762
+ range_end = kMaxKey;
2763
+ }
2764
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
2765
+ format_key(range_start),
2766
+ format_key(range_end)));
2767
+ }
2768
+
2769
+ ASSERT_OK(db_->Flush(FlushOptions()));
2770
+ ASSERT_NO_FATAL_FAILURE(verify_scan_consistency());
2771
+
2772
+ // On odd rounds, do a partial compaction to push some data down,
2773
+ // creating a multi-level structure where range tombstones at L0
2774
+ // must shadow data at L1/L2.
2775
+ if (round % 2 == 1) {
2776
+ int compact_start = rnd.Uniform(kMaxKey / 2);
2777
+ int compact_end = compact_start + kMaxKey / 4;
2778
+ std::string start_key = format_key(compact_start);
2779
+ std::string end_key = format_key(compact_end);
2780
+ Slice s(start_key);
2781
+ Slice e(end_key);
2782
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &s, &e));
2783
+ ASSERT_NO_FATAL_FAILURE(verify_scan_consistency());
2784
+ }
2785
+ }
2786
+
2787
+ // Phase 3: Snapshot, then delete a large range. The snapshot must
2788
+ // preserve the pre-deletion state while current reads see the deletion.
2789
+ const Snapshot* snap = db_->GetSnapshot();
2790
+ auto snap_kvs = ScanAllKeyValues(StandardIndexReadOptions());
2791
+
2792
+ int big_start = rnd.Uniform(kMaxKey / 4);
2793
+ int big_end = big_start + kMaxKey / 3;
2794
+ if (big_end > kMaxKey) {
2795
+ big_end = kMaxKey;
2796
+ }
2797
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
2798
+ format_key(big_start), format_key(big_end)));
2799
+ ASSERT_OK(db_->Flush(FlushOptions()));
2800
+
2801
+ // Current state should reflect the deletion.
2802
+ ASSERT_NO_FATAL_FAILURE(verify_scan_consistency());
2803
+
2804
+ // Snapshot state should be unchanged.
2805
+ ASSERT_NO_FATAL_FAILURE(VerifyForwardScanBothIndexes(snap, snap_kvs));
2806
+
2807
+ db_->ReleaseSnapshot(snap);
2808
+
2809
+ // Phase 4: Re-insert keys into the deleted range, creating a pattern
2810
+ // where range tombstones and live data coexist at different levels.
2811
+ for (int i = big_start; i < big_end && i < kMaxKey; i += 3) {
2812
+ ASSERT_OK(db_->Put(WriteOptions(), format_key(i),
2813
+ "reinserted_" + rnd.RandomString(10)));
2814
+ }
2815
+ ASSERT_OK(db_->Flush(FlushOptions()));
2816
+ ASSERT_NO_FATAL_FAILURE(verify_scan_consistency());
2817
+
2818
+ // Phase 5: Full compaction — all range tombstones should be resolved.
2819
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2820
+ ASSERT_NO_FATAL_FAILURE(verify_scan_consistency());
2821
+
2822
+ // Phase 6: Point lookups for a sample of keys — both indexes must agree.
2823
+ for (int i = 0; i < kMaxKey; i += 7) {
2824
+ std::string key = format_key(i);
2825
+ std::string std_val;
2826
+ std::string trie_val;
2827
+ Status s1 = db_->Get(StandardIndexReadOptions(), key, &std_val);
2828
+ Status s2 = db_->Get(TrieIndexReadOptions(), key, &trie_val);
2829
+ ASSERT_EQ(s1.code(), s2.code()) << "Status mismatch for " << key;
2830
+ if (s1.ok()) {
2831
+ ASSERT_EQ(std_val, trie_val) << "Value mismatch for " << key;
2832
+ }
2833
+ }
2834
+ }
2835
+
2836
+ } // namespace trie_index
2837
+ } // namespace ROCKSDB_NAMESPACE
2838
+
2839
+ int main(int argc, char** argv) {
2840
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
2841
+ ::testing::InitGoogleTest(&argc, argv);
2842
+ return RUN_ALL_TESTS();
2843
+ }