rocksdb-native 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. package/binding.c +92 -10
  2. package/index.js +9 -0
  3. package/lib/batch.js +11 -1
  4. package/lib/iterator.js +3 -1
  5. package/lib/snapshot.js +21 -0
  6. package/package.json +1 -1
  7. package/prebuilds/darwin-arm64/rocksdb-native.bare +0 -0
  8. package/prebuilds/darwin-arm64/rocksdb-native.node +0 -0
  9. package/prebuilds/darwin-x64/rocksdb-native.bare +0 -0
  10. package/prebuilds/darwin-x64/rocksdb-native.node +0 -0
  11. package/prebuilds/linux-arm64/rocksdb-native.bare +0 -0
  12. package/prebuilds/linux-arm64/rocksdb-native.node +0 -0
  13. package/prebuilds/linux-x64/rocksdb-native.bare +0 -0
  14. package/prebuilds/linux-x64/rocksdb-native.node +0 -0
  15. package/prebuilds/win32-x64/rocksdb-native.bare +0 -0
  16. package/prebuilds/win32-x64/rocksdb-native.node +0 -0
  17. package/vendor/librocksdb/include/rocksdb.h +38 -4
  18. package/vendor/librocksdb/src/rocksdb.cc +114 -14
  19. package/vendor/librocksdb/vendor/rocksdb/CMakeLists.txt +21 -4
  20. package/vendor/librocksdb/vendor/rocksdb/cache/secondary_cache_adapter.cc +6 -3
  21. package/vendor/librocksdb/vendor/rocksdb/db/arena_wrapped_db_iter.cc +4 -4
  22. package/vendor/librocksdb/vendor/rocksdb/db/arena_wrapped_db_iter.h +4 -2
  23. package/vendor/librocksdb/vendor/rocksdb/db/attribute_group_iterator_impl.cc +20 -0
  24. package/vendor/librocksdb/vendor/rocksdb/db/attribute_group_iterator_impl.h +83 -0
  25. package/vendor/librocksdb/vendor/rocksdb/db/builder.cc +9 -5
  26. package/vendor/librocksdb/vendor/rocksdb/db/builder.h +1 -1
  27. package/vendor/librocksdb/vendor/rocksdb/db/c.cc +231 -6
  28. package/vendor/librocksdb/vendor/rocksdb/db/c_test.c +202 -2
  29. package/vendor/librocksdb/vendor/rocksdb/db/coalescing_iterator.cc +47 -0
  30. package/vendor/librocksdb/vendor/rocksdb/db/coalescing_iterator.h +79 -0
  31. package/vendor/librocksdb/vendor/rocksdb/db/column_family.cc +28 -0
  32. package/vendor/librocksdb/vendor/rocksdb/db/column_family.h +17 -0
  33. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction.cc +8 -1
  34. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction.h +11 -9
  35. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_iterator.cc +50 -23
  36. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_iterator.h +13 -0
  37. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_job.cc +22 -25
  38. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_job.h +2 -0
  39. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_outputs.cc +8 -1
  40. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_outputs.h +1 -0
  41. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_picker.cc +40 -17
  42. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_picker.h +20 -14
  43. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_picker_level.cc +11 -6
  44. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_picker_universal.cc +77 -24
  45. package/vendor/librocksdb/vendor/rocksdb/db/compaction/compaction_service_job.cc +2 -0
  46. package/vendor/librocksdb/vendor/rocksdb/db/convenience.cc +3 -0
  47. package/vendor/librocksdb/vendor/rocksdb/db/db_filesnapshot.cc +125 -31
  48. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl.cc +457 -231
  49. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl.h +172 -73
  50. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_compaction_flush.cc +152 -133
  51. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  52. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_files.cc +58 -52
  53. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_follower.cc +348 -0
  54. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_follower.h +54 -0
  55. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_open.cc +136 -117
  56. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_secondary.cc +4 -3
  57. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_secondary.h +7 -6
  58. package/vendor/librocksdb/vendor/rocksdb/db/db_impl/db_impl_write.cc +134 -80
  59. package/vendor/librocksdb/vendor/rocksdb/db/db_iter.cc +11 -0
  60. package/vendor/librocksdb/vendor/rocksdb/db/db_test2.cc +1 -1
  61. package/vendor/librocksdb/vendor/rocksdb/db/db_test_util.cc +11 -1
  62. package/vendor/librocksdb/vendor/rocksdb/db/db_test_util.h +11 -7
  63. package/vendor/librocksdb/vendor/rocksdb/db/dbformat.cc +19 -4
  64. package/vendor/librocksdb/vendor/rocksdb/db/dbformat.h +3 -2
  65. package/vendor/librocksdb/vendor/rocksdb/db/error_handler.cc +34 -39
  66. package/vendor/librocksdb/vendor/rocksdb/db/error_handler.h +3 -4
  67. package/vendor/librocksdb/vendor/rocksdb/db/event_helpers.cc +6 -3
  68. package/vendor/librocksdb/vendor/rocksdb/db/experimental.cc +3 -2
  69. package/vendor/librocksdb/vendor/rocksdb/db/external_sst_file_ingestion_job.cc +76 -18
  70. package/vendor/librocksdb/vendor/rocksdb/db/external_sst_file_ingestion_job.h +11 -0
  71. package/vendor/librocksdb/vendor/rocksdb/db/flush_job.cc +37 -5
  72. package/vendor/librocksdb/vendor/rocksdb/db/flush_job.h +14 -0
  73. package/vendor/librocksdb/vendor/rocksdb/db/import_column_family_job.cc +49 -45
  74. package/vendor/librocksdb/vendor/rocksdb/db/internal_stats.cc +60 -1
  75. package/vendor/librocksdb/vendor/rocksdb/db/internal_stats.h +20 -1
  76. package/vendor/librocksdb/vendor/rocksdb/db/log_reader.cc +15 -6
  77. package/vendor/librocksdb/vendor/rocksdb/db/log_writer.cc +59 -10
  78. package/vendor/librocksdb/vendor/rocksdb/db/log_writer.h +8 -0
  79. package/vendor/librocksdb/vendor/rocksdb/db/memtable.cc +24 -40
  80. package/vendor/librocksdb/vendor/rocksdb/db/memtable.h +10 -10
  81. package/vendor/librocksdb/vendor/rocksdb/db/memtable_list.cc +9 -8
  82. package/vendor/librocksdb/vendor/rocksdb/db/multi_cf_iterator_impl.h +296 -0
  83. package/vendor/librocksdb/vendor/rocksdb/db/range_tombstone_fragmenter.h +8 -10
  84. package/vendor/librocksdb/vendor/rocksdb/db/repair.cc +4 -3
  85. package/vendor/librocksdb/vendor/rocksdb/db/seqno_to_time_mapping.cc +30 -0
  86. package/vendor/librocksdb/vendor/rocksdb/db/seqno_to_time_mapping.h +9 -0
  87. package/vendor/librocksdb/vendor/rocksdb/db/table_cache.cc +17 -2
  88. package/vendor/librocksdb/vendor/rocksdb/db/table_cache.h +9 -1
  89. package/vendor/librocksdb/vendor/rocksdb/db/table_properties_collector.h +9 -2
  90. package/vendor/librocksdb/vendor/rocksdb/db/transaction_log_impl.cc +3 -3
  91. package/vendor/librocksdb/vendor/rocksdb/db/transaction_log_impl.h +7 -7
  92. package/vendor/librocksdb/vendor/rocksdb/db/version_edit.cc +0 -1
  93. package/vendor/librocksdb/vendor/rocksdb/db/version_edit_handler.cc +39 -5
  94. package/vendor/librocksdb/vendor/rocksdb/db/version_edit_handler.h +24 -15
  95. package/vendor/librocksdb/vendor/rocksdb/db/version_set.cc +117 -64
  96. package/vendor/librocksdb/vendor/rocksdb/db/version_set.h +27 -10
  97. package/vendor/librocksdb/vendor/rocksdb/db/wal_manager.cc +37 -29
  98. package/vendor/librocksdb/vendor/rocksdb/db/wal_manager.h +6 -5
  99. package/vendor/librocksdb/vendor/rocksdb/db/wide/wide_columns.cc +2 -3
  100. package/vendor/librocksdb/vendor/rocksdb/db/wide/wide_columns_helper.cc +6 -0
  101. package/vendor/librocksdb/vendor/rocksdb/db/write_batch.cc +89 -31
  102. package/vendor/librocksdb/vendor/rocksdb/db/write_thread.cc +53 -5
  103. package/vendor/librocksdb/vendor/rocksdb/db/write_thread.h +36 -4
  104. package/vendor/librocksdb/vendor/rocksdb/env/composite_env_wrapper.h +21 -0
  105. package/vendor/librocksdb/vendor/rocksdb/env/env.cc +15 -0
  106. package/vendor/librocksdb/vendor/rocksdb/env/fs_on_demand.cc +331 -0
  107. package/vendor/librocksdb/vendor/rocksdb/env/fs_on_demand.h +139 -0
  108. package/vendor/librocksdb/vendor/rocksdb/env/io_posix.cc +8 -6
  109. package/vendor/librocksdb/vendor/rocksdb/env/io_posix.h +1 -1
  110. package/vendor/librocksdb/vendor/rocksdb/file/delete_scheduler.cc +130 -27
  111. package/vendor/librocksdb/vendor/rocksdb/file/delete_scheduler.h +61 -8
  112. package/vendor/librocksdb/vendor/rocksdb/file/file_util.cc +25 -4
  113. package/vendor/librocksdb/vendor/rocksdb/file/file_util.h +15 -0
  114. package/vendor/librocksdb/vendor/rocksdb/file/sequence_file_reader.cc +1 -0
  115. package/vendor/librocksdb/vendor/rocksdb/file/sequence_file_reader.h +9 -4
  116. package/vendor/librocksdb/vendor/rocksdb/file/sst_file_manager_impl.cc +18 -0
  117. package/vendor/librocksdb/vendor/rocksdb/file/sst_file_manager_impl.h +31 -4
  118. package/vendor/librocksdb/vendor/rocksdb/file/writable_file_writer.cc +40 -38
  119. package/vendor/librocksdb/vendor/rocksdb/file/writable_file_writer.h +48 -15
  120. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/advanced_options.h +12 -3
  121. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/attribute_groups.h +114 -0
  122. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/c.h +90 -0
  123. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/cache.h +5 -0
  124. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/comparator.h +27 -0
  125. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/db.h +71 -12
  126. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/env.h +9 -0
  127. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/experimental.h +5 -0
  128. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/file_system.h +14 -0
  129. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/iterator.h +9 -71
  130. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/iterator_base.h +90 -0
  131. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/listener.h +21 -0
  132. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/options.h +125 -12
  133. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/perf_context.h +1 -1
  134. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/sst_file_reader.h +11 -1
  135. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/table.h +6 -6
  136. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/table_properties.h +19 -0
  137. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/transaction_log.h +12 -6
  138. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/types.h +12 -0
  139. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/universal_compaction.h +31 -0
  140. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/user_write_callback.h +29 -0
  141. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/cache_dump_load.h +4 -0
  142. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/checkpoint.h +4 -2
  143. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/customizable_util.h +0 -1
  144. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/env_mirror.h +1 -1
  145. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/ldb_cmd.h +24 -7
  146. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/option_change_migration.h +4 -4
  147. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/stackable_db.h +24 -5
  148. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +46 -0
  149. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/transaction.h +42 -17
  150. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/transaction_db.h +5 -0
  151. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/types_util.h +36 -0
  152. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +71 -3
  153. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/version.h +2 -2
  154. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/wide_columns.h +87 -72
  155. package/vendor/librocksdb/vendor/rocksdb/include/rocksdb/write_batch_base.h +1 -1
  156. package/vendor/librocksdb/vendor/rocksdb/memory/memory_allocator.cc +1 -0
  157. package/vendor/librocksdb/vendor/rocksdb/options/cf_options.cc +13 -2
  158. package/vendor/librocksdb/vendor/rocksdb/options/cf_options.h +6 -2
  159. package/vendor/librocksdb/vendor/rocksdb/options/db_options.cc +27 -1
  160. package/vendor/librocksdb/vendor/rocksdb/options/db_options.h +10 -3
  161. package/vendor/librocksdb/vendor/rocksdb/options/options.cc +3 -0
  162. package/vendor/librocksdb/vendor/rocksdb/options/options_helper.cc +1 -0
  163. package/vendor/librocksdb/vendor/rocksdb/port/jemalloc_helper.h +2 -2
  164. package/vendor/librocksdb/vendor/rocksdb/port/stack_trace.cc +1 -0
  165. package/vendor/librocksdb/vendor/rocksdb/port/win/port_win.cc +3 -2
  166. package/vendor/librocksdb/vendor/rocksdb/table/block_based/binary_search_index_reader.cc +1 -2
  167. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_builder.cc +47 -31
  168. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_factory.cc +15 -0
  169. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_iterator.cc +37 -18
  170. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_iterator.h +10 -3
  171. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_reader.cc +102 -41
  172. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_reader.h +15 -7
  173. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_reader_impl.h +1 -3
  174. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +5 -6
  175. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_cache.h +31 -0
  176. package/vendor/librocksdb/vendor/rocksdb/table/block_based/block_prefetcher.cc +6 -0
  177. package/vendor/librocksdb/vendor/rocksdb/table/block_based/cachable_entry.h +10 -5
  178. package/vendor/librocksdb/vendor/rocksdb/table/block_based/filter_block.h +34 -28
  179. package/vendor/librocksdb/vendor/rocksdb/table/block_based/filter_block_reader_common.cc +17 -11
  180. package/vendor/librocksdb/vendor/rocksdb/table/block_based/filter_block_reader_common.h +5 -2
  181. package/vendor/librocksdb/vendor/rocksdb/table/block_based/filter_policy.cc +12 -3
  182. package/vendor/librocksdb/vendor/rocksdb/table/block_based/full_filter_block.cc +37 -30
  183. package/vendor/librocksdb/vendor/rocksdb/table/block_based/full_filter_block.h +11 -13
  184. package/vendor/librocksdb/vendor/rocksdb/table/block_based/hash_index_reader.cc +1 -2
  185. package/vendor/librocksdb/vendor/rocksdb/table/block_based/index_builder.cc +62 -53
  186. package/vendor/librocksdb/vendor/rocksdb/table/block_based/index_builder.h +60 -38
  187. package/vendor/librocksdb/vendor/rocksdb/table/block_based/index_reader_common.cc +14 -9
  188. package/vendor/librocksdb/vendor/rocksdb/table/block_based/index_reader_common.h +4 -1
  189. package/vendor/librocksdb/vendor/rocksdb/table/block_based/partitioned_filter_block.cc +135 -94
  190. package/vendor/librocksdb/vendor/rocksdb/table/block_based/partitioned_filter_block.h +52 -46
  191. package/vendor/librocksdb/vendor/rocksdb/table/block_based/partitioned_index_reader.cc +51 -13
  192. package/vendor/librocksdb/vendor/rocksdb/table/block_based/partitioned_index_reader.h +2 -0
  193. package/vendor/librocksdb/vendor/rocksdb/table/block_based/uncompression_dict_reader.cc +3 -11
  194. package/vendor/librocksdb/vendor/rocksdb/table/block_based/uncompression_dict_reader.h +2 -3
  195. package/vendor/librocksdb/vendor/rocksdb/table/block_fetcher.cc +8 -10
  196. package/vendor/librocksdb/vendor/rocksdb/table/block_fetcher.h +2 -1
  197. package/vendor/librocksdb/vendor/rocksdb/table/compaction_merging_iterator.cc +9 -10
  198. package/vendor/librocksdb/vendor/rocksdb/table/compaction_merging_iterator.h +3 -2
  199. package/vendor/librocksdb/vendor/rocksdb/table/format.cc +1 -2
  200. package/vendor/librocksdb/vendor/rocksdb/table/iterator.cc +4 -0
  201. package/vendor/librocksdb/vendor/rocksdb/table/merging_iterator.cc +18 -13
  202. package/vendor/librocksdb/vendor/rocksdb/table/merging_iterator.h +5 -3
  203. package/vendor/librocksdb/vendor/rocksdb/table/meta_blocks.cc +18 -4
  204. package/vendor/librocksdb/vendor/rocksdb/table/meta_blocks.h +4 -0
  205. package/vendor/librocksdb/vendor/rocksdb/table/plain/plain_table_builder.cc +2 -2
  206. package/vendor/librocksdb/vendor/rocksdb/table/sst_file_dumper.cc +6 -6
  207. package/vendor/librocksdb/vendor/rocksdb/table/sst_file_reader.cc +24 -2
  208. package/vendor/librocksdb/vendor/rocksdb/table/sst_file_writer_collectors.h +3 -1
  209. package/vendor/librocksdb/vendor/rocksdb/table/table_builder.h +8 -7
  210. package/vendor/librocksdb/vendor/rocksdb/table/table_iterator.h +69 -0
  211. package/vendor/librocksdb/vendor/rocksdb/table/table_reader.h +9 -0
  212. package/vendor/librocksdb/vendor/rocksdb/test_util/testutil.cc +25 -0
  213. package/vendor/librocksdb/vendor/rocksdb/test_util/testutil.h +12 -0
  214. package/vendor/librocksdb/vendor/rocksdb/tools/db_bench_tool.cc +32 -0
  215. package/vendor/librocksdb/vendor/rocksdb/tools/ldb_cmd.cc +618 -124
  216. package/vendor/librocksdb/vendor/rocksdb/tools/ldb_cmd_impl.h +19 -1
  217. package/vendor/librocksdb/vendor/rocksdb/tools/ldb_tool.cc +9 -0
  218. package/vendor/librocksdb/vendor/rocksdb/util/aligned_storage.h +24 -0
  219. package/vendor/librocksdb/vendor/rocksdb/util/autovector.h +4 -0
  220. package/vendor/librocksdb/vendor/rocksdb/util/comparator.cc +12 -0
  221. package/vendor/librocksdb/vendor/rocksdb/util/filter_bench.cc +1 -1
  222. package/vendor/librocksdb/vendor/rocksdb/util/random.cc +2 -1
  223. package/vendor/librocksdb/vendor/rocksdb/util/stderr_logger.cc +3 -4
  224. package/vendor/librocksdb/vendor/rocksdb/util/stderr_logger.h +1 -1
  225. package/vendor/librocksdb/vendor/rocksdb/util/udt_util.cc +33 -0
  226. package/vendor/librocksdb/vendor/rocksdb/util/udt_util.h +7 -0
  227. package/vendor/librocksdb/vendor/rocksdb/util/write_batch_util.h +5 -0
  228. package/vendor/librocksdb/vendor/rocksdb/util/xxhash.h +36 -29
  229. package/vendor/librocksdb/vendor/rocksdb/utilities/blob_db/blob_db_impl.h +3 -0
  230. package/vendor/librocksdb/vendor/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc +20 -0
  231. package/vendor/librocksdb/vendor/rocksdb/utilities/cache_dump_load_impl.cc +29 -9
  232. package/vendor/librocksdb/vendor/rocksdb/utilities/cache_dump_load_impl.h +14 -3
  233. package/vendor/librocksdb/vendor/rocksdb/utilities/debug.cc +16 -4
  234. package/vendor/librocksdb/vendor/rocksdb/utilities/fault_injection_fs.cc +677 -248
  235. package/vendor/librocksdb/vendor/rocksdb/utilities/fault_injection_fs.h +325 -158
  236. package/vendor/librocksdb/vendor/rocksdb/utilities/option_change_migration/option_change_migration.cc +1 -8
  237. package/vendor/librocksdb/vendor/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.cc +144 -0
  238. package/vendor/librocksdb/vendor/rocksdb/utilities/table_properties_collectors/compact_for_tiering_collector.h +45 -0
  239. package/vendor/librocksdb/vendor/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc +12 -0
  240. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +1 -1
  241. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h +3 -3
  242. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/pessimistic_transaction.cc +116 -20
  243. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/pessimistic_transaction.h +33 -1
  244. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +78 -13
  245. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/pessimistic_transaction_db.h +33 -1
  246. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_base.cc +106 -7
  247. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_base.h +68 -10
  248. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_test.h +7 -3
  249. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_util.cc +8 -5
  250. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/transaction_util.h +7 -4
  251. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_prepared_txn.cc +18 -12
  252. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_prepared_txn_db.cc +4 -4
  253. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_prepared_txn_db.h +17 -0
  254. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_unprepared_txn.cc +11 -9
  255. package/vendor/librocksdb/vendor/rocksdb/utilities/transactions/write_unprepared_txn_db.cc +2 -1
  256. package/vendor/librocksdb/vendor/rocksdb/utilities/types_util.cc +88 -0
  257. package/vendor/librocksdb/vendor/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc +313 -14
  258. package/vendor/librocksdb/vendor/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +7 -0
  259. package/vendor/librocksdb/vendor/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h +1 -1
  260. package/vendor/librocksdb/vendor/rocksdb/db/multi_cf_iterator.cc +0 -102
  261. package/vendor/librocksdb/vendor/rocksdb/db/multi_cf_iterator.h +0 -159
@@ -250,8 +250,6 @@ void ErrorHandler::CancelErrorRecovery() {
250
250
  EndAutoRecovery();
251
251
  }
252
252
 
253
- STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()};
254
-
255
253
  // This is the main function for looking at an error during a background
256
254
  // operation and deciding the severity, and error recovery strategy. The high
257
255
  // level algorithm is as follows -
@@ -270,11 +268,11 @@ STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()};
270
268
  // This can also get called as part of a recovery operation. In that case, we
271
269
  // also track the error separately in recovery_error_ so we can tell in the
272
270
  // end whether recovery succeeded or not
273
- const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
274
- BackgroundErrorReason reason) {
271
+ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
272
+ BackgroundErrorReason reason) {
275
273
  db_mutex_->AssertHeld();
276
274
  if (bg_err.ok()) {
277
- return kOkStatus;
275
+ return;
278
276
  }
279
277
 
280
278
  ROCKS_LOG_INFO(db_options_.info_log,
@@ -339,7 +337,7 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
339
337
  } else {
340
338
  // This error is less severe than previously encountered error. Don't
341
339
  // take any further action
342
- return bg_error_;
340
+ return;
343
341
  }
344
342
  }
345
343
 
@@ -356,7 +354,6 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
356
354
  if (bg_error_.severity() >= Status::Severity::kHardError) {
357
355
  is_db_stopped_.store(true, std::memory_order_release);
358
356
  }
359
- return bg_error_;
360
357
  }
361
358
 
362
359
  // This is the main function for looking at IO related error during the
@@ -383,14 +380,14 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
383
380
  // 3) for other cases, HandleKnownErrors(const Status& bg_err,
384
381
  // BackgroundErrorReason reason) will be called to handle other error cases
385
382
  // such as delegating to SstFileManager to handle no space error.
386
- const Status& ErrorHandler::SetBGError(const Status& bg_status,
387
- BackgroundErrorReason reason) {
383
+ void ErrorHandler::SetBGError(const Status& bg_status,
384
+ BackgroundErrorReason reason) {
388
385
  db_mutex_->AssertHeld();
389
386
  Status tmp_status = bg_status;
390
387
  IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
391
388
 
392
389
  if (bg_io_err.ok()) {
393
- return kOkStatus;
390
+ return;
394
391
  }
395
392
  ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
396
393
  bg_io_err.ToString().c_str());
@@ -413,11 +410,11 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status,
413
410
  EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
414
411
  &bg_err, db_mutex_, &auto_recovery);
415
412
  recover_context_ = context;
416
- return bg_error_;
417
- } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
418
- (bg_io_err.GetScope() ==
419
- IOStatus::IOErrorScope::kIOErrorScopeFile ||
420
- bg_io_err.GetRetryable())) {
413
+ return;
414
+ }
415
+ if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
416
+ (bg_io_err.GetScope() == IOStatus::IOErrorScope::kIOErrorScopeFile ||
417
+ bg_io_err.GetRetryable())) {
421
418
  // Second, check if the error is a retryable IO error (file scope IO error
422
419
  // is also treated as retryable IO error in RocksDB write path). if it is
423
420
  // retryable error and its severity is higher than bg_error_, overwrite the
@@ -426,10 +423,6 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status,
426
423
  // IO error as hard error. Note that, all the NoSpace error should be
427
424
  // handled by the SstFileManager::StartErrorRecovery(). Therefore, no matter
428
425
  // it is retryable or file scope, this logic will be bypassed.
429
- bool auto_recovery = false;
430
- EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
431
- &new_bg_io_err, db_mutex_,
432
- &auto_recovery);
433
426
 
434
427
  RecordStats({ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT},
435
428
  {} /* int_histograms */);
@@ -445,9 +438,13 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status,
445
438
  ROCKS_LOG_INFO(
446
439
  db_options_.info_log,
447
440
  "ErrorHandler: Compaction will schedule by itself to resume\n");
441
+ bool auto_recovery = false;
442
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
443
+ &new_bg_io_err, db_mutex_,
444
+ &auto_recovery);
448
445
  // Not used in this code path.
449
446
  new_bg_io_err.PermitUncheckedError();
450
- return bg_error_;
447
+ return;
451
448
  }
452
449
 
453
450
  Status::Severity severity;
@@ -469,10 +466,14 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status,
469
466
  Status bg_err(new_bg_io_err, severity);
470
467
  CheckAndSetRecoveryAndBGError(bg_err);
471
468
  recover_context_ = context;
472
- return StartRecoverFromRetryableBGIOError(bg_io_err);
473
- } else {
474
- return HandleKnownErrors(new_bg_io_err, reason);
469
+ bool auto_recovery = db_options_.max_bgerror_resume_count > 0;
470
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
471
+ &new_bg_io_err, db_mutex_,
472
+ &auto_recovery);
473
+ StartRecoverFromRetryableBGIOError(bg_io_err);
474
+ return;
475
475
  }
476
+ HandleKnownErrors(new_bg_io_err, reason);
476
477
  }
477
478
 
478
479
  void ErrorHandler::AddFilesToQuarantine(
@@ -620,23 +621,23 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) {
620
621
  return s;
621
622
  }
622
623
 
623
- const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
624
+ void ErrorHandler::StartRecoverFromRetryableBGIOError(
624
625
  const IOStatus& io_error) {
625
626
  db_mutex_->AssertHeld();
626
- if (bg_error_.ok()) {
627
- return bg_error_;
628
- } else if (io_error.ok()) {
629
- return kOkStatus;
630
- } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
631
- // Auto resume BG error is not enabled, directly return bg_error_.
632
- return bg_error_;
633
- } else if (end_recovery_) {
627
+ if (bg_error_.ok() || io_error.ok()) {
628
+ return;
629
+ }
630
+ if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
631
+ // Auto resume BG error is not enabled
632
+ return;
633
+ }
634
+ if (end_recovery_) {
634
635
  // Can temporarily release db mutex
635
636
  EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
636
637
  Status::ShutdownInProgress(),
637
638
  db_mutex_);
638
639
  db_mutex_->AssertHeld();
639
- return bg_error_;
640
+ return;
640
641
  }
641
642
  RecordStats({ERROR_HANDLER_AUTORESUME_COUNT}, {} /* int_histograms */);
642
643
  ROCKS_LOG_INFO(
@@ -664,12 +665,6 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
664
665
 
665
666
  recovery_thread_.reset(
666
667
  new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
667
-
668
- if (recovery_error_.ok()) {
669
- return recovery_error_;
670
- } else {
671
- return bg_error_;
672
- }
673
668
  }
674
669
 
675
670
  // Automatic recover from Retryable BG IO error. Must be called after db
@@ -56,7 +56,7 @@ class ErrorHandler {
56
56
  Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
57
57
  Status::Code code, Status::SubCode subcode);
58
58
 
59
- const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason);
59
+ void SetBGError(const Status& bg_err, BackgroundErrorReason reason);
60
60
 
61
61
  Status GetBGError() const { return bg_error_; }
62
62
 
@@ -135,11 +135,10 @@ class ErrorHandler {
135
135
  // unsorted.
136
136
  autovector<uint64_t> files_to_quarantine_;
137
137
 
138
- const Status& HandleKnownErrors(const Status& bg_err,
139
- BackgroundErrorReason reason);
138
+ void HandleKnownErrors(const Status& bg_err, BackgroundErrorReason reason);
140
139
  Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery);
141
140
  void RecoverFromNoSpace();
142
- const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
141
+ void StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
143
142
  void RecoverFromRetryableBGIOError();
144
143
  // First, if it is in recovery and the recovery_error is ok. Set the
145
144
  // recovery_error_ to bg_err. Second, if the severity is higher than the
@@ -228,15 +228,18 @@ void EventHelpers::NotifyOnErrorRecoveryEnd(
228
228
  InstrumentedMutex* db_mutex) {
229
229
  if (!listeners.empty()) {
230
230
  db_mutex->AssertHeld();
231
+ // Make copies before releasing mutex to avoid race.
232
+ Status old_bg_error_cp = old_bg_error;
233
+ Status new_bg_error_cp = new_bg_error;
231
234
  // release lock while notifying events
232
235
  db_mutex->Unlock();
233
236
  TEST_SYNC_POINT("NotifyOnErrorRecoveryEnd:MutexUnlocked:1");
234
237
  TEST_SYNC_POINT("NotifyOnErrorRecoveryEnd:MutexUnlocked:2");
235
238
  for (auto& listener : listeners) {
236
239
  BackgroundErrorRecoveryInfo info;
237
- info.old_bg_error = old_bg_error;
238
- info.new_bg_error = new_bg_error;
239
- listener->OnErrorRecoveryCompleted(old_bg_error);
240
+ info.old_bg_error = old_bg_error_cp;
241
+ info.new_bg_error = new_bg_error_cp;
242
+ listener->OnErrorRecoveryCompleted(old_bg_error_cp);
240
243
  listener->OnErrorRecoveryEnd(info);
241
244
  info.old_bg_error.PermitUncheckedError();
242
245
  info.new_bg_error.PermitUncheckedError();
@@ -711,7 +711,7 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager {
711
711
  uint64_t /*file_size*/) override {
712
712
  // FIXME later: `key` might contain user timestamp. That should be
713
713
  // exposed properly in a future update to TablePropertiesCollector
714
- KeySegmentsExtractor::Result extracted;
714
+ extracted.Reset();
715
715
  if (extractor) {
716
716
  extractor->Extract(key, KeySegmentsExtractor::kFullUserKey, &extracted);
717
717
  if (UNLIKELY(extracted.category >=
@@ -750,7 +750,7 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager {
750
750
  }
751
751
  }
752
752
  prev_key.assign(key.data(), key.size());
753
- prev_extracted = std::move(extracted);
753
+ std::swap(prev_extracted, extracted);
754
754
  first_key = false;
755
755
  return Status::OK();
756
756
  }
@@ -859,6 +859,7 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager {
859
859
  std::vector<std::shared_ptr<SstQueryFilterBuilder>> builders;
860
860
  bool first_key = true;
861
861
  std::string prev_key;
862
+ KeySegmentsExtractor::Result extracted;
862
863
  KeySegmentsExtractor::Result prev_extracted;
863
864
  KeySegmentsExtractor::KeyCategorySet categories_seen;
864
865
  };
@@ -44,9 +44,12 @@ Status ExternalSstFileIngestionJob::Prepare(
44
44
  return status;
45
45
  }
46
46
 
47
+ // Files generated in another DB or CF may have a different column family
48
+ // ID, so we let it pass here.
47
49
  if (file_to_ingest.cf_id !=
48
50
  TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
49
- file_to_ingest.cf_id != cfd_->GetID()) {
51
+ file_to_ingest.cf_id != cfd_->GetID() &&
52
+ !ingestion_options_.allow_db_generated_files) {
50
53
  return Status::InvalidArgument(
51
54
  "External file column family id don't match");
52
55
  }
@@ -111,6 +114,7 @@ Status ExternalSstFileIngestionJob::Prepare(
111
114
  const std::string path_inside_db = TableFileName(
112
115
  cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
113
116
  if (ingestion_options_.move_files) {
117
+ assert(!ingestion_options_.allow_db_generated_files);
114
118
  status =
115
119
  fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
116
120
  if (status.ok()) {
@@ -342,8 +346,7 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
342
346
  autovector<UserKeyRange> ranges;
343
347
  ranges.reserve(n);
344
348
  for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
345
- ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(),
346
- file_to_ingest.largest_internal_key.user_key());
349
+ ranges.emplace_back(file_to_ingest.start_ukey, file_to_ingest.limit_ukey);
347
350
  }
348
351
  Status status = cfd_->RangesOverlapWithMemtables(
349
352
  ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
@@ -705,9 +708,16 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
705
708
  // Get table version
706
709
  auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
707
710
  if (version_iter == uprops.end()) {
708
- return Status::Corruption("External file version not found");
711
+ if (!ingestion_options_.allow_db_generated_files) {
712
+ return Status::Corruption("External file version not found");
713
+ } else {
714
+ // 0 is a special version for when a file from a live DB does not have the
715
+ // version table property
716
+ file_to_ingest->version = 0;
717
+ }
718
+ } else {
719
+ file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
709
720
  }
710
- file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
711
721
 
712
722
  auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
713
723
  if (file_to_ingest->version == 2) {
@@ -734,8 +744,15 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
734
744
  return Status::InvalidArgument(
735
745
  "External SST file V1 does not support global seqno");
736
746
  }
747
+ } else if (file_to_ingest->version == 0) {
748
+ // allow_db_generated_files is true
749
+ assert(seqno_iter == uprops.end());
750
+ file_to_ingest->original_seqno = 0;
751
+ file_to_ingest->global_seqno_offset = 0;
737
752
  } else {
738
- return Status::InvalidArgument("External file version is not supported");
753
+ return Status::InvalidArgument("External file version " +
754
+ std::to_string(file_to_ingest->version) +
755
+ " is not supported");
739
756
  }
740
757
 
741
758
  file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
@@ -897,6 +914,25 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
897
914
  } else if (!iter->status().ok()) {
898
915
  return iter->status();
899
916
  }
917
+ if (ingestion_options_.allow_db_generated_files) {
918
+ // Verify that all keys have seqno zero.
919
+ // TODO: store largest seqno in table property and validate it instead.
920
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
921
+ Status pik_status =
922
+ ParseInternalKey(iter->key(), &key, allow_data_in_errors);
923
+ if (!pik_status.ok()) {
924
+ return Status::Corruption("Corrupted key in external file. ",
925
+ pik_status.getState());
926
+ }
927
+ if (key.sequence != 0) {
928
+ return Status::NotSupported(
929
+ "External file has a key with non zero sequence number.");
930
+ }
931
+ }
932
+ if (!iter->status().ok()) {
933
+ return iter->status();
934
+ }
935
+ }
900
936
 
901
937
  std::unique_ptr<InternalIterator> range_del_iter(
902
938
  table_reader->NewRangeTombstoneIterator(ro));
@@ -912,6 +948,11 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
912
948
  return Status::Corruption("Corrupted key in external file. ",
913
949
  pik_status.getState());
914
950
  }
951
+ if (key.sequence != 0) {
952
+ return Status::Corruption(
953
+ "External file has a range deletion with non zero sequence "
954
+ "number.");
955
+ }
915
956
  RangeTombstone tombstone(key, range_del_iter->value());
916
957
 
917
958
  InternalKey start_key = tombstone.SerializeKey();
@@ -930,6 +971,17 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
930
971
  }
931
972
  }
932
973
 
974
+ const size_t ts_sz = ucmp->timestamp_size();
975
+ Slice smallest = file_to_ingest->smallest_internal_key.user_key();
976
+ Slice largest = file_to_ingest->largest_internal_key.user_key();
977
+ if (ts_sz > 0) {
978
+ AppendUserKeyWithMaxTimestamp(&file_to_ingest->start_ukey, smallest, ts_sz);
979
+ AppendUserKeyWithMinTimestamp(&file_to_ingest->limit_ukey, largest, ts_sz);
980
+ } else {
981
+ file_to_ingest->start_ukey.assign(smallest.data(), smallest.size());
982
+ file_to_ingest->limit_ukey.assign(largest.data(), largest.size());
983
+ }
984
+
933
985
  auto s =
934
986
  GetSstInternalUniqueId(file_to_ingest->table_properties.db_id,
935
987
  file_to_ingest->table_properties.db_session_id,
@@ -953,13 +1005,15 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
953
1005
  *assigned_seqno = 0;
954
1006
  auto ucmp = cfd_->user_comparator();
955
1007
  const size_t ts_sz = ucmp->timestamp_size();
956
- if (force_global_seqno || files_overlap_) {
1008
+ if (force_global_seqno || files_overlap_ ||
1009
+ compaction_style == kCompactionStyleFIFO) {
957
1010
  *assigned_seqno = last_seqno + 1;
958
1011
  // If files overlap, we have to ingest them at level 0.
959
- if (files_overlap_) {
1012
+ if (files_overlap_ || compaction_style == kCompactionStyleFIFO) {
960
1013
  assert(ts_sz == 0);
961
1014
  file_to_ingest->picked_level = 0;
962
- if (ingestion_options_.fail_if_not_bottommost_level) {
1015
+ if (ingestion_options_.fail_if_not_bottommost_level &&
1016
+ cfd_->NumberLevels() > 1) {
963
1017
  status = Status::TryAgain(
964
1018
  "Files cannot be ingested to Lmax. Please make sure key range of "
965
1019
  "Lmax does not overlap with files to ingest.");
@@ -980,9 +1034,8 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
980
1034
  if (lvl > 0 && lvl < vstorage->base_level()) {
981
1035
  continue;
982
1036
  }
983
- if (cfd_->RangeOverlapWithCompaction(
984
- file_to_ingest->smallest_internal_key.user_key(),
985
- file_to_ingest->largest_internal_key.user_key(), lvl)) {
1037
+ if (cfd_->RangeOverlapWithCompaction(file_to_ingest->start_ukey,
1038
+ file_to_ingest->limit_ukey, lvl)) {
986
1039
  // We must use L0 or any level higher than `lvl` to be able to overwrite
987
1040
  // the compaction output keys that we overlap with in this level. We also
988
1041
  // need to assign this file a seqno to overwrite the compaction output
@@ -992,9 +1045,8 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
992
1045
  } else if (vstorage->NumLevelFiles(lvl) > 0) {
993
1046
  bool overlap_with_level = false;
994
1047
  status = sv->current->OverlapWithLevelIterator(
995
- ro, env_options_, file_to_ingest->smallest_internal_key.user_key(),
996
- file_to_ingest->largest_internal_key.user_key(), lvl,
997
- &overlap_with_level);
1048
+ ro, env_options_, file_to_ingest->start_ukey,
1049
+ file_to_ingest->limit_ukey, lvl, &overlap_with_level);
998
1050
  if (!status.ok()) {
999
1051
  return status;
1000
1052
  }
@@ -1035,11 +1087,18 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
1035
1087
  "Column family enables user-defined timestamps, please make sure the "
1036
1088
  "key range (without timestamp) of external file does not overlap "
1037
1089
  "with key range (without timestamp) in the db");
1090
+ return status;
1038
1091
  }
1039
1092
  if (*assigned_seqno == 0) {
1040
1093
  *assigned_seqno = last_seqno + 1;
1041
1094
  }
1042
1095
  }
1096
+
1097
+ if (ingestion_options_.allow_db_generated_files && *assigned_seqno != 0) {
1098
+ return Status::InvalidArgument(
1099
+ "An ingested file is assigned to a non-zero sequence number, which is "
1100
+ "incompatible with ingestion option allow_db_generated_files.");
1101
+ }
1043
1102
  return status;
1044
1103
  }
1045
1104
 
@@ -1163,9 +1222,8 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
1163
1222
  }
1164
1223
 
1165
1224
  auto* vstorage = cfd_->current()->storage_info();
1166
- Slice file_smallest_user_key(
1167
- file_to_ingest->smallest_internal_key.user_key());
1168
- Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
1225
+ Slice file_smallest_user_key(file_to_ingest->start_ukey);
1226
+ Slice file_largest_user_key(file_to_ingest->limit_ukey);
1169
1227
 
1170
1228
  if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
1171
1229
  &file_largest_user_key)) {
@@ -32,6 +32,17 @@ struct IngestedFileInfo {
32
32
  InternalKey smallest_internal_key;
33
33
  // Largest internal key in external file
34
34
  InternalKey largest_internal_key;
35
+ // NOTE: use below two fields for all `*Overlap*` types of checks instead of
36
+ // smallest_internal_key.user_key() and largest_internal_key.user_key().
37
+ // The smallest / largest user key contained in the file for key range checks.
38
+ // These could be different from smallest_internal_key.user_key(), and
39
+ // largest_internal_key.user_key() when user-defined timestamps are enabled,
40
+ // because the check is about making sure the user key without timestamps part
41
+ // does not overlap. To achieve that, the smallest user key will be updated
42
+ // with the maximum timestamp while the largest user key will be updated with
43
+ // the min timestamp. It's otherwise the same.
44
+ std::string start_ukey;
45
+ std::string limit_ukey;
35
46
  // Sequence number for keys in external file
36
47
  SequenceNumber original_seqno;
37
48
  // Offset of the global sequence number field in the file, will
@@ -115,6 +115,9 @@ FlushJob::FlushJob(
115
115
  db_mutex_(db_mutex),
116
116
  shutting_down_(shutting_down),
117
117
  existing_snapshots_(std::move(existing_snapshots)),
118
+ earliest_snapshot_(existing_snapshots_.empty()
119
+ ? kMaxSequenceNumber
120
+ : existing_snapshots_.at(0)),
118
121
  earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
119
122
  snapshot_checker_(snapshot_checker),
120
123
  job_context_(job_context),
@@ -194,6 +197,7 @@ void FlushJob::PickMemTable() {
194
197
  // Track effective cutoff user-defined timestamp during flush if
195
198
  // user-defined timestamps can be stripped.
196
199
  GetEffectiveCutoffUDTForPickedMemTables();
200
+ GetPrecludeLastLevelMinSeqno();
197
201
 
198
202
  ReportFlushInputSize(mems_);
199
203
 
@@ -231,7 +235,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
231
235
 
232
236
  AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN);
233
237
  if (mems_.empty()) {
234
- ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
238
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] No memtable to flush",
235
239
  cfd_->GetName().c_str());
236
240
  return Status::OK();
237
241
  }
@@ -502,7 +506,7 @@ Status FlushJob::MemPurge() {
502
506
  const std::atomic<bool> kManualCompactionCanceledFalse{false};
503
507
  CompactionIterator c_iter(
504
508
  iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge,
505
- kMaxSequenceNumber, &existing_snapshots_,
509
+ kMaxSequenceNumber, &existing_snapshots_, earliest_snapshot_,
506
510
  earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_,
507
511
  env, ShouldReportDetailedTime(env, ioptions->stats),
508
512
  true /* internal key corruption is not ok */, range_del_agg.get(),
@@ -968,14 +972,17 @@ Status FlushJob::WriteLevel0Table() {
968
972
  cfd_->GetID(), cfd_->GetName(), 0 /* level */,
969
973
  false /* is_bottommost */, TableFileCreationReason::kFlush,
970
974
  oldest_key_time, current_time, db_id_, db_session_id_,
971
- 0 /* target_file_size */, meta_.fd.GetNumber());
975
+ 0 /* target_file_size */, meta_.fd.GetNumber(),
976
+ preclude_last_level_min_seqno_ == kMaxSequenceNumber
977
+ ? preclude_last_level_min_seqno_
978
+ : std::min(earliest_snapshot_, preclude_last_level_min_seqno_));
972
979
  const SequenceNumber job_snapshot_seq =
973
980
  job_context_->GetJobSnapshotSequence();
974
981
 
975
982
  s = BuildTable(
976
983
  dbname_, versions_, db_options_, tboptions, file_options_,
977
984
  cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_,
978
- &blob_file_additions, existing_snapshots_,
985
+ &blob_file_additions, existing_snapshots_, earliest_snapshot_,
979
986
  earliest_write_conflict_snapshot_, job_snapshot_seq,
980
987
  snapshot_checker_, mutable_cf_options_.paranoid_file_checks,
981
988
  cfd_->internal_stats(), &io_s, io_tracer_,
@@ -1010,10 +1017,15 @@ Status FlushJob::WriteLevel0Table() {
1010
1017
  ROCKS_LOG_BUFFER(log_buffer_,
1011
1018
  "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
1012
1019
  " bytes %s"
1013
- "%s",
1020
+ " %s"
1021
+ " %s",
1014
1022
  cfd_->GetName().c_str(), job_context_->job_id,
1015
1023
  meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
1016
1024
  s.ToString().c_str(),
1025
+ s.ok() && meta_.fd.GetFileSize() == 0
1026
+ ? "It's an empty SST file from a successful flush so "
1027
+ "won't be kept in the DB"
1028
+ : "",
1017
1029
  meta_.marked_for_compaction ? " (needs compaction)" : "");
1018
1030
 
1019
1031
  if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
@@ -1154,6 +1166,26 @@ void FlushJob::GetEffectiveCutoffUDTForPickedMemTables() {
1154
1166
  }
1155
1167
  }
1156
1168
 
1169
+ void FlushJob::GetPrecludeLastLevelMinSeqno() {
1170
+ if (cfd_->ioptions()->preclude_last_level_data_seconds == 0) {
1171
+ return;
1172
+ }
1173
+ int64_t current_time = 0;
1174
+ Status s = db_options_.clock->GetCurrentTime(&current_time);
1175
+ if (!s.ok()) {
1176
+ ROCKS_LOG_WARN(db_options_.info_log,
1177
+ "Failed to get current time in Flush: Status: %s",
1178
+ s.ToString().c_str());
1179
+ } else {
1180
+ SequenceNumber preserve_time_min_seqno;
1181
+ seqno_to_time_mapping_->GetCurrentTieringCutoffSeqnos(
1182
+ static_cast<uint64_t>(current_time),
1183
+ cfd_->ioptions()->preserve_internal_time_seconds,
1184
+ cfd_->ioptions()->preclude_last_level_data_seconds,
1185
+ &preserve_time_min_seqno, &preclude_last_level_min_seqno_);
1186
+ }
1187
+ }
1188
+
1157
1189
  Status FlushJob::MaybeIncreaseFullHistoryTsLowToAboveCutoffUDT() {
1158
1190
  db_mutex_->AssertHeld();
1159
1191
  const auto* ucmp = cfd_->user_comparator();
@@ -143,6 +143,13 @@ class FlushJob {
143
143
  // `MaybeIncreaseFullHistoryTsLowToAboveCutoffUDT` for details.
144
144
  void GetEffectiveCutoffUDTForPickedMemTables();
145
145
 
146
+ // If this column family enables tiering feature, it will find the current
147
+ // `preclude_last_level_min_seqno_`, and the smaller one between this and
148
+ // the `earliest_snapshot_` will later be announced to user property
149
+ // collectors. It indicates to tiering use cases which data are old enough to
150
+ // be placed on the last level.
151
+ void GetPrecludeLastLevelMinSeqno();
152
+
146
153
  Status MaybeIncreaseFullHistoryTsLowToAboveCutoffUDT();
147
154
 
148
155
  const std::string& dbname_;
@@ -161,6 +168,7 @@ class FlushJob {
161
168
  InstrumentedMutex* db_mutex_;
162
169
  std::atomic<bool>* shutting_down_;
163
170
  std::vector<SequenceNumber> existing_snapshots_;
171
+ SequenceNumber earliest_snapshot_;
164
172
  SequenceNumber earliest_write_conflict_snapshot_;
165
173
  SnapshotChecker* snapshot_checker_;
166
174
  JobContext* job_context_;
@@ -221,6 +229,12 @@ class FlushJob {
221
229
  // Keeps track of the newest user-defined timestamp for this flush job if
222
230
  // `persist_user_defined_timestamps` flag is false.
223
231
  std::string cutoff_udt_;
232
+
233
+ // The current minimum seqno that compaction jobs will preclude the data from
234
+ // the last level. Data with seqnos larger than this or larger than
235
+ // `earliest_snapshot_` would be output to the penultimate level had it gone
236
+ // through a compaction to the last level.
237
+ SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
224
238
  };
225
239
 
226
240
  } // namespace ROCKSDB_NAMESPACE